Python学习笔记,51job进一步分析
分析北京,上海,深圳,杭州,南京,无锡的工资情况
思路:
1.在爬虫中,爬取了地名,我们可以通过for循环+if语句来筛选各地的工资;
2.将数据清洗的程序,封装为一个函数,方便使用;
3.将直方图和饼图程序也封装为函数;
4.进行数据可视化分析;
以下为代码:
import pymysql
import pandas as pda
import numpy as npy
import re
import matplotlib.pylab as pyl
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
# Load the scraped 51job postings from the local MySQL database.
# NOTE(review): the password is hard-coded in source; consider moving it to
# an environment variable or a config file that is not committed.
conn = pymysql.connect(host="127.0.0.1",user="root",passwd="lxw19961230",db="51job")
try:
    sql = "select * from python_1"
    data = pda.read_sql(sql, conn)
finally:
    # Release the connection even if the query fails.
    conn.close()
# Select the salary strings for Beijing, Shanghai, Shenzhen, Hangzhou,
# Nanjing and Wuxi.
# Drop incomplete rows from both columns together: dropping NaN from each
# column separately leaves gaps in the index, so label lookups such as
# data_adress[i] / data_money[i] can raise KeyError or pair an address with
# the wrong salary.
city_rows = data[['adress', 'money']].dropna()
data_money = city_rows['money']
data_adress = city_rows['adress']


def _salaries_in(city_name):
    """Return the salary strings of all postings whose address mentions city_name."""
    return [
        money
        for adress, money in zip(data_adress, data_money)
        if isinstance(adress, str) and city_name in adress
    ]


beijing_money = _salaries_in("北京")
count_1 = len(beijing_money)
shanghai_money = _salaries_in("上海")
count_2 = len(shanghai_money)
shenzheng_money = _salaries_in("深圳")
count_3 = len(shenzheng_money)
hangzhou_money = _salaries_in("杭州")
count_4 = len(hangzhou_money)
nanjing_money = _salaries_in("南京")
count_5 = len(nanjing_money)
wuxi_money = _salaries_in("无锡")
count_6 = len(wuxi_money)
'''--------------------------------整理数据----------------------------------------'''
# Scatter plot comparing the number of Python job postings per city.
a = ["beijing","shanghai","shenzheng","hangzhou","nanjing","wuxi"]
b = [count_1,count_2,count_3,count_4,count_5,count_6]
# The plotted values are posting counts, not wages, so label them as such.
pyl.title("city_job_count")
pyl.xlabel("city")
pyl.ylabel("count")
pyl.plot(a,b,'o')
# Output file name is kept unchanged so existing references to it still work.
pyl.savefig("D:/python/爬虫/51job分析/city-wages.png")
pyl.show()
'''------------------------------绘制散点图初步分析--------------------------------'''
#整理各城市工资数据,整理为 k/月,函数
# "<low>-<high><unit>/<period>", e.g. "1-1.5万/月", "6-8千/月", "10-15万/年".
_SALARY_RANGE = re.compile(r"^\s*(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)(万|千)/(月|年)\s*$")
# Open-ended "<n>万以上/月" (at least n * 10k per month).
_SALARY_FLOOR = re.compile(r"^\s*(\d+(?:\.\d+)?)万以上/月\s*$")
# Multiplier that turns a value in (unit, period) into k per month.
_TO_K_PER_MONTH = {
    ("万", "月"): 10.0,
    ("万", "年"): 10.0 / 12.0,
    ("千", "月"): 1.0,
}


def _print_mean(label, values):
    """Print label and the arithmetic mean of values; print nothing when empty."""
    if values:
        print(label, sum(values) / len(values))


def change_wages_data(data_money):
    """Normalise raw salary strings to a list of mid-point salaries in k/month.

    Recognised formats are "a-b万/月", "a-b万/年", "a-b千/月" and "a万以上/月".
    Anything else (e.g. "以下" entries, daily wages, non-string values) is
    skipped. Per-category averages are printed as a side effect; a category
    with no entries prints nothing, except 万以上/月 which prints a notice.

    Args:
        data_money: iterable of salary strings.

    Returns:
        A list of floats in k/month, ordered by category: 万/月 mid-points,
        万以上/月 floors, 万/年 mid-points, 千/月 mid-points.
    """
    lows = {key: [] for key in _TO_K_PER_MONTH}
    highs = {key: [] for key in _TO_K_PER_MONTH}
    wan_yis_av = []  # raw "万以上/月" figures, still in units of 万

    for text in data_money:
        if not isinstance(text, str):
            continue
        floor = _SALARY_FLOOR.match(text)
        if floor:
            wan_yis_av.append(float(floor.group(1)))
            continue
        span = _SALARY_RANGE.match(text)
        if span is None:
            continue
        key = (span.group(3), span.group(4))
        factor = _TO_K_PER_MONTH.get(key)
        if factor is None:
            # 千/年 is not one of the categories this analysis covers.
            continue
        # Convert numerically; string tricks such as dropping the '.' break
        # on values like "10" or "1.25".
        lows[key].append(float(span.group(1)) * factor)
        highs[key].append(float(span.group(2)) * factor)

    wan_yue_min, wan_yue_max = lows[("万", "月")], highs[("万", "月")]
    wan_nian_min, wan_nian_max = lows[("万", "年")], highs[("万", "年")]
    qian_yue_min, qian_yue_max = lows[("千", "月")], highs[("千", "月")]

    # ---- per-category averages ----
    _print_mean("最低工资平均值(筛选的万/月,单位k):", wan_yue_min)
    _print_mean("最高工资平均值(筛选万/月,单位k):", wan_yue_max)
    if wan_yis_av:
        # Reported in 万, matching the raw figures in the data.
        print("万以上/月的平均工资为:", sum(wan_yis_av) / len(wan_yis_av))
    else:
        print("没有万以上/月的")
    _print_mean("万/年的最低工资平均值(转化为万/月,单位k):", wan_nian_min)
    _print_mean("万/年的最高工资平均值(转化为万/月,单位k):", wan_nian_max)
    _print_mean("最低工资平均值(筛选千/月,单位k):", qian_yue_min)
    _print_mean("最高工资平均值(筛选千/月,单位k):", qian_yue_max)

    # ---- merge every category into one k/month list ----
    avg = [(low + high) / 2 for low, high in zip(wan_yue_min, wan_yue_max)]
    # Open-ended entries contribute their floor, converted from 万 to k.
    avg.extend(value * 10.0 for value in wan_yis_av)
    avg.extend((low + high) / 2 for low, high in zip(wan_nian_min, wan_nian_max))
    avg.extend((low + high) / 2 for low, high in zip(qian_yue_min, qian_yue_max))
    return avg
'''----------------------整理数据的函数--------------------------------'''
#绘制直方图的函数
def hist(avg, city, bins=50):
    """Draw, save and show a histogram of salaries for one city.

    Args:
        avg: sequence of salaries in k/month.
        city: city name, used in the chart title and the output file name.
        bins: number of equal-width bins between min(avg) and max(avg).

    Nothing is drawn when avg is empty.
    """
    if len(avg) == 0:
        # max()/min() of an empty list would raise; there is nothing to plot.
        return
    # Use a fresh figure so an earlier chart is never drawn over.
    fig = pyl.figure()
    pyl.title(city + "wages--count--assortment")
    pyl.xlabel("wages")
    pyl.ylabel("number")
    # An integer bin count spans [min, max] inclusively. Hand-built edges from
    # arange(min, max, step) stop short of max and silently drop the highest
    # salaries, and fail outright when all values are equal (step == 0).
    pyl.hist(avg, bins=bins)
    pyl.savefig("D:/python/爬虫/51job分析/"+city+"-wages.jpg")
    pyl.show()
    pyl.close(fig)
'''------------画好直方图,可进行分析----------------------'''
#绘制饼图的函数
def pie(avg, city):
    """Draw, save and show a pie chart of salary bands for one city.

    Bands (k/month): <5, 5-10, 10-20, 20-30, 30-40, >=40.

    Args:
        avg: sequence of salaries in k/month.
        city: city name, used in the chart title and the output file name.

    Nothing is drawn when avg is empty.
    """
    if len(avg) == 0:
        # An all-zero pie has no meaningful percentages.
        return
    # Lower bounds of bands 2..6; the number of bounds a salary reaches is
    # the index of its band.
    bounds = (5.0, 10.0, 20.0, 30.0, 40.0)
    count = [0] * (len(bounds) + 1)
    for wage in avg:
        wage = float(wage)
        count[sum(wage >= bound for bound in bounds)] += 1
    labels = ['under-5k','5k-10k','10k-20k','20k-30k','30k-40k','over-40k']
    # Pull the 10k-20k slice out for emphasis.
    expl = [0,0,0.1,0,0,0]
    # Use a fresh figure so an earlier chart is never drawn over.
    fig = plt.figure(figsize=(6,6))
    # autopct: percentage format; labeldistance: label offset from the centre.
    plt.pie(count,explode=expl,labels=labels,autopct='%1.2f%%',shadow=True,labeldistance=1.1)
    plt.title(city+"wages-count")
    plt.savefig("D:/python/爬虫/51job分析/"+city+"-wages.png")
    plt.show()
    plt.close(fig)
# Run the clean-up and both charts for every city in turn.
def _analyse_city(city, salaries):
    """Clean one city's salary strings, draw its charts, return the cleaned list."""
    cleaned = change_wages_data(salaries)
    hist(cleaned, city)
    pie(cleaned, city)
    return cleaned


print("北京:")
beijing_avg = _analyse_city("beijing", beijing_money)
print("\n")

print("上海")
shanghai_avg = _analyse_city("shanghai", shanghai_money)
print("\n")

print("深圳")
shenzheng_avg = _analyse_city("shenzheng", shenzheng_money)
print("\n")

print("南京")
nanjing_avg = _analyse_city("nanjing", nanjing_money)
print("\n")

print("杭州")
hangzhou_avg = _analyse_city("hangzhou", hangzhou_money)
print("\n")

print("无锡")
wuxi_avg = _analyse_city("wuxi", wuxi_money)
此程序是之前程序的整合,不多做解释。
以下为可视化图:
北京:
直方图:
饼图:
上海:
直方图:
饼图:
深圳:
直方图:
饼图:
南京:
直方图:
饼图:
杭州:
直方图:
饼图:
无锡:
直方图:
饼图: