A Beautiful Soup crawler: scraping job listings from Zhaopin (智联招聘) and saving them to a database
I'm currently an undergraduate and a complete beginner… this is my first write-up, so please forgive its shortcomings.
Preparation
The Zhaopin website
Let's search for "python".
The site redirects to its new search results page.
Let's look at the page source.
The data we need is nowhere to be found.
At first I didn't quite believe it, so I tried fetching the page with requests:
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    'Host': 'sou.zhaopin.com',
    'Referer': 'https://www.zhaopin.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
url = 'https://sou.zhaopin.com/?pageSize=60&jl=530&kw=python&kt=3'
response = requests.get(url, headers=headers)
print(response.text)
Sure enough, the response contains none of the data we need: the new search page fills in its results with JavaScript after the page loads, so they never appear in the raw HTML.
The simplest workaround is to fall back to the old version of the search page.
Its URL looks like this: https://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=python&sm=0&p=1
The city (jl), keyword (kw), and page number (p) are all right there in the query string.
Let's try it:
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    'Host': 'sou.zhaopin.com',
    'Referer': 'https://www.zhaopin.com/',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'ZP_OLD_FLAG=true'
}
url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=python&sm=0&p=1'
response = requests.get(url, headers=headers)
print(response.text)
The ZP_OLD_FLAG cookie tells the site to serve the old-version page.
This time the response really does contain the job listings; I won't paste a screenshot here.
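Because the query string carries a Chinese city name, it can be convenient to build the URL from a parameter dict rather than pasting the characters in by hand. A minimal sketch, assuming only the jl/kw/sm/p parameters seen in the URL above (the helper name is my own):

from urllib.parse import urlencode

def build_search_url(city, keyword, page):
    # urlencode percent-encodes the Chinese city name automatically
    params = {'jl': city, 'kw': keyword, 'sm': 0, 'p': page}
    return 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(params)

print(build_search_url('北京', 'python', 1))

requests applies the same encoding if you pass params= to requests.get, so the hand-built URL used in the code below works just as well.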
The code
I keep a few settings in a JSON file, spider.json, so they are easy to change:
{
    "host": "localhost",
    "user": "root",
    "password": "",
    "dbname": "vacation",
    "port": 3306,
    "city": "北京",
    "keyword": "python",
    "page": 90,
    "Cookie": "ZP_OLD_FLAG=true;"
}
The crawler script:
from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException
import pymysql
import json

# Load the settings from spider.json
with open("spider.json", encoding='utf-8') as f:
    setting = json.load(f)
host = setting['host']
user = setting['user']
password = setting['password']
dbname = setting['dbname']
port = setting['port']
city = setting['city']
keyword = setting['keyword']
pagenum = setting['page']
Cookie = setting['Cookie']
def get_one_page(city, keyword, page):
    '''
    Fetch one page of search results and return its HTML.
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        'Host': 'sou.zhaopin.com',
        'Referer': 'https://www.zhaopin.com/',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': Cookie
    }
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&sm=0&p={}'.format(city, keyword, page)
    try:
        # Fetch the page; use the status code to decide whether it succeeded
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def readonepage(html, db):
    '''
    Parse one search-results page and insert every job posting into the database.
    '''
    cur = db.cursor()
    soup = BeautifulSoup(html, 'lxml')
    for x in soup.find_all('td'):
        try:
            sybo = x.get('class')
            if sybo == ['zwmc']:
                jobname = x.div.a.get_text()   # job title
                jobhref = x.div.a.get('href')  # link to the detail page
                if jobhref[9] == 'i':
                    continue                   # skip links whose detail page uses a different layout
                info = get_detailed(jobhref)
                info.append(jobname)
                print(jobname)
                # Parameterized INSERT so quotes in the scraped text cannot break the SQL
                sql = ("INSERT INTO companyinfo(company_name,work_experience,edu_background,salary,"
                       "describes,work_city,work_address,nature,types,scales,url,benefits,station,station_id) "
                       "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
                cur.execute(sql, info + [counterid()])
                db.commit()
        except Exception:
            pass
def counterid(last=[0]):
    # Running counter used as the database id: the mutable default list
    # persists between calls, so each call returns the next integer.
    next = last[0] + 1   # take the stored value and add one
    last[0] = next       # write it back into the list
    return next          # the number of times this function has run
def get_detailed(href):
    '''
    Fetch a job's detail page and return the extracted fields as a list.
    '''
    res = requests.get(href)
    soup = BeautifulSoup(res.text, 'lxml')
    for x in soup.find_all('ul'):
        try:
            sybo = x.get('class')
            if sybo == ['terminal-ul', 'clearfix']:
                jobinfor = x.get_text()
                fields = jobinfor.split('\n')
                salary = fields[1].split(':')[1]        # salary
                salary = "".join(salary.split())        # strip whitespace
                city = fields[2].split(':')[1]          # city
                exp = fields[5].split(':')[1]           # work experience
                edu = fields[6].split(':')[1]           # education
                # numb = fields[7].split(':')[1]        # number of openings
        except Exception as e:
            print(e)
    for x in soup.find_all('div'):
        try:
            sybo = x.get('class')
            if sybo == ['company-box']:
                items = x.get_text().split('\n')
                while '' in items:
                    items.remove('')
                if '查看公司地图' in items:
                    items.remove('查看公司地图')
                comname = items[0]                      # company name
                scale = items[1].split(':')[1]          # company size
                nature = items[2].split(':')[1]         # ownership: private, state-owned, ...
                jobtype = items[3].split(':')[1]        # industry: IT, education, and so on
                place = items[-1]                       # street address
                if len(items) == 6:                     # some companies list no website
                    website = ' '
                else:
                    website = items[4].split(':')[1]    # company website
            if sybo == ['tab-inner-cont']:
                sty = x.get('style')
                if sty is None:
                    descrip = x.get_text().split('\n')[1]   # job description
                    descrip = "".join(descrip.split())      # strip whitespace
            if sybo == ['welfare-tab-box']:
                fuli = ''
                for elem in x:
                    fuli = fuli + elem.string + ' '         # benefits
        except Exception as e:
            print(e)
    return [comname, exp, edu, salary, descrip, city, place, nature, jobtype, scale, website, fuli]
def main(city, keyword, pages):
    db = pymysql.connect(host=host, user=user, password=password, db=dbname, port=port)
    for i in range(1, pages + 1):   # the old-version search pages are numbered starting at 1
        html = get_one_page(city, keyword, i)
        if html:                    # skip pages that failed to download
            readonepage(html, db)
    db.close()

if __name__ == '__main__':
    main(city, keyword, pagenum)
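A quick note on counterid: because the default argument last=[0] is created once and shared across calls, the function acts as a persistent counter, which is what gives each inserted row its sequential station_id:

print(counterid())  # 1
print(counterid())  # 2
print(counterid())  # 3

The counter resets every time the script is restarted, so rerunning the crawler into the same table would reuse ids; an AUTO_INCREMENT column would avoid that.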
Database structure
The information stored in the database
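For reference, a companyinfo table matching the column list in the INSERT statement above could look roughly like this; the column types and lengths are my own guesses, not necessarily the schema used here:

CREATE TABLE companyinfo (
    company_name    VARCHAR(255),
    work_experience VARCHAR(64),
    edu_background  VARCHAR(64),
    salary          VARCHAR(64),
    describes       TEXT,
    work_city       VARCHAR(64),
    work_address    VARCHAR(255),
    nature          VARCHAR(64),
    types           VARCHAR(64),
    scales          VARCHAR(64),
    url             VARCHAR(255),
    benefits        VARCHAR(255),
    station         VARCHAR(255),
    station_id      INT
) DEFAULT CHARSET=utf8mb4;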