# python 网络爬虫 01
from urllib import request
from urllib import parse
# urlretrieve: download the resource at a URL and save it to a local file.
request.urlretrieve('https://www.baidu.com', 'baidu.html')
# urlencode: serialize a dict of query parameters into a percent-encoded
# query string (non-ASCII text is UTF-8 encoded, spaces become '+').
query = {'name': '张三', 'age': 18, 'greet': 'hello world'}
result = parse.urlencode(query)
print(result)
# Build a search URL by appending an encoded query string, then fetch
# it with urlopen() and print the raw response bytes.
params = {"wd": "刘德华"}
qs = parse.urlencode(params)
url = 'http://www.baidu.com/s' + "?" + qs
resp = request.urlopen(url)  # urlopen() opens the URL
print(resp.read())
# parse_qs: decode a percent-encoded query string back into a dict that
# maps each key to a *list* of string values.
params = {'name': '张三', 'age': 18, 'greet': 'hello world'}
qs = parse.urlencode(params)
decoded = parse.parse_qs(qs)  # fixed: variable was misspelled `requset`, too close to the `request` module name
print(decoded)
# urlparse() splits a URL into its components (scheme, netloc, path,
# params, query, fragment); urlsplit() is identical except that it has
# no `params` component.
url = 'https://baike.baidu.com/item/刘德华/114923?fr=aladdin'
result = parse.urlparse(url)
print(result)
# 实战拉勾网,因为拉勾网的反爬虫机制做得比较麻烦。
# request.Request lets us attach request headers (e.g. User-Agent and
# Referer) so the request looks like it comes from a real browser --
# necessary for sites with anti-scraping checks such as lagou.com.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
}
data = {
    'first': 'true',
    'pn': 1,
    'kd': 'python',
}
# POST bodies must be bytes: run the dict through urlencode(), then
# encode the resulting str as UTF-8.
req = request.Request(  # fixed: was `req =req = ...` (duplicated assignment)
    url,
    headers=headers,
    data=parse.urlencode(data).encode('utf-8'),
    method='POST',
)
resp = request.urlopen(req)
print(resp.read().decode('utf-8'))  # decode the bytes response as UTF-8
# 首先,要找到拉勾网真正的网页源代码
# 加蓝部分并不是,下面才是
# 这些信息是我们要的,还有下面的
# 这些我们添加到代码中,足以伪装成浏览器访问了
# 最后看一下效果
# 把拉勾网爬取后,我们可以爬取小说网了,一般比较简单,基本没有反爬虫机制。
# Fetch a chapter page from a novel site. Such sites usually have weak
# or no anti-scraping protection, so a User-Agent plus Referer header
# is enough; a plain GET request needs no `data` body.
url = 'https://www.booktxt.net/0_31/12784.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36',
    'Referer': 'https://www.booktxt.net/0_31/',
}
req = request.Request(url, headers=headers, method='GET')  # fixed: was `req =req = ...` (duplicated assignment)
resp = request.urlopen(req)
# 再看看效果,为后面爬取一整本小说打基础(笑一笑)