Python爬取 知乎上“发现”页面的“热门话题”部分
目的:将其问题和答案同样保存成文本形式
import requests
from pyquery import PyQuery as pq
# Scrape the feed items on Zhihu's Explore page and append each item's
# question, author and answer text to explore.txt.
url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
# A timeout stops the script from hanging forever on a stalled connection;
# raise_for_status() turns an HTTP error page into an explicit failure
# instead of silently scraping nothing from it.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
html = response.text
doc = pq(html)
items = doc('.explore-tab .feed-item').items()
# Open the output file once, in append mode, and let the context manager
# close it even if a write raises part-way through the loop.
with open('explore.txt', 'a', encoding='utf-8') as out:
    for item in items:
        question = item.find('h2').text()
        author = item.find('.author-link-line').text()
        # The inner HTML of .content is parsed a second time so that .text()
        # strips the markup it contains (presumably the answer is stored as
        # escaped HTML inside that element -- TODO confirm). .html() yields
        # None when nothing matched, so fall back to an empty answer.
        content_html = item.find('.content').html()
        answer = pq(content_html).text() if content_html else ''
        # str.join returns a new string made of the sequence's elements
        # joined by the given separator.
        out.write('\n'.join([question, author, answer]))
        out.write('\n' + '=' * 50 + '\n')
基础知识:1,利用pyquery库进行爬取,pyquery基本使用(感谢崔老师):https://cuiqingcai.com/5551.html
2,join的用法(感谢菜鸟教程):http://www.runoob.com/python/att-string-join.html
str.join(元组、列表、字典、字符串) 之后生成的只能是字符串。
所以很多地方很多时候生成了元组、列表、字典后,可以用 join() 来转化为字符串。
3,保存为TXT文件的简便方式:使用 with open('explore.txt', 'a', encoding='utf-8') as file: 语句写入,代码块结束后文件会自动关闭,无需再手动调用 close()。
爬取结果:
拓展:利用pyquery库爬取 豆瓣读书 的书籍信息
import requests
from pyquery import PyQuery as pq
# Print the info text of each book entry found on the Douban Books home page.
url = 'https://book.douban.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
# A timeout avoids hanging on a stalled connection; raise_for_status() makes
# a blocked or failed request visible instead of yielding an empty result.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
html = response.text
doc = pq(html)
# NOTE(review): the four class names look like the class list of a single
# <ul> element, so they are chained without spaces here. Separated by spaces
# they form a descendant selector that needs four nested elements and would
# match nothing. Each <li> under that list is assumed to be one book --
# TODO confirm against the live page markup.
items = doc('.list-col.list-col5.list-express.slide-item li').items()
for item in items:
    info = item.find('.info').text()
    print(info)
# TODO: once the selector is confirmed, append the results to
# 'explore_test_one.txt' using a `with open(...)` block.
一直失败……等过一阵再看看。