python爬虫网页
首先是pip安装：把python安装目录下的Scripts目录中的pip安装器拖进cmd中，然后执行 install
import requests
import bs4

# Fetch page 1 of Douban Movie Top 250 and print each movie title.
# NOTE(review): douban may reject requests without a browser User-Agent — confirm if this fails.
res = requests.get("https://movie.douban.com/top250")
# Name the parser explicitly so bs4 does not emit a "no parser specified" warning.
soup = bs4.BeautifulSoup(res.text, "html.parser")
# class_ has a trailing underscore because "class" is a Python keyword.
targets = soup.find_all("div", class_="hd")
for each in targets:
    # each "hd" div wraps an <a> whose first <span> holds the title
    print(each.a.span.text)
返回结果 >> 网页1的标题内容：
肖申克的救赎
霸王别姬
这个杀手不太冷
阿甘正传
这是书250的排名 小修改下 加了文本输出
import requests
from bs4 import BeautifulSoup as bs

# Scrape Douban Book Top 250 (10 pages x 25 books) and write the results
# to a UTF-8 text file, one formatted record per book.
depth = 10   # the list spans 10 pages
books = []   # each entry: [name, author, price, rate, info, adr]

for page in range(depth):
    start = page * 25  # ?start= paginates in steps of 25
    url = 'https://book.douban.com/top250?start=' + str(start)
    demo = requests.get(url).text
    soup = bs(demo, 'html.parser')
    # Each book row is split across <td> cells; only the text cell's
    # <a> carries a title attribute, so cells without one are skipped.
    for cell in soup.find_all('td'):
        if cell.a.get('title') is None:
            continue
        name = cell.a.get('title')
        # <p> holds "author / publisher / date / price" — take the ends.
        text = cell.p.text.split('/')
        author = text[0]
        price = text[-1]
        rate = cell.find('span', {'class': 'rating_nums'}).text
        # The one-line review span is missing for some books; check the
        # find() result instead of catching AttributeError with a bare except.
        inq = cell.find('span', {'class': 'inq'})
        info = inq.text if inq is not None else '无信息'
        adr = cell.a.get('href')
        books.append([name, author, price, rate, info, adr])

with open("豆瓣读书250.txt", "w", encoding="utf-8") as f:
    # enumerate from 1 so the rank matches the Top-250 position.
    for count, each in enumerate(books, 1):
        booklist = "Top:{}\n书名:{}\n作者:{}\n定价:{}\n评分:{}\n一句话书评:{}\n信息查阅:{}\n"
        f.write(booklist.format(count, each[0], each[1], each[2], each[3], each[4], each[5]))