使用beautifulsoup4,爬取一波贴吧的表情包
爱斗图的朋友们,一定不要错过这个技能!!!
环境:
python==3.7
安装依赖包:
pip install beautifulsoup4
(注意:下面的代码使用的是 Python 标准库自带的 urllib.request,无需额外安装;urllib3 是另一个第三方库,本脚本并不依赖它)
# _*_ coding:utf-8 _*_
import os
import urllib
import urllib.request
from bs4 import BeautifulSoup
import re
# 获取内容
def get_content(url):
    """Fetch ``url`` and return the response body as text.

    The body is decoded as UTF-8; byte sequences that are not valid UTF-8
    are dropped instead of raising.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        The decoded response body as a ``str``.
    """
    # The context manager closes the response even when read()/decode() raises,
    # which a manual close() after the read would not.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8', 'ignore')
#获取帖子的页数
def get_page_num(url):
    """Return the number of pages of the thread at ``url``.

    The count is read from the ``pn`` query parameter of the "尾页"
    (last page) link, which looks like
    ``<a href="/p/2314539885?pn=31">尾页</a>``.

    Args:
        url: Address of the thread's first page.

    Returns:
        The page count as an ``int``; ``1`` when the page contains no
        last-page link.
    """
    content = get_content(url)
    # [^"]* keeps the match inside a single href attribute and \d+ captures
    # only the number, so several links on one line cannot be swallowed into
    # the capture group (a greedy ".*" would make int() fail on them).
    match = re.search(r'<a href="/p/[^"]*?pn=(\d+)">尾页</a>', content)
    if match is None:
        # No last-page link found -- presumably a single-page thread
        # (TODO confirm against the site); treat it as one page instead of
        # failing with IndexError.
        return 1
    return int(match.group(1))
# 保存图片
def get_images(info, photo_path, npage, photo_format):
    """Download every ``<img class="BDE_Image">`` found in ``info``.

    Each image is saved as ``<npage>_<index><photo_format>`` inside
    ``photo_path``, where ``index`` starts at 0 for each call.

    Args:
        info: HTML text of one page.
        photo_path: Directory to save into; created when missing.
        npage: Page number, used in file names and in the progress output.
        photo_format: File extension including the dot, e.g. ``'.gif'``.

    Returns:
        The number of matching ``img`` tags found in ``info``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(photo_path, exist_ok=True)
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so results could differ between machines.
    soup = BeautifulSoup(info, 'html.parser')
    # All img tags carrying class="BDE_Image".
    all_img = soup.find_all('img', class_="BDE_Image")
    for index, img in enumerate(all_img):
        file_name = "{}_{}{}".format(npage, index, photo_format)
        # os.path.join keeps the file inside photo_path even when the caller
        # omits the trailing separator.
        image_name = os.path.join(photo_path, file_name)
        urllib.request.urlretrieve(img['src'], image_name)
        # Progress output is 1-based.
        print("find photo page %d NO. %d" % (npage, index + 1))
    return len(all_img)
# 主函数 可改动的参数
# URL——贴吧路径
# photo-path——图片保存的路径
# 图片格式——'.jpg' '.png' '.gif'
if __name__ == '__main__':
    # Tunable parameters: thread URL, output directory and the extension
    # given to saved files (passed to get_images below).
    url = 'https://tieba.baidu.com/p/5997922755'
    photo_path = "./0311/"
    page_num = get_page_num(url)
    # Pages are numbered from 1 in the "pn" query parameter.
    for page in range(1, page_num + 1):
        page_url = url + '?pn=' + str(page)
        # Fetch the paginated URL; requesting the bare thread URL here would
        # download page 1 again on every iteration.
        info = get_content(page_url)
        print(get_images(info, photo_path, page, '.gif'))
贴吧网址为:https://tieba.baidu.com/p/5997922755
结果: