Web scraper: batch-collecting image URLs from a live streaming platform and downloading them (repost)
import os
import time

import requests
from selenium import webdriver


class HuYa(object):

    def __init__(self):
        # Listing page of the live streaming platform; the "x" is a placeholder, fill in the real address yourself
        self.start_url = "https://www.hxyx.com/l"
        self.driver = webdriver.Chrome()
        self.part_url = "https:"  # scheme prefix for protocol-relative image URLs
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
        }
        self.img_count = 0  # running file index, so later pages do not overwrite earlier downloads

    def run(self):
        # 1. Prepare start_url
        # 2. Send the request and get the response
        self.driver.get(self.start_url)
        # 3. Extract the data
        content_list, next_btn = self.get_content_url()
        # 4. Save the data
        self.save_content(content_list)
        # 5. Request the next page until there is no "next" button left
        while next_btn is not None:
            next_btn.click()
            time.sleep(3)
            # Extract the data
            content_list, next_btn = self.get_content_url()
            # Save the data
            self.save_content(content_list)

    def get_content_url(self):
        li_list = self.driver.find_elements_by_xpath("//ul[@id='js-live-list']/li")
        content_list = []
        for li in li_list:
            # The cover images are lazy-loaded, so the real URL sits in the data-original attribute;
            # strip the query string, then prepend the scheme if the URL is protocol-relative
            image = li.find_element_by_xpath(".//a/img[@class='pic']").get_attribute("data-original").split("?")[0]
            if not image.startswith("https:"):
                image = self.part_url + image
            print(image)
            content_list.append(image)
        # The "next page" button; an empty result means this is the last page
        next_url = self.driver.find_elements_by_xpath("//a[@class='laypage_next']")
        next_url = next_url[0] if len(next_url) > 0 else None
        return content_list, next_url

    def save_content(self, content_list):
        os.makedirs("./image", exist_ok=True)  # make sure the target directory exists
        # enumerate yields (index, item) pairs; start from the running count so file names keep increasing across pages
        for i, img_url in enumerate(content_list, start=self.img_count):
            response = requests.get(img_url, headers=self.headers)
            with open("./image/huya" + str(i) + ".png", "wb") as f:
                f.write(response.content)
        self.img_count += len(content_list)


if __name__ == '__main__':
    huya = HuYa()
    huya.run()
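
The listing above uses the Selenium 3 element-lookup helpers (find_element_by_xpath / find_elements_by_xpath), which were removed in Selenium 4. If you are on a newer Selenium, here is a minimal sketch of the same lookups with the current find_element(By.XPATH, ...) style, as a drop-in replacement for the get_content_url method, assuming the page structure is unchanged:

from selenium.webdriver.common.by import By

    def get_content_url(self):
        # Selenium 4 style: find_element(s)(By.XPATH, ...) replaces find_element(s)_by_xpath
        li_list = self.driver.find_elements(By.XPATH, "//ul[@id='js-live-list']/li")
        content_list = []
        for li in li_list:
            image = li.find_element(By.XPATH, ".//a/img[@class='pic']").get_attribute("data-original").split("?")[0]
            if not image.startswith("https:"):
                image = self.part_url + image
            content_list.append(image)
        next_url = self.driver.find_elements(By.XPATH, "//a[@class='laypage_next']")
        return content_list, next_url[0] if next_url else None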
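
The original script also never closes the Chrome window it opens. A small, hypothetical tweak to the entry point (not in the original post) that shuts the driver down even when a download fails:

if __name__ == '__main__':
    huya = HuYa()
    try:
        huya.run()
    finally:
        huya.driver.quit()  # close the browser whether or not run() raised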
---------------------
Author: 荒城以北
Source: ****
Original post: https://blog.****.net/weixin_44090435/article/details/86499413
Copyright notice: This is the blogger's original article; please include a link to the original post when reposting.