# Python 爬取图片(封装代码)
import requests
import re, os
from lxml import etree
from urllib import request
import json
# Endpoint of the paid Mogu proxy API; the query string asks for 20 proxies.
# NOTE(review): the appKey is embedded in source control -- consider loading
# it from configuration or the environment instead.
MOGU_PROXY_URL = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=fc1a46b572d54ca0a12f375eceb3b5e8&count=20&expiryDate=0&format=1&newLine=2'
# Presumably a self-hosted free proxy pool on the local network -- TODO confirm.
FREE_PROXY_URL = 'http://192.168.221.221:5010/get/'
# Home page of the image site that is crawled.
TIANTANG_INDEX_URL = 'http://www.ivsky.com/'
# Timeout for proxy lookups (presumably seconds, as used by requests).
GET_PROXY_TIMEOUT = 2
def get_mogu_proxies():
    """
    Request paid proxies from the Mogu API and format them for ``requests``.

    :return: a list of single-entry dicts such as
        ``{'https': 'https://1.2.3.4:8080'}``, each usable as the ``proxies``
        argument of ``requests.get``. The list is empty when the request
        fails, the API answers with a non-200 status, or the payload does
        not have the expected shape.
    """
    try:
        resp = requests.get(MOGU_PROXY_URL, timeout=GET_PROXY_TIMEOUT)
    except requests.RequestException as e:
        # ``resp`` is never bound when the request itself raises, so only
        # the exception can be reported here.
        print("获取代理失败", e)
        return []

    if resp.status_code != 200:
        print("获取代理失败", resp.status_code)
        return []

    proxy_type = 'https'
    try:
        raw_proxies = json.loads(resp.text)['msg']
        # Build the mapping format the requests package expects for proxies:
        # {scheme: 'scheme://ip:port'}.
        return [
            {proxy_type: 'https://%s:%s' % (proxy['ip'], proxy['port'])}
            for proxy in raw_proxies
        ]
    except (ValueError, KeyError, TypeError) as e:
        # Invalid JSON, a missing key, or a 'msg' value that is not a list
        # of {'ip': ..., 'port': ...} records.
        print("获取代理失败", e)
        return []
class IvskySpider(object):
    """
    Crawl list pages of ivsky.com and download the images they link to.

    Starting from ``self.url``, each list page is parsed for gallery links.
    One directory per gallery title is created in the current working
    directory, and the images found on the gallery page are saved into it as
    ``<count>.jpg``, where ``count`` is a running total across all galleries.
    """

    # Prefix that turns the site-relative hrefs found in pages into full URLs.
    BASE_URL = 'http://www.ivsky.com'
    # Seconds to wait for a page so a stalled connection cannot hang the crawl.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # List page the crawl starts from; index_html() advances it.
        self.url = 'http://www.ivsky.com/tupian/ziranfengguang/index_2.html'
        # Parsed document of the most recently fetched list page.
        self.html = None
        # Title of the gallery being processed; doubles as its directory name.
        self.title = ''
        # URL of the gallery (detail) page being processed.
        self.url_get = ''
        # Number of images downloaded so far; used as the file name.
        self.count = 0
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
        # Fetch the start page right away (see create_directry).
        self.create_directry()

    def get_html(self, url):
        """
        Download *url* and store the parsed document in ``self.html``.

        :param url: address of the page to fetch.
        """
        response = requests.get(url=url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        self.html = etree.HTML(response.text)

    def create_directry(self):
        """
        Fetch the start page so that ``self.html`` is populated.

        Despite its name this method creates no directory; directories are
        created per gallery in ``parse_html``. The name is kept unchanged so
        existing callers keep working.
        """
        self.get_html(self.url)

    def parse_html(self):
        """
        Visit every gallery linked from the current list page.

        For each gallery link, the directory named after the link's title is
        created if needed and the gallery's images are downloaded into it.
        """
        # Read href and title from the same <a> element so each gallery is
        # saved under its own title rather than under the last one on the page.
        for anchor in self.html.xpath('//ul/li/div[@class="il_img"]/a'):
            href = anchor.get('href')
            title = anchor.get('title')
            if not href or not title:
                continue
            self.title = title
            os.makedirs(title, exist_ok=True)
            self.url_get = self.BASE_URL + href
            self.html_content(title)

    def html_content(self, path):
        """
        Download every image found on the gallery page ``self.url_get``.

        Files are written into the directory named by ``self.title``.

        :param path: ignored; kept so the signature stays backward compatible.
        """
        response = requests.get(url=self.url_get, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        # Parse into a local so the list page held in self.html is not lost.
        detail_html = etree.HTML(response.text)
        for pic in detail_html.xpath('//ul/li/div[@class="il_img"]//img/@src'):
            if pic.startswith('//'):
                # urlretrieve cannot open a URL that has no scheme.
                pic = 'http:' + pic
            self.count += 1
            print('正在下载第%s张图片,请稍后。。。。' % self.count)
            target = self.title + '/' + '%s.jpg' % self.count
            request.urlretrieve(pic, target)

    def index_html(self):
        """
        Point ``self.url`` at the next list page, if the current page has one.

        :return: ``True`` when a next-page link was found and ``self.url``
            was updated, ``False`` otherwise (``self.url`` is left untouched).
        """
        page = '//div[@class="pagelist"]/a[@class="page-next"]/@href'
        page_next = self.html.xpath(page)
        if not page_next:
            return False
        for p_next in page_next:
            self.url = self.BASE_URL + p_next
            print(self.url)
        return True

    def run(self, max_pages=9):
        """
        Crawl up to *max_pages* consecutive list pages.

        :param max_pages: upper bound on the number of list pages to process.
        """
        for _ in range(max_pages):
            print('-' * 12)
            self.get_html(url=self.url)
            # Resolve the next page first, while self.html is the list page.
            has_next = self.index_html()
            self.parse_html()
            if not has_next:
                # Without a next page self.url is unchanged; stop rather than
                # crawl the same page again.
                break
if __name__ == '__main__':
    # Run the crawl only when the file is executed as a script, not on import.
    ivsky = IvskySpider()
    ivsky.run()
# 本脚本用于爬取天堂图片网(ivsky.com)的图片:使用 requests、lxml、os 等包获取网页中的图片信息,然后将图片下载到本地。