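# Scrape Taobao shop images with Python using regular expressions.
# Approach: inspect the page source, capture the request URL, then derive the regex.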
#导入模块
import random
import re
import urllib.parse
import urllib.request
# Set up a proxy pool and pick one proxy at random
def open_url(url):
    """Fetch ``url`` through a randomly chosen proxy and return the page text.

    Args:
        url: Address of the page to download. Non-ASCII characters (for
            example a Chinese search keyword) are percent-encoded before the
            request is sent; existing ``%XX`` escapes are left untouched.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails (including proxy errors).
        UnicodeDecodeError: If the response body is not valid UTF-8.

    Side effects:
        Installs the proxy-enabled opener as the process-wide default, so
        later ``urllib.request.urlopen`` / ``urlretrieve`` calls use it too.
    """
    # http.client encodes the request line as ASCII, so a raw non-ASCII URL
    # raises UnicodeEncodeError. Keep reserved characters and '%' as-is so an
    # already-encoded URL is not double-encoded.
    safe_url = urllib.parse.quote(url, safe="!#$%&'()*+,/:;=?@[]~")

    req = urllib.request.Request(safe_url)
    # Present a desktop browser User-Agent instead of the default Python one.
    req.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    )

    # NOTE(review): only the "http" scheme is mapped, so ProxyHandler does not
    # route https:// requests through these proxies - confirm this is intended.
    proxy_list = [
        {"http": "222.188.178.42:9999"},
        {"http": "163.204.243.186:9999"},
        {"http": "36.99.215.29:9999"},
        {"http": "120.24.173.214:80"},
        {"http": "112.85.165.40:9999"},
        {"http": "171.80.2.137:9999"},
        {"http": "113.128.30.247:808"},
    ]
    proxy = random.choice(proxy_list)
    proxy_support = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)

    # Close the connection deterministically once the body has been read.
    with opener.open(req) as res:
        return res.read().decode("utf-8")
def get_img(html):
    """Download every 260x260 product thumbnail referenced in ``html``.

    Scans the page source for ``https://img.alicdn.com/imgextra/i4...``
    image URLs ending in ``jpg_260x260.jpg`` and saves each one into the
    current working directory, named after the last path segment of its URL.

    Args:
        html: Page source to scan.

    Returns:
        None. Nothing is downloaded when no URL matches.

    Raises:
        urllib.error.URLError: If a download fails.
    """
    # Dots are escaped so they only match literal dots, and the middle part is
    # limited to characters that can appear inside a URL so a match cannot run
    # from one URL's prefix to a different URL's suffix.
    pattern = r"""https://img\.alicdn\.com/imgextra/i4[^\s"'<>]*?jpg_260x260\.jpg"""

    # dict.fromkeys drops repeated URLs while keeping first-seen order; a
    # repeat would only re-download and overwrite the same file.
    for img_url in dict.fromkeys(re.findall(pattern, html)):
        filename = img_url.rsplit("/", 1)[-1]
        urllib.request.urlretrieve(img_url, filename)
# Script entry point: fetch one Taobao search-result page and save its
# product thumbnails. Runs only when the file is executed directly.
if __name__ == "__main__":
    url = (
        "https://re.taobao.com/search_ou?keyword=夏季女装&catid="
        "&refpid=mm_26632258_3504122_32538762&_input_charset=utf8"
        "&clk1=ed0c7d3f4b4060c7b9bb6e834b2ce444"
        "&spm=a2e15.8261149.07626516005.5.567e29b4qSfXMC"
    )
    get_img(open_url(url))