通过关键词搜索,用一个简单的 Python 爬虫爬取洋码头网站推荐页的商品名称和价格信息。
直接上源码:
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 17 15:18:50 2018
@author: Administrator
"""
import random
import logging
import sys
from urllib import request
from urllib import error
import time
import re
#import jieba
# Script-wide logger: every record goes both to "testLogger.log" and to stdout.
logger = logging.getLogger("testLogger")
# logging.getLogger returns the same object for the same name, so when this
# file is executed again in a live interpreter (IDE console, reload) the
# handlers from the previous run are still attached.  Detach and close them
# first; otherwise every message is emitted once per earlier run and the old
# log-file handles stay open.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()
# Output format shared by both handlers: timestamp, level, message.
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
# File handler; the encoding is pinned so the log does not depend on the
# platform's default code page.
file_handler = logging.FileHandler("testLogger.log", encoding="utf-8")
file_handler.setFormatter(formatter)
# Console handler writing to stdout with the same format.
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(formatter)
# Records below INFO are discarded.
logger.setLevel(logging.INFO)
# Attach the file handler first, then the console handler.
logger.addHandler(file_handler)
logger.addHandler(console_handler)
# Bounds of the random draw that downloadHtml compares against its
# userProxyRate argument.  With both bounds at 1 and the default rate of 6,
# the draw never exceeds the rate, so the proxy setting is never dropped.
minRangeForProxy, maxRangeForProxy = 1, 1

# Request headers as (name, value) pairs, in the shape expected by
# OpenerDirector.addheaders.
headers = [
    ('authority', 'www.ymatou.com'),
    ('method', 'GET'),
    ('path', '/products?k=%E9%A6%99%E5%A5%88%E5%84%BF'),
    ('scheme', 'https'),
    (
        'accept',
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8',
    ),
    # Disabled entry kept from the original
    # (NOTE(review): presumably off because the body is decoded without
    # decompression -- confirm):
    # ('accept-encoding','deflate, br'),
    ('accept-language', 'zh-CN,zh;q=0.9'),
    ('referer', 'http://www.ymatou.com/'),
    ('upgrade-insecure-requests', '1'),
    (
        'user-agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    ),
]

# Search URL prefix; the URL-quoted keyword is appended to it.
yangmatou = 'http://www.ymatou.com/products?k='
def downloadHtml(url, headers=None, proxy={'https': ''},
                 userProxyRate=6, timeout=None, decodeInfo="utf-8", num_retries=5):
    """Download ``url`` and return its body as text, or ``None`` on failure.

    Features:
      * custom HTTP request headers (including the User-Agent);
      * optional proxy configuration;
      * a socket timeout;
      * a configurable text encoding for the response body;
      * retrying when the server answers with a 5xx status.

    Args:
        url: Address to fetch.
        headers: List of ``(name, value)`` pairs sent with the request;
            ``None`` means no extra headers.
        proxy: Mapping handed to ``urllib.request.ProxyHandler``.  It is only
            read, never modified, so the shared default dict is safe.
        userProxyRate: Threshold for the random draw between the module-level
            ``minRangeForProxy`` and ``maxRangeForProxy``; when the draw is
            greater, ``proxy`` is replaced by ``None`` for this call.
        timeout: Timeout in seconds passed to the opener (``None`` = no
            timeout).
        decodeInfo: Codec used to decode the response bytes.
        num_retries: Maximum number of additional attempts after a 5xx
            response.

    Returns:
        The decoded page, or ``None`` if the download or the decoding failed.
    """
    # Function-scope import: only needed for the exception type caught below.
    import http.client

    if headers is None:
        headers = []
    if random.randint(minRangeForProxy, maxRangeForProxy) > userProxyRate:
        proxy = None
        print('no proxy')

    # The opener is used directly rather than installed with install_opener(),
    # so other urllib users in the same process keep their own settings.
    opener = request.build_opener(request.ProxyHandler(proxy))
    opener.addheaders = headers

    try:
        # The context manager closes the response even if read/decode fails.
        with opener.open(url, timeout=timeout) as res:
            return res.read().decode(decodeInfo)
    except UnicodeDecodeError:
        logger.error("UnicodeDecodeError")
    except error.URLError as e:
        # HTTPError is a subclass of URLError, so this clause covers both;
        # only HTTPError carries a ``code`` attribute.
        logger.error('Download Error')
        status = getattr(e, 'code', None)
        if num_retries > 0 and status is not None and 500 <= status < 600:
            # Back off only when another attempt is actually going to happen.
            time.sleep(random.randint(1, 3))
            return downloadHtml(url, headers, proxy, userProxyRate,
                                timeout, decodeInfo, num_retries - 1)
    except (OSError, ValueError, http.client.HTTPException):
        # Failures urllib does not wrap in URLError (timeout while reading,
        # truncated response, malformed URL).  They still yield None, as
        # before, but KeyboardInterrupt and programming errors now propagate
        # instead of being swallowed by a ``return`` inside ``finally``.
        logger.error('Download Error')
    return None
# Headers used by getInfo when the caller supplies none.  Kept as a tuple so
# the defaults cannot be modified between calls.
_GETINFO_DEFAULT_HEADERS = (
    ('authority', 'www.ymatou.com'),
    ('method', 'GET'),
    ('path', '/products?k=%E9%A6%99%E5%A5%88%E5%84%BF'),
    ('scheme', 'https'),
    ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
    ('accept-language', 'zh-CN,zh;q=0.9'),
    ('referer', 'http://www.ymatou.com/'),
    ('upgrade-insecure-requests', '1'),
    ('user-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'),
)

# Captures (title attribute of the "product-img" link, digits following the
# currency-unit <em>).  Raw string so that \s and \d reach the regex engine
# without triggering invalid-escape warnings.
_PRODUCT_PATTERN = re.compile(
    r'class="product-img" title="([\s\S]*?)"[\s\S]*?<em class="unit">¥</em>(\d*)')


def getInfo(something, url='http://www.ymatou.com/products?k=', headers=None):
    """Search for ``something`` and return the products found on the page.

    Args:
        something: Search keyword, e.g. '香奈儿'.
        url: URL prefix; the URL-quoted keyword is appended to it.
        headers: Optional list of ``(name, value)`` request headers.  The
            list is not modified; a copy with an up-to-date ``path`` entry is
            sent.  ``None`` selects the built-in defaults.

    Returns:
        A list of ``(title, price)`` string tuples, in page order.  The list
        is empty when the download failed or nothing matched.
    """
    # Function-scope import: urllib.parse is the documented home of quote().
    from urllib.parse import quote

    keyword = quote(something)
    path_header = ('path', '/products?k=' + keyword)

    base_headers = _GETINFO_DEFAULT_HEADERS if headers is None else headers
    # Build a fresh list: replace the 'path' entry by name instead of
    # overwriting index 2 of the caller's (or the default) list in place.
    request_headers = [path_header if entry[0] == 'path' else entry
                       for entry in base_headers]
    if path_header not in request_headers:
        request_headers.append(path_header)

    html = downloadHtml(url=url + keyword, headers=request_headers)
    if html is None:
        # downloadHtml returns None on failure; there is nothing to parse.
        return []
    return _PRODUCT_PATTERN.findall(html)
#提取信息如下格式:
#<li class="product-item " module_index keyword="香奈儿" sproductid="a99b2c0f-1b6a-44a4-917f-8f1e8c8d2e0c">
# <a href="//www.ymatou.com/product/a99b2c0f-1b6a-44a4-917f-8f1e8c8d2e0c.html" target="_blank" class="product-img" title="Chanel香奈儿丝绒唇釉 雾面短管唇釉6ml 140色">
# <img class="lazy" alt="Chanel香奈儿丝绒唇釉 雾面短管唇釉6ml 140色" src="data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEARwBHAAD/2wBDAAYEBAQFBAYFBQYJBgUGCQsIBgYICwwKCgsKCgwQDAwMDAwMEAwODxAPDgwTExQUExMcGxsbHCAgICAgICAgICD/2wBDAQcHBw0MDRgQEBgaFREVGiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICD/wAARCADSANIDAREAAhEBAxEB/8QAGwABAAMBAQEBAAAAAAAAAAAAAAMEBQIBBgj/xAA4EAABAwIEAwQIBQQDAAAAAAAAAQIDBBEFEiExE0FRIjJhcRQjNEKhscHRFXKBkeEkM1JzQ1PS/8QAFAEBAAAAAAAAAAAAAAAAAAAAAP/EABQRAQAAAAAAAAAAAAAAAAAAAAD/2gAMAwEAAhEDEQA/AP1SAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABHNM2JE0VznLZjE3VQKXp9TLJw2tbBqrUe7tork5X0AtU1Q56ujlTLNH3kTZb7KgE4AAAAAAAAAAAAAAAAAAAAAAAAAAAKUr/AOvc7lBCrk81/gCFqQ/hTWuka19s7VVbLmvdAOHYlTJNDOi3dlyzNRP1+CgTMxukXdHN80+wFqCspp3ZYn5nIl1Syp8wJgAAAAAAAAAAAAAAAAAAAAAAHMr8kT3/AOKKv7AYv4rXTPRkdmOctkRE+q3AqTPqeI/iuVH7PvoBFoB5oAuBewZbVqeLVQDfAAAAAAAAAAAAAAAAAAAAAAAQVy2o5vyL8UAwKH2yH86fMCTFUtXyeNl+CARtoqh9Px2JmZ0TfTwA9p6Gaoie+Oy5Pd5gcQ075Z2w2s5Vsvh1Auw0j6PEYUc5HZlW1t9UtsBqPraRjla6REcm6AR/ilHxWx575vf5IB4uL0KLbMv7KBw3GaVZsmqM/wCxeoF8CpNiVPDPwZLovN1tAFViUNPkVUztk1RzbATwTNmibK3uu2uBIAAAAAAAAAAAAACria2oZfL5qBg0ftcP+xvzAt4y3+tTlmamvwA7Z6RhiuunEhkTsqm2YD2lnpYHrLJI9kz09YzLpdf0Atsjhga+tWVzuI1LvVNddgKqSYcsjHpI98/EaudU8duSWA9xxsTcio1OI9VVXc9ALFBTUqRRo5rVnRqPX/LXVAKtNM+XE3M04WZ2lk2QCGma2bFtuznctvBNgN2RXNY5WpmciaN6gYzqmKuRIp04VQi2ZJy8lA4xWCOBIYmckVVXrcDUwz2GLy+oFoAAAAAAAAAAAAAFLF1tQv8AFU+YGJR+1w/7G/MC9jqevjd1bb9l/kDuidJPA10+lNT6/mVNr+QFfjUEznSVPE4rlXu2tbl8ANOf0b8NTPm4GVm3etpYDLmlw1IWpAx/Fat0cv1AYvOktWqJ3Y0y/cDrgJDivDYrlRmqde7fkB5RJVU0rpFppHqrVROyvP8AQCGCSejnSR0aovR6Kl0A3JKp6UzZ44XPV3/Hsv1Az0lelUtR6BJnXlra/XugVK2pfVzo7JlXuozcDQhrqqKJkaUUlmIic/8AyBPQVNZK97Z4VYm7XWt+moF0AAAAAAAAAAAAM/G1tRp4vT6gY1N7RF+dvzA08eT+y78yfICeWJZ8Ma2k0aqJ2fBN087gZ/Gw1uj6V2dNHdpd/wBwNOd9OmGo50d4crPV35aW1AzH1
WH8B0bKZW5tUW/Pz1A9xNkaNp5Gtssjczl6rZAPVl42LcSBe93FXrk+4Fpz8Yb3nRtvtewGbW+ko9rJ3Z3Il0W99wPo40tG1OiIBnYniSRosMK3kXRzk5fyBWpcHklgV714bl/tp9wOW1lfRScOXtInuu6eCga1JXQVLewtnJuxdwLAAAAAAAAAAAAAUsVp5ZqdEjS+VcyonkBho/hPujLPavvarcDTxWRs9DDM3ZXfRfsBWosSWmp3x2zOVbs6eIFOSR8j1e9buduoGo6qSow9KWGN75MrGqqJp2bc/wBAIocEqHayuSNOm6/YC9UYW2ZkLOIqJE3LtuBImHU6VPpCXR3JqWy7W2AlnpoZ2ZZUunIDLiwSXi+scnDRfNVQDUqY3yQPYx2Rypo4CrSYTDCqPk9ZJ8EAvgQ1NLFUR5JE8l5oBDh+HtpWqq9qR3veAFwAAAAAAAAAAAAAFepoaeoTtt7XJ6bgZ02H1UdO+BqcViuRzFTrst0A4hwSodrK5I06br9gNCHCaOPdvEXq77bAW0RESyJZOiAegAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD//2Q==" data-src="http://pic1.ymatou.com/G02/shangou/M00/F2/B5/CgvUBVoWivWAIdQWAAGiFEV-j2Y000_1_1_n_w_l.jpg">
# <i class="product-icon pi-tuan"></i>
#
#
# </a>
# <p class="price"><em class="unit">¥</em>219
# <span class="type">
# <!--tariffType 0:卖家交税 1.买家交税 IsFreeShipping 包邮 -->
# <em>包邮包税</em>
# <!-- <em class="sales">买手促销</em></span>-->
# </p>
# <p class="name"><a href="//www.ymatou.com/product/a99b2c0f-1b6a-44a4-917f-8f1e8c8d2e0c.html" target="_blank">Chanel香奈儿丝绒唇釉 雾面短管唇釉6ml 140色</a></p>
# <div class="seller-site">
# <a class="seller sellerinfo" href="//www.ymatou.com/sellerhome/15206250">
# <span class="avatar"><img src="http://pic1.ymatou.com/G02/M07/9F/DC/CgvUBFg0Bb-ALwKNAAAaZUTYnCc953_1_1_o.jpg" alt="加号全球购">
# <!--{"LevelId":1,"LevelName":"Top"},{"LevelId":2,"LevelName":"Pro"},{"LevelId":3,"LevelName":"Semi-Pro"}-->
# <em class="seller-type "><i class="home-icon hi-type-small"></i></em>
# </span>
# <span class="txt">加号全球购</span>
# </a>
# <a class="site" href="javascript:void(0)">
# <span class="avatar"><img src="http://img.ymatou.com/app/flag/circle/Japan.png" alt="Japan"></span>
# <span class="txt">Japan</span>
# </a>
# <div class="seller-info-wrap">
# <i class="home-icon hi-arrow-small"></i>
# <div class="siw-hd">
# <!--超级买手 meiyou 中级middle-type 初级 low-type -->
# <!--{"LevelId":1,"LevelName":"Top"},{"LevelId":2,"LevelName":"Pro"},{"LevelId":3,"LevelName":"Semi-Pro"}-->
# <span class="siw-type "><i class="home-icon hi-seller-type"></i>专业买手</span>
#
# <!--<span class="fans">已被<em>87861</em>人关注</span>-->
# </div>
# <div class="siw-bd">
# <span class="siw-l">买家评分<br/>
# <em class="score">4.8</em>
# </span>
# <ul class="siw-r">
# <li class="siw-item">客户服务
# <em class="score">4.9</em>
# </li>
# <li class="siw-item">物流服务
# <em class="score">5.0</em>
# </li>
# <li class="siw-item">综合评分
# <em class="score">4.9</em>
# <span class="tips">
# <em class="arrow-left"></em><em class="txt">高于平均15.5%</em>
# </span>
#
# </li>
# </ul>
# </div>
# </div>
# </div>
# </li>
if __name__ == '__main__':
    # Ask for a keyword, scrape the matching products, echo each result and
    # save all of them to "<keyword>洋码头价格信息.txt".
    something = input('请输入要搜索的东西')
    Info = getInfo(something)
    # The encoding is pinned because the file holds Chinese text: with the
    # platform default codec, a title containing a character outside that
    # codec would raise UnicodeEncodeError part-way through the file.
    with open('%s洋码头价格信息.txt' % something, 'w', encoding='utf-8') as f:
        for item in Info:
            print(item)
            title, price = item[0], item[1]
            f.write('商品名称:%s: 商品单价:%s' % (title, price) + '\n')
运行结果如图:
保存文档如图。
就这样啦。