淘宝美食(pyspider)

代码仍需优化,可根据需要自行修改(例如添加代理等)。

淘宝美食(pyspider)

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-03-18 22:00:28
# Project: taobao

from pyspider.libs.base_handler import *
from pyquery import PyQuery as py 
import time
import re
# HTTP request headers applied to every fetch through Handler.crawl_config.
# NOTE(review): the cookie below is a hard-coded session value (it carries
# fields such as `_tb_token_`, `cookie2` and `unb`); it presumably came from a
# logged-in browser session and will expire -- confirm, and consider loading
# it from configuration instead of keeping it in source control.
headers={
    'cookie':'tg=0; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; miid=1211858156889448784; t=fbf15d4cffdb3239e45f244b88f5ad8f; cna=BqYKFQstgz0CAXuDhhKY3nhr; v=0; cookie2=18f3687d1a717e5c41b36a6bfbd7a092; _tb_token_=35f05ee95863; unb=3325805238; sg=%E8%B0%8388; _l_g_=Ug%3D%3D; skt=8bf0bac1da4158d8; cookie1=UU6nQ%2FwhxoyfCdcqI5cBOn0yFBTZU9sQRb%2BoMEf5En4%3D; csg=0b47fa54; uc3=vt3=F8dByErV43QH5J0rjTw%3D&id2=UNN%2F6Td8a%2FzdLQ%3D%3D&nk2=oBGYyVctFds9k6CswkVFoqNf&lg2=UIHiLt3xD8xYTw%3D%3D; existShop=MTU1Mjk4MjkzMg%3D%3D; tracknick=%5Cu6BDB%5Cu7237%5Cu7237%5Cu8DDF%5Cu6211%5Cu8BF4%5Cu8981%5Cu4F4E%5Cu8C03; lgc=%5Cu6BDB%5Cu7237%5Cu7237%5Cu8DDF%5Cu6211%5Cu8BF4%5Cu8981%5Cu4F4E%5Cu8C03; _cc_=Vq8l%2BKCLiw%3D%3D; dnk=%5Cu6BDB%5Cu7237%5Cu7237%5Cu8DDF%5Cu6211%5Cu8BF4%5Cu8981%5Cu4F4E%5Cu8C03; _nk_=%5Cu6BDB%5Cu7237%5Cu7237%5Cu8DDF%5Cu6211%5Cu8BF4%5Cu8981%5Cu4F4E%5Cu8C03; cookie17=UNN%2F6Td8a%2FzdLQ%3D%3D; mt=ci=22_1; uc1=cookie14=UoTZ5ixzHzLaBQ%3D%3D&lng=zh_CN&cookie16=Vq8l%2BKCLySLZMFWHxqs8fwqnEw%3D%3D&existShop=false&cookie21=W5iHLLyFe3xm&tag=8&cookie15=U%2BGCWk%2F75gdr5Q%3D%3D&pas=0; enc=%2Faf%2FOsmzEND0%2FqIvlqjHWP2jjwgZND1bTK0ijHKV%2FDLGCj9cwZMxShUq2no%2FUNU2p%2F9bbMx5%2FnDEeVfKkXdCBg%3D%3D; alitrackid=i.taobao.com; lastalitrackid=i.taobao.com; __guid=154677242.3526456759438738400.1552982987295.243; swfstore=222224; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; JSESSIONID=8C53029B298D7E0CD5EAB900A3A22AEE; monitor_count=2; l=bBQ-H3fPvKeTdVTsBOCNqZTdx37OSIRADuWXpuc9i_5Be6L6t3_Olt30nFp6Vj5R_OLB4IFj7Ty9-etkj; isg=BC4udA9s_omXvQp-P7zD65g2f4T6njnduXK_iFj3mjHsO86VwL9COdQx8-dyr-pB; whl=-1%260%260%261552983196424',
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
# Seed search URL handed to the first crawl in Handler.on_start; the `q`
# parameter is the percent-encoded UTF-8 search keyword.
url = 'https://s.taobao.com/search?q=%E9%9B%B6%E9%A3%9F&commend=all&ssid=s5-e&search_type=mall&sourceId=tb.index&area=c2c&spm=a1z02.1.6856637.d4910789'
class Handler(BaseHandler):
    """pyspider handler that crawls search-result pages and their item pages.

    The crawl starts from the module-level ``url``, follows every item link
    found on a result page to ``detail_page``, and follows the next-page link
    (when one is present) back into ``index_page``.
    """

    # Applied to every request issued by this handler.
    crawl_config = {
        'headers': headers,
    }

    # Both patterns run against the serialized HTML of a PyQuery selection.
    # The leading space in the first pattern is kept exactly as originally
    # written.
    # NOTE(review): hrefs captured from serialized markup may still contain
    # HTML entities such as ``&amp;`` -- confirm against real responses.
    _NEXT_PAGE_RE = re.compile(r' <a class="J_Ajax num icon-tag" .*?href="(.*?)"', re.S)
    _ITEM_LINK_RE = re.compile(r'<span class.*?href="(.*?)"', re.S)

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the seed search page; rendered through the JS fetcher."""
        self.crawl(url=url, callback=self.index_page,
                   validate_cert=False, fetch_type="js")

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every item link on a result page, then the next page if any.

        Args:
            response: pyspider response for a search-result page.
        """
        next_urls = self._NEXT_PAGE_RE.findall(str(response.doc('.icon-tag')))
        item_urls = self._ITEM_LINK_RE.findall(str(response.doc('.J_ClickStat')))

        for each in item_urls:
            # Note: SSL certificate verification must be disabled for these fetches.
            self.crawl(each, callback=self.detail_page,
                       validate_cert=False, fetch_type="js")

        # The pattern finds nothing when the page has no matching pager link
        # (e.g. the last page, or a page that failed to render); indexing
        # unconditionally would raise IndexError and fail the whole task.
        if next_urls:
            self.crawl(next_urls[0], callback=self.index_page,
                       validate_cert=False, fetch_type="js")

    @config(priority=2)
    def detail_page(self, response):
        """Extract the result record from an item detail page.

        Args:
            response: pyspider response for an item page.

        Returns:
            dict with the page ``url``, its ``title`` text, and the text of
            the ``.tm-price`` (``price``) and ``.tm-count`` (``sell``) nodes.
        """
        return {
            "url": response.url,
            "title": response.doc('title').text(),
            "price": response.doc('.tm-price').text(),
            "sell": response.doc('.tm-count').text(),
        }