Learning the Scrapy framework
How Scrapy works
What each component does:
Scrapy Engine: handles communication among the Spider, Item Pipeline, Downloader and Scheduler, passing signals and data between them.
Scheduler: accepts the Requests sent over by the engine, organizes and enqueues them, and hands them back to the engine when it asks for more. In essence, a queue of URLs/requests.
Downloader: downloads every Request sent by the Scrapy Engine and returns the fetched Responses to the engine, which passes them on to the Spider for processing.
Spider: processes all Responses, parses and extracts data from them to fill the Item fields, and submits any follow-up URLs to the engine, which feeds them back into the Scheduler.
Item Pipeline: processes the Items produced by the Spider and performs the post-processing (detailed parsing, filtering, storage, etc.).
Downloader Middlewares: components you can customize to extend the downloading behaviour.
Spider Middlewares: components you can customize to extend and hook into the communication between the engine and the Spider (e.g. the Responses going into the Spider and the Requests coming out of it).
Workflow:
1. The engine takes a URL from the Scheduler to crawl.
2. The engine wraps the URL in a Request and hands it to the Downloader.
3. The Downloader fetches the resource, wraps it in a Response and sends it to the Spider.
4. The Spider parses the Response.
5. Parsed items are handed to the Item Pipeline for further processing.
6. Parsed URLs are handed to the Scheduler to wait for crawling.
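All of these pieces are wired up for you when a project is generated with the Scrapy command-line tool. Assuming the project in these notes was created the usual way (the project name test0409 and the spider name jd match the settings and spider shown later), the commands would be roughly:
scrapy startproject test0409
cd test0409
scrapy genspider jd jd.com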
The directory structure looks like this:
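A freshly generated project named test0409 follows the standard Scrapy layout, roughly:
test0409/
    scrapy.cfg            # project config entry point
    test0409/
        __init__.py
        items.py          # item definitions (section 1)
        middlewares.py    # custom middlewares (section 4)
        pipelines.py      # item pipelines (section 3)
        settings.py       # project settings (section 2)
        spiders/
            __init__.py
            jd.py         # the spider (section 5, created by genspider)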
Next we go through what each .py file is for.
1. items.py - the project's item definition file
An Item defines the structured data fields used to hold the scraped data, similar to models.py in Django.
import scrapy

class Test0409Item(scrapy.Item):
    # define the fields for your item here like:
    b_cate = scrapy.Field()
    m_cate = scrapy.Field()
    s_cate = scrapy.Field()
    s_cate_href = scrapy.Field()
    book_name = scrapy.Field()
    book_href = scrapy.Field()
    book_img = scrapy.Field()
    book_author = scrapy.Field()
    book_price = scrapy.Field()
    book_publisher = scrapy.Field()
    book_publish_date = scrapy.Field()
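A scrapy.Item behaves like a dict whose keys are restricted to the declared fields, so filling it in a spider looks something like this (a quick sketch with made-up values):
item = Test0409Item()
item["book_name"] = "Example Book"
item["book_price"] = "59.00"
# item["isbn"] = "..."  # would raise KeyError: only the fields declared above are allowed
print(dict(item))       # {'book_name': 'Example Book', 'book_price': '59.00'}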
2. settings.py - the project's settings file
All of the crawler's configuration lives here.
BOT_NAME = 'test0409'
SPIDER_MODULES = ['test0409.spiders']
NEWSPIDER_MODULE = 'test0409.spiders'
# User-Agent setting
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
# Whether to obey robots.txt; for scraping it is usually turned off
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Slow the crawl down; the setting below would add a 3-second delay
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Whether to disable cookies; relevant when logging in with cookies
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Spider middlewares
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'test0409.middlewares.Test0409SpiderMiddleware': 543,
#}
# Downloader middlewares
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'test0409.middlewares.Test0409DownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Item pipelines
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'test0409.pipelines.Test0409Pipeline': 300,  # the lower the value, the higher the priority
}
"""
使用日志提醒
"""
LOG_LEVEL = "WARNING" # 去除日志提醒
LOG_FILE = "./log.log" # 日志保存在本地
3. pipelines.py - the project's pipeline file
This is where the scraped data gets processed: written to a database, saved to a file, exported, and so on.
from openpyxl import Workbook

class Test0409Pipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['大分类', '中分类', '小分类', '小分类地址',
                        '书名', '作者', '出版社', '出版日期', '图书图片',
                        '图书地址', '价钱'])

    def process_item(self, item, spider):
        line = [item["b_cate"], item["m_cate"], item["s_cate"], item["s_cate_href"],
                item["book_name"], item["book_author"], item["book_publisher"], item["book_publish_date"],
                item["book_img"], item["book_href"], item["book_price"]]
        self.ws.append(line)
        self.wb.save("0410.xlsx")
        return item
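Saving the workbook inside process_item rewrites the .xlsx file for every single item. A common variant (a sketch, not the original code) builds the workbook in open_spider and saves it once in close_spider, two hooks that Scrapy calls at the start and end of the crawl:
from openpyxl import Workbook

class ExcelOncePipeline(object):
    def open_spider(self, spider):
        # build the workbook once when the crawl starts
        self.wb = Workbook()
        self.ws = self.wb.active

    def process_item(self, item, spider):
        self.ws.append([item.get("book_name"), item.get("book_price")])
        return item

    def close_spider(self, spider):
        # write the file a single time when the crawl ends
        self.wb.save("0410.xlsx")
Like any pipeline, it would still need an entry in ITEM_PIPELINES to take effect.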
4. middlewares.py - custom middlewares
Custom middlewares are where proxies and User-Agents can be set; there are downloader middlewares and spider middlewares. Below are the two key methods of the generated downloader middleware (a sketch that fills in the commented hints follows after the snippet).
class Test0409DownloaderMiddleware(object):
    def process_request(self, request, spider):
        """
        Called for every request that passes through the downloader middleware.
        """
        # random User-Agent:
        # ua = random.choice(spider.settings.get("USER_AGENTS_LIST"))
        # request.headers["User-Agent"] = ua
        # # add a proxy, ideally chosen at random:
        # request.meta["proxy"] = "http://" + redis.random()
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response
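Filling in the commented hints above, a random User-Agent downloader middleware could look like this (a sketch; it assumes the USER_AGENTS_LIST setting from section 2 and is not part of the generated file):
import random

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # pick a different UA for every outgoing request
        ua = random.choice(spider.settings.get("USER_AGENTS_LIST"))
        request.headers["User-Agent"] = ua
        return None
To take effect it would be registered in DOWNLOADER_MIDDLEWARES alongside (or instead of) Test0409DownloaderMiddleware, e.g. 'test0409.middlewares.RandomUserAgentMiddleware': 544.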
5. jd.py - the spider file
The spider is the most important piece of all.
# -*- coding: utf-8 -*-
import scrapy
import json
import copy
import requests
from test0409.settings import USER_AGENT
from test0409.items import Test0409Item

class JdSpider(scrapy.Spider):
    name = 'jd'
    allowed_domains = ['jd.com', 'p.3.cn', 'rms.shop.jd.com']
    start_urls = ['https://book.jd.com/booksort.html']
    headers = {
        "user-agent": USER_AGENT
    }

    def parse(self, response):
        # item = Test0409Item()  # use the structured Item
        item = {}  # a plain dict works too
        dt_list = response.xpath("//div[@class='mc']/dl/dt")
        for dt in dt_list:
            item["b_cate"] = dt.xpath("./a/text()").extract_first()
            em_list = dt.xpath("./following-sibling::dd[1]/em")
            for em in em_list:
                item["s_cate"] = em.xpath("./a/text()").extract_first()
                item["s_cate_href"] = em.xpath("./a/@href").extract_first()
                item["s_cate_href"] = "https:" + item["s_cate_href"]
                yield scrapy.Request(
                    item["s_cate_href"],
                    callback=self.book_page,
                    meta={"item": copy.deepcopy(item)}  # deep copy so concurrent requests don't overwrite the item
                )

    def book_page(self, response):
        item = response.meta["item"]
        li_list = response.xpath("//ul[@class='gl-warp clearfix']/li")
        for li in li_list:
            book_sku = li.xpath("./div/@data-sku").extract_first()
            venderId = li.xpath("./div/@venderid").extract_first()
            item["shop"] = self.book_shop_url(venderId)
            item["price"] = self.book_price_url(book_sku)
            item["book_name"] = li.xpath("./div/div[3]/a/em/text()").extract_first()
            item["author"] = li.xpath("./div/div[@class='p-bookdetails']/span[@class='p-bi-name']//a/@title").extract_first()
            print(item)
        # grab the next page
        next_url = response.xpath("//a[contains(text(), '下一页')]/@href").extract_first()
        # print(next_url)
        if next_url:
            next_url = 'https://list.jd.com/' + next_url
            yield scrapy.Request(
                next_url,
                callback=self.book_page,
                meta={"item": response.meta["item"]}
            )
        # yield item

    def book_price_url(self, book_sku):
        # fetch the price info from the book's SKU
        book_price_url = "https://p.3.cn/prices/mgets?skuIds=J_{}".format(book_sku)
        r = requests.get(book_price_url, headers=self.headers, timeout=5)
        r.raise_for_status()
        return json.loads(r.content.decode())[0]["op"]

    def book_shop_url(self, venderId):
        # fetch the shop info from the vendor id
        book_shop_url = "https://rms.shop.jd.com/json/pop/shopInfo.action?ids={}".format(venderId)
        r = requests.get(book_shop_url, headers=self.headers, timeout=5)
        r.raise_for_status()
        return json.loads(r.content.decode(encoding='gbk'))[0]["name"]
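With everything in place, the crawl is started from the project root; the spider name 'jd' comes from the name attribute above:
scrapy crawl jd
scrapy crawl jd -o books.json   # feed export; note it only writes items that are actually yielded, and `yield item` is commented out above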