Saving data scraped with the Scrapy framework to MongoDB

Using Douban as the example, we crawl the Douban Movie Top 250 chart with the Scrapy framework and save the results to MongoDB.
- First, create the project: `scrapy startproject douban`
- Change into the spiders directory and generate the spider: `scrapy genspider doubanSpider douban.com` (genspider takes a spider name and a start domain)
- Run `tree` to inspect the resulting layout:
```
.
├── douban
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-36.pyc
│   │   └── settings.cpython-36.pyc
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── __pycache__
│       │   └── __init__.cpython-36.pyc
│       └── doubanSpider.py
└── scrapy.cfg
```
Write the items file (`items.py`):
```python
import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # movie title
    title = scrapy.Field()
    # movie rating
    score = scrapy.Field()
    # movie details
    content = scrapy.Field()
    # one-line quote
    info = scrapy.Field()
```
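A quick way to see how the item behaves (a minimal sketch; the import path assumes the `douban` project layout above): fields declared with `scrapy.Field()` are read and written like dictionary keys, which is why the pipeline later converts items with `dict(item)`.

```python
from douban.items import DoubanItem  # import path assumes the project layout above

# Items behave like dicts: declared fields are set and read by key,
# and dict(item) converts the item for storage.
item = DoubanItem()
item['title'] = '肖申克的救赎'
item['score'] = '9.7'
print(dict(item))  # {'title': '肖申克的救赎', 'score': '9.7'}
```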
Write the spider file (`doubanSpider.py`):
```python
# -*- coding: utf-8 -*-
import scrapy

from ..items import DoubanItem


class DoubanspiderSpider(scrapy.Spider):
    name = 'doubanSpider'
    # allowed_domains = ['douban.com']
    start = 0  # offset counter: every +25 moves one page down the chart
    url = 'https://movie.douban.com/top250?start='
    end = '&filter='
    start_urls = [url + str(start) + end]

    def parse(self, response):
        # instantiate the item class
        item = DoubanItem()
        # locate the wrapper div of each movie, then iterate to pull the details
        for each in response.xpath("//div[@class='info']"):
            title = each.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
            info = each.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            item['title'] = title[0]
            item['content'] = content[0]
            item['score'] = score[0]
            item['info'] = info
            yield item

        # the chart has 250 entries in total, 25 per page
        if self.start <= 225:
            self.start += 25
            url = self.url + str(self.start) + self.end
            yield scrapy.Request(url, callback=self.parse)
```
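One caveat in `parse`: indexing with `title[0]` raises an `IndexError` whenever an XPath matches nothing (a few entries in the chart have no quote, for instance, which is presumably why `info` is kept as a whole list). A more defensive pattern, sketched below on a standalone selector, is `extract_first()` with a default:

```python
from scrapy.selector import Selector

html = '<div class="hd"><a><span class="title">肖申克的救赎</span></a></div>'
sel = Selector(text=html)
# extract_first() returns the first match, or the given default when the
# XPath matches nothing, so a missing field cannot raise IndexError.
print(sel.xpath('//span[@class="title"]/text()').extract_first(default=''))   # 肖申克的救赎
print(sel.xpath('//span[@class="inq"]/text()').extract_first(default='N/A'))  # N/A
```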
Configure `settings.py`: enable `ITEM_PIPELINES`, and set the database host, port, database name, and collection name:
```python
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}

MONGODB_HOST = '127.0.0.1'
# port number, 27017 by default
MONGODB_PORT = 27017
# database name
MONGODB_DBNAME = 'Douban'
# collection that stores this data
MONGODB_DOCNAME = 'DouBanMovies'
```
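These are plain project settings, so they can be sanity-checked from a Python shell (a minimal sketch using Scrapy's `get_project_settings`, the same helper the pipeline below relies on):

```python
from scrapy.utils.project import get_project_settings

# Run from the project root so scrapy.cfg is found; custom keys defined
# in settings.py are exposed alongside Scrapy's built-in ones.
settings = get_project_settings()
print(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
print(settings['MONGODB_DBNAME'], settings['MONGODB_DOCNAME'])
```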
Also set a `USER_AGENT` header; without it Douban rejects the requests:
```python
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2'
```
Write the pipelines file (`pipelines.py`):
```python
import pymongo
# scrapy.conf was removed in newer Scrapy; get_project_settings is the
# supported way to read settings outside a crawler
from scrapy.utils.project import get_project_settings


class DoubanPipeline(object):
    def __init__(self):
        # read the host, port, and database name from settings
        settings = get_project_settings()
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']
        # create the database connection
        client = pymongo.MongoClient(host=host, port=port)
        # select the target database
        mdb = client[dbname]
        # select the collection that stores the data
        self.post = mdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        data = dict(item)
        # insert the item into the collection
        self.post.insert_one(data)
        return item
```
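An alternative wiring, if you prefer not to read global settings in `__init__`: Scrapy can hand a pipeline the crawler's settings through a `from_crawler` classmethod, with the connection managed in `open_spider`/`close_spider`. A sketch of that pattern (the class name `MongoPipeline` is my own, not from the tutorial):

```python
import pymongo


class MongoPipeline(object):
    """Sketch of the from_crawler pattern; swap it in via ITEM_PIPELINES."""

    def __init__(self, host, port, dbname, docname):
        self.host, self.port = host, port
        self.dbname, self.docname = dbname, docname

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the running crawler, exposing its settings
        s = crawler.settings
        return cls(s.get('MONGODB_HOST'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DBNAME'), s.get('MONGODB_DOCNAME'))

    def open_spider(self, spider):
        # connect once when the spider starts
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        self.post = self.client[self.dbname][self.docname]

    def close_spider(self, spider):
        # release the connection when the spider finishes
        self.client.close()

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))
        return item
```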
Results: run the spider with `scrapy crawl doubanSpider`, and the movie data is written to MongoDB.
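To verify, query the `DouBanMovies` collection with pymongo (a quick check assuming MongoDB is running on `127.0.0.1:27017` as configured above):

```python
import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
coll = client['Douban']['DouBanMovies']
print(coll.count_documents({}))  # should reach 250 once the crawl finishes
for doc in coll.find().limit(3):
    print(doc['title'], doc['score'])
```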