How can I use an Item Pipeline in Scrapy to store scraped items in a database?

Problem description:

I want to use an Item Pipeline to store the items my Scrapy spider scrapes in a database.

This is my spider:

from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from CollecteurImmobilier.items import CollecteurimmobilierItem

class AnnonceSpider(CrawlSpider):
    name = "Annonce"
    allowed_domains = ["tayara.tn"]
    start_urls = ["http://www.tayara.tn/sousse/immobilier-%C3%A0_vendre"]
    # follow the pagination links (?o=<page>) and parse each page
    rules = (Rule(SgmlLinkExtractor(allow=(r'\?o=\d',)), 'parse_start_url', follow=True),)

    def parse_start_url(self, response):
        sel = Selector(response)
        DivAnnonces = sel.xpath('//div[@class="item"]')
        items = []
        for DivAnnonce in DivAnnonces:
            item = CollecteurimmobilierItem()
            # extract() returns a list; keep the first matching href as a string
            links = DivAnnonce.xpath('.//h2/a/@href').extract()
            item['link'] = links[0] if links else None
            items.append(item)
        return items
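For reference, the items.py that the spider imports is not shown; here is a minimal sketch, with the link and title fields assumed from the spider code and the SQL schema below:

from scrapy.item import Item, Field

class CollecteurimmobilierItem(Item):
    # field names assumed from the spider and the AnnonceGratuit table
    link = Field()
    title = Field()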

And this is my pipeline, which stores the scraped items in the database:

from scrapy import log
from twisted.enterprise import adbapi
import MySQLdb

class MySQLStorePipeline(object):

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # build the connection arguments from the project settings
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        # run the db query in the thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item, spider)
        query.addErrback(self._handle_error, item, spider)
        # at the end, return the item whether the query succeeded or failed
        query.addBoth(lambda _: item)
        # return the deferred instead of the item; this makes the engine
        # process the next item (according to the CONCURRENT_ITEMS setting)
        # only after this operation (deferred) has finished
        return query

    def _conditional_insert(self, tx, item, spider):
        # insert the item only if its link is not already in the table
        tx.execute("""
            SELECT * FROM AnnonceGratuit WHERE link = %s
        """, (item['link'],))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute("""
                INSERT INTO AnnonceGratuit (link)
                VALUES (%s)
            """, (item['link'],))
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def _handle_error(self, failure, item, spider):
        """Log errors raised during the db interaction."""
        # do nothing besides logging
        log.err(failure)
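Because the pipeline only logs failures, a quick way to test the database side in isolation is to run the same insert outside Scrapy with a plain MySQLdb connection. A minimal sketch, where host, db, user, and passwd are placeholders for the values in your settings:

import MySQLdb

conn = MySQLdb.connect(host='localhost', db='mydb',
                       user='root', passwd='secret',
                       charset='utf8', use_unicode=True)
cursor = conn.cursor()
cursor.execute("INSERT INTO AnnonceGratuit (link) VALUES (%s)",
               ('http://example.com/test',))
conn.commit()  # a plain connection does not autocommit; without this the row is lost
cursor.execute("SELECT COUNT(*) FROM AnnonceGratuit")
print cursor.fetchone()[0]
conn.close()

If this script fails or the row does not appear, the problem is on the database side rather than in the pipeline. (Inside the pipeline itself, runInteraction commits the transaction automatically when _conditional_insert returns.)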

And this is my mysql.sql:

DROP TABLE IF EXISTS AnnonceGratuit;
CREATE TABLE AnnonceGratuit (
    link VARCHAR(255),
    title VARCHAR(255)
) DEFAULT CHARSET=utf8;
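Note that MySQL requires an explicit length for VARCHAR columns (255 above is a placeholder), so it is worth confirming in the mysql client that the table was actually created, and in the same database the pipeline connects to:

USE mydb;                -- placeholder: your MYSQL_DBNAME value
SHOW TABLES;
DESCRIBE AnnonceGratuit;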

And in my settings I added these lines:

ITEM_PIPELINES = {
    'CollecteurImmobilier.pipelines.MySQLStorePipeline': 300,
}
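The pipeline's from_settings also reads the MySQL connection values, so settings.py has to define them as well; a minimal sketch with placeholder values:

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'mydb'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'secret'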

But when I run my spider like this:

scrapy crawl Annonce -o items.xml -t xml 

here is what my terminal shows: there are no errors, and while the spider runs I see the "Item stored in db" message.

The items.xml output file is produced successfully, but nothing is stored in my database.

Please, can anyone help me?

Try using a DB index to detect duplicates:

def _conditional_insert(self, tx, item, spider):
    try:
        tx.execute("""
            INSERT INTO AnnonceGratuit (link)
            VALUES (%s)
        """, (item['link'],))
        log.msg("Item stored in db: %s" % item, level=log.DEBUG)
    except MySQLdb.IntegrityError:
        # the unique index on link rejected a duplicate row
        log.msg("Item already stored in db: %s" % item, level=log.DEBUG)

This should work if you add a unique index constraint on the link column in your database.
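A minimal sketch of that constraint (the index name uq_link is arbitrary):

ALTER TABLE AnnonceGratuit ADD UNIQUE INDEX uq_link (link);

With this in place, inserting a duplicate link raises MySQLdb.IntegrityError, which the try/except above turns into an "already stored" log message instead of a second row.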