How do I use an Item Pipeline in Scrapy to store scraped items in a database?
Question:
I want to use an Item Pipeline to store the items scraped by Scrapy in a MySQL database.
This is my spider:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.utils.python import unicode_to_str
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.exceptions import ScrapyDeprecationWarning
from CollecteurImmobilier.items import CollecteurimmobilierItem


class AnnonceSpider(CrawlSpider):
    name = "Annonce"
    allowed_domains = ["tayara.tn"]
    start_urls = ["http://www.tayara.tn/sousse/immobilier-%C3%A0_vendre"]
    rules = (Rule(SgmlLinkExtractor(allow=('\\?o=\\d')), 'parse_start_url', follow=True),)

    def parse_start_url(self, response):
        sel = Selector(response)
        DivAnnonces = sel.xpath('//div[@class="item"]')
        items = []
        for DivAnnonce in DivAnnonces:
            item = CollecteurimmobilierItem()
            item['link'] = DivAnnonce.xpath('.//h2/a/@href').extract()
            titres = item['link']
            items.append(item)
        return items
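(For reference, CollecteurimmobilierItem is imported from CollecteurImmobilier.items, whose contents are not shown in this post. A minimal sketch of such an item class, assuming only the two fields used elsewhere in the post, would be:)

from scrapy.item import Item, Field

class CollecteurimmobilierItem(Item):
    # 'link' is the only field the pipeline below actually stores;
    # 'title' matches the column in the table definition further down
    link = Field()
    title = Field()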
And this is my pipeline that stores the scraped items in the database:
from datetime import datetime
from hashlib import md5
from scrapy import log
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi
import sys
import MySQLdb
import hashlib
from scrapy.http import Request


class MySQLStorePipeline(object):

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        # run db query in the thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item, spider)
        query.addErrback(self._handle_error, item, spider)
        # at the end return the item in case of success or failure
        query.addBoth(lambda _: item)
        # return the deferred instead of the item. This makes the engine
        # process the next item (according to the CONCURRENT_ITEMS setting)
        # after this operation (deferred) has finished.
        return query

    def _conditional_insert(self, tx, item, spider):
        tx.execute("""
            SELECT * FROM AnnonceGratuit WHERE link = %s
        """, (item['link']))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute("""
                INSERT INTO AnnonceGratuit (link)
                VALUES (%s)
            """, (item['link']))
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def _handle_error(self, failure, item, spider):
        """Handle an error that occurred during the db interaction."""
        # do nothing, just log
        log.err(failure)
And this is my mysql.sql:
DROP TABLE IF EXISTS AnnonceGratuit;
CREATE TABLE AnnonceGratuit (
    link VARCHAR(255),
    title VARCHAR(255)
) DEFAULT CHARSET=utf8;
And in my settings.py I added these lines:
ITEM_PIPELINES = {
    'CollecteurImmobilier.pipelines.MySQLStorePipeline': 300,
}
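(Note: the pipeline's from_settings method also reads the MySQL connection parameters from the settings. A minimal sketch with placeholder values, assuming the same setting names used in the pipeline above:)

# MySQL connection settings read by MySQLStorePipeline.from_settings()
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'collecteur_immobilier'   # placeholder database name
MYSQL_USER = 'scrapy_user'               # placeholder user
MYSQL_PASSWD = 'secret'                  # placeholder password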
But when I run my spider like this:
scrapy crawl Annonce -o items.xml -t xml
there is no error in my terminal, and while the spider runs I see the "Item stored in db" message. The items.xml output file is generated successfully, but nothing is stored in my database. Can anyone help me? Thanks.
Answer:
Try using a DB index to detect duplicates:
def _conditional_insert(self, tx, item, spider):
    try:
        tx.execute("""
            INSERT INTO AnnonceGratuit (link)
            VALUES (%s)
        """, (item['link']))
        log.msg("Item stored in db: %s" % item, level=log.DEBUG)
    except:
        log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
This should work if you add a unique index constraint on the link column in your database.
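For example, assuming the AnnonceGratuit table from the question, the unique constraint could be added with something like the following (the index name idx_link is just illustrative; the column needs a fixed length such as VARCHAR(255) to be indexable in MySQL):

ALTER TABLE AnnonceGratuit ADD UNIQUE INDEX idx_link (link);

With that constraint in place, inserting the same link a second time raises an IntegrityError, which the try/except above turns into the "Item already stored in db" log message.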