python scrapy 爬取整本小说
点击小说网址
通过对页面链接结构的分析
开始代码:
创建scrapy,在pycharm 右下角打开Terminal
输入scrapy startproject xiaoshuo 回车
然后输入 cd xiaoshuo 回车
最后输入 scrapy genspider biquge biquge.info 回车(genspider 的参数是域名;生成后再把 start_urls 改成 https://www.biquge.info/52_52968/)
items.py(部分)
import scrapy
class XiaoshuoItem(scrapy.Item):
    """Item holding one scraped chapter of the novel."""
    # define the fields for your item here like:
    content = scrapy.Field()  # the chapter's body text, written out by the pipeline
    request = scrapy.Field()  # declared but not populated by the spider below
settings.py(部分)
# Route every yielded item through the project's single pipeline;
# 300 is its priority in the pipeline chain (lower numbers run first).
ITEM_PIPELINES = {
    'xiaoshuo.pipelines.XiaoshuoPipeline': 300,
}
BOT_NAME = 'xiaoshuo'
SPIDER_MODULES = ['xiaoshuo.spiders']
NEWSPIDER_MODULE = 'xiaoshuo.spiders'
# Suppress INFO-level noise so only warnings and errors are printed.
LOG_LEVEL = 'WARNING'
# Desktop Chrome UA so the site serves the normal HTML pages.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
# NOTE(review): ignoring robots.txt is deliberate here; confirm the site permits scraping.
ROBOTSTXT_OBEY = False
pipelines.py(部分)
class XiaoshuoPipeline(object):
    """Append each scraped chapter's text to a single output file."""

    def __init__(self):
        # 'a+' appends so re-runs keep earlier chapters; utf-8 for Chinese text.
        self.file = open('G://许你万丈光芒好(小说).txt', 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        """Write the chapter body to the file and pass the item on unchanged."""
        self.file.write(item['content'])
        return item

    def close_spider(self, spider):
        # Fix: the original never closed the handle, risking loss of the
        # last buffered writes; Scrapy calls this hook when the crawl ends.
        self.file.close()
spiders/biquge.py(部分)
import scrapy
from scrapy.selector import Selector
class BiqugeSpider(scrapy.Spider):
    """Crawl a novel's chapter index on biquge.info and yield each chapter's text."""
    name = 'biquge'
    allowed_domains = ['biquge.info']
    start_urls = ['https://www.biquge.info/52_52968/']

    def parse(self, response):
        """Parse the index page: one <dd> per chapter, each wrapping a link."""
        # //*[@id="list"]/dl/dd[1]
        # A Response has .xpath() itself; no need to wrap it in Selector().
        chapters = response.xpath(
            '/html/body/div[@id="wrapper"]/div[@class="box_con"][2]/div[@id="list"]/dl/dd')
        for chapter in chapters:
            href = chapter.xpath('.//@href').extract_first()
            # Fix: a <dd> without a link yields href=None, and urljoin(None)
            # would raise — skip such entries instead of crashing the crawl.
            if not href:
                continue
            yield scrapy.Request(response.urljoin(href), callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract the chapter body and hand it to the item pipeline."""
        # //*[@id="content"]
        lines = response.xpath('//div[@id="content"]/text()').extract()
        # Fix: removed the leftover debug print of the whole chapter text.
        yield {'content': '\n'.join(lines)}
生活python