python scrapy 爬取整本小说
点击小说网址
通过对页面链接结构的分析
开始代码:
创建scrapy,在pycharm 右下角打开Terminal
输入scrapy startproject xiaoshuo 回车
然后输入 cd xiaoshuo 回车
最后输入 scrapy genspider biquge biquge.info 回车(genspider 的参数是域名;生成后再把 start_urls 改成 https://www.biquge.info/52_52968/)
items.py(部分)
import scrapy
class XiaoshuoItem(scrapy.Item):
    """Item holding one scraped chapter of the novel."""
    # define the fields for your item here like:
    content = scrapy.Field()  # the chapter's body text, written out by the pipeline
    request = scrapy.Field()  # declared but not populated by the spider below
settings.py(部分)
# Route every yielded item through the project's single pipeline;
# 300 is its priority in the pipeline chain (lower numbers run first).
ITEM_PIPELINES = {
    'xiaoshuo.pipelines.XiaoshuoPipeline': 300,
}
BOT_NAME = 'xiaoshuo'
SPIDER_MODULES = ['xiaoshuo.spiders']
NEWSPIDER_MODULE = 'xiaoshuo.spiders'
# Suppress INFO-level noise so only warnings and errors are printed.
LOG_LEVEL = 'WARNING'
# Desktop Chrome UA so the site serves the normal HTML pages.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
# NOTE(review): ignoring robots.txt is deliberate here; confirm the site permits scraping.
ROBOTSTXT_OBEY = False
pipelines.py(部分)
class XiaoshuoPipeline(object):
    """Append each scraped chapter's text to a single output file."""

    def __init__(self):
        # 'a+' appends so re-runs keep earlier chapters; utf-8 for Chinese text.
        self.file = open('G://许你万丈光芒好(小说).txt', 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        """Write the chapter body to the file and pass the item on unchanged."""
        self.file.write(item['content'])
        return item

    def close_spider(self, spider):
        # Fix: the original never closed the handle, risking loss of the
        # last buffered writes; Scrapy calls this hook when the crawl ends.
        self.file.close()
spiders/biquge.py(部分)
import scrapy
from scrapy.selector import Selector
class BiqugeSpider(scrapy.Spider):
    """Crawl a novel's chapter index on biquge.info and yield each chapter's text."""
    name = 'biquge'
    allowed_domains = ['biquge.info']
    start_urls = ['https://www.biquge.info/52_52968/']

    def parse(self, response):
        """Parse the index page: one <dd> per chapter, each wrapping a link."""
        # //*[@id="list"]/dl/dd[1]
        # A Response has .xpath() itself; no need to wrap it in Selector().
        chapters = response.xpath(
            '/html/body/div[@id="wrapper"]/div[@class="box_con"][2]/div[@id="list"]/dl/dd')
        for chapter in chapters:
            href = chapter.xpath('.//@href').extract_first()
            # Fix: a <dd> without a link yields href=None, and urljoin(None)
            # would raise — skip such entries instead of crashing the crawl.
            if not href:
                continue
            yield scrapy.Request(response.urljoin(href), callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract the chapter body and hand it to the item pipeline."""
        # //*[@id="content"]
        lines = response.xpath('//div[@id="content"]/text()').extract()
        # Fix: removed the leftover debug print of the whole chapter text.
        yield {'content': '\n'.join(lines)}
生活python