P_010. ~Slow and Steady~ Crawling All Douban Movie Data with Python's Scrapy Framework
A few words before we start:
I'm a Java programmer who has only just dropped into the big-data pit, and a crawler is my first project. The details don't need repeating: after a lot of struggling I finally gave up on writing the crawler in Java and switched to Python, and for a Python crawler there is no getting around Scrapy, a genuinely brilliant framework.
Setting up the environment and installing the various packages is something every first-time newcomer like me has to go through, painful but rewarding. In the end I gave up on Python 2.7 and went with 3.5; after all, mastering something new always brings a sense of achievement.
Listening to Douban music while watching the crawler slowly work its way through Douban movie data: no rush, no excitement, just relaxation, the kind of complete relaxation only a techie knows.
See the screenshots for proof:
About the IP proxy pool:
I had heard that Douban bans IPs, so the first thing I did was look for proxyPool-style projects. I tried two approaches in total.
The first was to scrape free proxies from domestic high-anonymity proxy sites, generate a proxy_list.json, copy that file into the project root, and have every Request pick a random IP from the JSON file. A fine idea in principle, but can free proxies really be trusted? Once I had read through the code I gave up on it; a whole morning of back-and-forth fiddling came to nothing.
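The idea itself is easy to sketch. Below is a minimal downloader middleware along those lines; this is my own illustration, not the original project's code. The file name proxy_list.json comes from the description above, but the flat-list JSON format and the class name are assumptions. It would be enabled through DOWNLOADER_MIDDLEWARES in settings.py.

import json
import random


class RandomProxyMiddleware(object):
    """Pick a random proxy from proxy_list.json for every outgoing request."""

    def __init__(self):
        # Assumed format: a flat JSON list of proxy URLs such as "http://1.2.3.4:8080"
        with open('proxy_list.json', 'r') as f:
            self.proxies = json.load(f)

    def process_request(self, request, spider):
        if self.proxies:
            # Scrapy's built-in HttpProxyMiddleware reads request.meta['proxy']
            request.meta['proxy'] = random.choice(self.proxies)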
The second was similar: ProxyPool-master, a modestly well-known project on GitHub. It also scrapes free proxies from the big free-proxy sites, stores them in Redis, and serves them over HTTP; visiting http://127.0.0.1:5000/random in a local browser returns one proxy. The scoring scheme is worth learning from: each proxy enters the store with a score of 10, a successful asynchronous check bumps it to 100, each failure counts it down from 10, and at 0 the proxy is removed from the store. But it still could not escape the curse of free proxies, so in the end I gave that up as well.
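For completeness, this is roughly how such a pool would be wired into Scrapy. Again this is my own sketch, not code from ProxyPool-master, and it assumes the /random endpoint returns a bare ip:port string.

import requests


class ProxyPoolMiddleware(object):
    """Fetch one proxy per request from the local ProxyPool HTTP service."""

    PROXY_API = 'http://127.0.0.1:5000/random'

    def process_request(self, request, spider):
        try:
            proxy = requests.get(self.PROXY_API, timeout=3).text.strip()
            if proxy:
                request.meta['proxy'] = 'http://' + proxy
        except requests.RequestException:
            # If the pool is unreachable, fall back to a direct connection
            pass

A blocking requests call inside a downloader middleware is not great for throughput, but it is enough to show how the /random endpoint would be consumed.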
The slow-and-steady crawler:
Project structure and layout
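The original post shows the layout as a screenshot. Roughly, and assuming the standard skeleton generated by scrapy startproject douban plus the files listed below, it looks like this (export_data/ is created by the export command at the end):

douban/
├── scrapy.cfg
├── export_data/
└── douban/
    ├── __init__.py
    ├── items.py
    ├── excelexport.py
    ├── pipelines.py
    ├── middlewares.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── movie_hot.py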
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'douban (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 20

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban.middlewares.DoubanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douban.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'douban.pipelines.DoubanPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Export to an Excel file
FEED_EXPORTERS = {'excel': 'douban.excelexport.ExcelItemExporter'}

# Column order of the exported document
FEED_EXPORT_FIELDS = ['title', 'year', 'score', 'alias', 'commentCount', 'director',
                      'writer', 'performer', 'categories', 'website', 'area', 'language',
                      'pub', 'time', 'imdb', 'start', 'better', 'image', 'description']

items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # Fields outside the info block
    title = scrapy.Field()
    year = scrapy.Field()
    score = scrapy.Field()
    commentCount = scrapy.Field()
    start = scrapy.Field()
    better = scrapy.Field()
    image = scrapy.Field()
    description = scrapy.Field()

    # Fields from the info block
    director = scrapy.Field()
    writer = scrapy.Field()
    performer = scrapy.Field()
    categories = scrapy.Field()
    website = scrapy.Field()
    area = scrapy.Field()
    language = scrapy.Field()
    pub = scrapy.Field()
    time = scrapy.Field()
    alias = scrapy.Field()
    imdb = scrapy.Field()
excelexport.py
from scrapy.exporters import BaseItemExporter
import xlwt


class ExcelItemExporter(BaseItemExporter):
    """Write scraped items into an .xls workbook, one item per row."""

    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file
        self.workbook = xlwt.Workbook()
        self.worksheet = self.workbook.add_sheet('scrapy')
        self.row = 0

    def finish_exporting(self):
        self.workbook.save(self.file)

    def export_item(self, item):
        fields = self._get_serialized_fields(item)
        for col, v in enumerate(x for _, x in fields):
            self.worksheet.write(self.row, col, v)
        self.row += 1
The spider: movie_hot.py
# -*- coding: utf-8 -*-
import scrapy
import json
import re
import time

from douban.items import DoubanItem


class MovieHotSpider(scrapy.Spider):
    name = "movie_hot"
    # allowed_domains takes bare domains, not full URLs
    allowed_domains = ["movie.douban.com"]

    # Template for the Douban movie list API
    BASE_URL = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%s&sort=recommend&page_limit=%s&page_start=%s'
    MOVIE_TAG = '最新'   # Douban tag "latest"
    PAGE_LIMIT = 20
    page_start = 0

    domains = BASE_URL % (MOVIE_TAG, PAGE_LIMIT, page_start)

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip,deflate,br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "Host": "movie.douban.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36"
    }

    # The crawl starts here
    def start_requests(self):
        print('~~~~ crawling list page: ' + self.domains)
        yield scrapy.Request(
            url=self.domains,
            headers=self.headers,
            callback=self.request_movies
        )

    # Parse a list page
    def request_movies(self, response):
        # The list API returns JSON
        infos = json.loads(response.text)

        # Iterate over the movie entries
        for movie_info in infos['subjects']:
            print('~~~ crawling movie: ' + movie_info['title'] + '/' + movie_info['rate'])
            # Request the movie's detail page
            yield scrapy.Request(
                url=str(movie_info['url']),
                headers=self.headers,
                callback=self.request_movie,
                dont_filter=True
            )

        # If the JSON result contains fewer movies than requested, there are no
        # more pages; otherwise keep paging
        if len(infos['subjects']) == self.PAGE_LIMIT:
            self.page_start += self.PAGE_LIMIT
            url = self.BASE_URL % (self.MOVIE_TAG, self.PAGE_LIMIT, self.page_start)
            time.sleep(5)
            print('~~~~ crawling list page: ' + url)
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                callback=self.request_movies,
                dont_filter=True
            )

    # Parse a detail page
    def request_movie(self, response):
        # Assemble the item
        movie_item = DoubanItem()

        # Data outside the info block
        movie_item['title'] = response.css('div#content>h1>span:nth-child(1)::text').extract_first()
        movie_item['year'] = response.css('div#content>h1>span.year::text').extract_first()[1:-1]
        movie_item['score'] = response.css('strong.rating_num::text').extract_first()
        movie_item['commentCount'] = response.css('div.rating_sum>a.rating_people>span::text').extract_first()
        movie_item['start'] = '/'.join(response.css('span.rating_per::text').extract())
        movie_item['better'] = '/'.join(response.css('div.rating_betterthan>a::text').extract())
        movie_item['description'] = response.css('#link-report>span::text').extract_first().strip()
        movie_item['image'] = response.css('#mainpic>a>img::attr(src)').extract_first()

        # Get the whole info block as one string
        info = response.css('div.subject div#info').xpath('string(.)').extract_first()
        # Extract all field labels
        fields = [s.strip().replace(':', '') for s in response.css('div#info span.pl::text').extract()]
        # Extract all field values
        values = [re.sub(r'\s+', '', s.strip()) for s in re.split(r'\s*(?:%s):\s*' % '|'.join(fields), info)][1:]

        # Map the Chinese labels to item field names
        label_map = {
            '导演': 'director', '编剧': 'writer', '主演': 'performer',
            '类型': 'categories', '官方网站': 'website', '制片国家/地区': 'area',
            '语言': 'language', '上映日期': 'pub', '片长': 'time',
            '又名': 'alias', 'IMDb链接': 'imdb',
        }
        fields = [label_map.get(f, f) for f in fields]

        # Fill the item with the info-block data
        movie_item.update(dict(zip(fields, values)))

        # Fill in missing fields with a placeholder
        for field in ('director', 'writer', 'performer', 'categories', 'website', 'area',
                      'language', 'pub', 'time', 'alias', 'imdb'):
            if field not in movie_item.keys():
                movie_item[field] = '/'

        print('~ finished movie: ' + movie_item['title'] + '/' + movie_item['score'])
        yield movie_item

Finally, run the spider: not with the IDE's Run button, but by typing the following in the Terminal:
scrapy crawl movie_hot -t excel -o export_data/%(name)s/%(time)s.xls --nolog
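In the output path, %(name)s and %(time)s are Scrapy's feed-URI placeholders: they expand to the spider name (movie_hot) and the time the crawl started, so every run writes a fresh .xls file under export_data/movie_hot/. The -t excel switch selects the custom exporter registered under FEED_EXPORTERS in settings.py.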
As shown in the screenshot: