怎么用Python写爬虫抓取网页数据
# -*- coding: utf-8 -*-

# Define here the models for your scraped items.
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class AuthorInfo(scrapy.Item):
    """Author of a topic or of a reply."""
    authorName = scrapy.Field()  # author's display name
    authorUrl = scrapy.Field()   # URL of the author's profile page


class ReplyItem(scrapy.Item):
    """A single reply inside a topic."""
    content = scrapy.Field()  # reply body text
    time = scrapy.Field()     # publication time string
    author = scrapy.Field()   # replying user (dict built from AuthorInfo)


class TopicItem(scrapy.Item):
    """A forum topic (original post) together with its replies."""
    title = scrapy.Field()       # topic title
    url = scrapy.Field()         # URL of the topic page
    content = scrapy.Field()     # topic body text
    time = scrapy.Field()        # publication time string
    author = scrapy.Field()      # original poster (dict built from AuthorInfo)
    reply = scrapy.Field()       # list of replies (dicts built from ReplyItem)
    replyCount = scrapy.Field()  # number of replies
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from kiwi.items import TopicItem, AuthorInfo, ReplyItem


class KiwiSpider(CrawlSpider):
    """Crawl a douban.com group: follow topic-list pages and scrape each topic
    (original post plus all replies) into TopicItem objects."""

    name = "kiwi"
    allowed_domains = ["douban.com"]

    # Relative XPath fragments for pulling text/href out of an <a> element.
    anchorTitleXPath = 'a/text()'
    anchorHrefXPath = 'a/@href'

    # NOTE: the original start URL pointed at a non-douban host, which the
    # allowed_domains offsite filter would have dropped; it must be on douban.com.
    start_urls = [
        "https://www.douban.com/group/topic/90895393/?start=0",
    ]
    rules = (
        Rule(
            # Topic-list (discussion) pages, paginated via ?start=N.
            LinkExtractor(allow=(r'/group/[^/]+/discussion\?start=\d+',)),
            callback='parse_topic_list',
            follow=True
        ),
        Rule(
            # Topic content page (first page).
            LinkExtractor(allow=(r'/group/topic/\d+/$',)),
            callback='parse_topic_content',
            follow=True
        ),
        Rule(
            # Topic content page (subsequent reply pages).
            LinkExtractor(allow=(r'/group/topic/\d+/\?start=\d+',)),
            callback='parse_topic_content',
            follow=True
        ),
    )

    def parse_topic_content(self, response):
        """Parse one topic detail page into a TopicItem, including its replies."""
        # XPaths for the topic's title, body, post time and author node.
        titleXPath = '//html/head/title/text()'
        contentXPath = '//div[@class="topic-content"]/p/text()'
        timeXPath = '//div[@class="topic-doc"]/h3/span[@class="color-green"]/text()'
        authorXPath = '//div[@class="topic-doc"]/h3/span[@class="from"]'

        item = TopicItem()
        # URL of the page we are on.
        item['url'] = response.url
        # Title comes from the <title> tag.
        titleFragment = Selector(response).xpath(titleXPath)
        item['title'] = str(titleFragment.extract()[0]).strip()

        # Topic body: join all paragraph text nodes with newlines.
        contentFragment = Selector(response).xpath(contentXPath)
        strs = [line.extract().strip() for line in contentFragment]
        item['content'] = '\n'.join(strs)

        # Post time (may be absent on some pages).
        timeFragment = Selector(response).xpath(timeXPath)
        if timeFragment:
            item['time'] = timeFragment[0].extract()

        # Original poster's name and profile URL.
        authorInfo = AuthorInfo()
        authorFragment = Selector(response).xpath(authorXPath)
        if authorFragment:
            authorInfo['authorName'] = authorFragment[0].xpath(self.anchorTitleXPath).extract()[0]
            authorInfo['authorUrl'] = authorFragment[0].xpath(self.anchorHrefXPath).extract()[0]
        item['author'] = dict(authorInfo)

        # XPaths for the reply list; the latter two are relative to each reply root.
        replyRootXPath = r'//div[@class="reply-doc content"]'
        replyTimeXPath = r'div[@class="bg-img-green"]/h4/span[@class="pubtime"]/text()'
        replyAuthorXPath = r'div[@class="bg-img-green"]/h4'

        replies = []
        itemsFragment = Selector(response).xpath(replyRootXPath)
        for replyItemXPath in itemsFragment:
            replyItem = ReplyItem()
            # Reply body text.
            contents = replyItemXPath.xpath('p/text()')
            strs = [line.extract().strip() for line in contents]
            replyItem['content'] = '\n'.join(strs)
            # Reply time (may be absent).
            timeFragment = replyItemXPath.xpath(replyTimeXPath)
            if timeFragment:
                replyItem['time'] = timeFragment[0].extract()
            # Replying user.
            replyAuthorInfo = AuthorInfo()
            authorFragment = replyItemXPath.xpath(replyAuthorXPath)
            if authorFragment:
                replyAuthorInfo['authorName'] = authorFragment[0].xpath(self.anchorTitleXPath).extract()[0]
                replyAuthorInfo['authorUrl'] = authorFragment[0].xpath(self.anchorHrefXPath).extract()[0]
            replyItem['author'] = dict(replyAuthorInfo)
            # Store plain dicts so the item serializes cleanly.
            replies.append(dict(replyItem))

        item['reply'] = replies
        yield item

    def parse_topic_list(self, response):
        """Parse a topic-list page, yielding one TopicItem per table row."""
        # Topic rows: skip the header row of the list table.
        topicRootXPath = r'//table[@class="olt"]/tr[position()>1]'
        # Per-row cells: title link, author cell, reply count, post time.
        titleXPath = r'td[@class="title"]'
        authorXPath = r'td[2]'
        replyCountXPath = r'td[3]/text()'
        timeXPath = r'td[@class="time"]/text()'

        topicsPath = Selector(response).xpath(topicRootXPath)
        for topicItemPath in topicsPath:
            item = TopicItem()
            titlePath = topicItemPath.xpath(titleXPath)
            item['title'] = titlePath.xpath(self.anchorTitleXPath).extract()[0]
            item['url'] = titlePath.xpath(self.anchorHrefXPath).extract()[0]
            # Post time (may be absent).
            timePath = topicItemPath.xpath(timeXPath)
            if timePath:
                item['time'] = timePath[0].extract()
            # Original poster.
            authorPath = topicItemPath.xpath(authorXPath)
            authInfo = AuthorInfo()
            authInfo['authorName'] = authorPath[0].xpath(self.anchorTitleXPath).extract()[0]
            authInfo['authorUrl'] = authorPath[0].xpath(self.anchorHrefXPath).extract()[0]
            item['author'] = dict(authInfo)
            # Reply count as shown in the list.
            replyCountPath = topicItemPath.xpath(replyCountXPath)
            item['replyCount'] = replyCountPath[0].extract()

            # Body is only available on the detail page.
            item['content'] = ''
            yield item

    # The start URL is itself a topic content page, so parse it the same way.
    parse_start_url = parse_topic_content
# -*- coding: utf-8 -*-

import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a random User-Agent on each request."""

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # Pick a random UA; setdefault preserves any UA already set on the request.
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)

    # For more user agent strings see http://useragentstring.com/pages/useragentstring.php
    # BUGFIX: the original list was missing a comma after the first entry, so
    # implicit string concatenation fused the first two strings into one
    # malformed User-Agent (and shortened the list by one).
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
# Register the rotating User-Agent downloader middleware; a low order value (1)
# runs it early in the downloader middleware chain.
DOWNLOADER_MIDDLEWARES = {
    'kiwi.useragentmiddleware.RotateUserAgentMiddleware': 1,
}