A small Scrapy + MySQL project: scraping short videos from Xinpianchang
This project scrapes short videos from Xinpianchang (新片场); the start URL is http://www.xinpianchang.com/channel/index/sort-like
The crawl covers three levels of pages:
1. The list (index) page
2. The video detail page
3. The creator's profile page
The project extracts the important information from all three levels and saves it to a MySQL database. The code shown in this post is mainly the spider class; it appears after the item definitions sketched next.
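The spider imports four Item classes from xpc.items, whose definitions are not reproduced in this post. The sketch below is a reconstruction, not the project's actual file; every field name is taken from how the spider populates the items further down.

# items.py (sketch): field names inferred from the spider code below
import scrapy


class PostItem(scrapy.Item):
    pid = scrapy.Field()            # video article id
    title = scrapy.Field()
    thumbnail = scrapy.Field()      # small image on the list page
    preview = scrapy.Field()        # large preview image on the video page
    video = scrapy.Field()          # video file url
    duration = scrapy.Field()       # length in seconds
    category = scrapy.Field()
    created_at = scrapy.Field()
    play_counts = scrapy.Field()
    like_counts = scrapy.Field()
    description = scrapy.Field()


class CommentItem(scrapy.Item):
    commentid = scrapy.Field()
    pid = scrapy.Field()
    content = scrapy.Field()
    like_counts = scrapy.Field()
    created_at = scrapy.Field()
    cid = scrapy.Field()            # commenter's user id
    uname = scrapy.Field()
    avatar = scrapy.Field()
    reply = scrapy.Field()          # id of the parent comment, if any


class ComposerItem(scrapy.Item):
    cid = scrapy.Field()
    banner = scrapy.Field()
    name = scrapy.Field()
    avatar = scrapy.Field()
    intro = scrapy.Field()
    like_counts = scrapy.Field()
    fans_counts = scrapy.Field()
    follow_counts = scrapy.Field()
    location = scrapy.Field()
    career = scrapy.Field()


class CopyrightItem(scrapy.Item):
    pcid = scrapy.Field()           # composite key "<cid>_<pid>"
    cid = scrapy.Field()
    pid = scrapy.Field()
    roles = scrapy.Field()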
import json

import scrapy
from scrapy import Request

from xpc.items import PostItem, CommentItem, ComposerItem, CopyrightItem


class DiscoverySpider(scrapy.Spider):
    name = 'xpc'
    allowed_domains = ['www.xinpianchang.com']
    start_urls = ['http://www.xinpianchang.com/channel/index/sort-like']
    # URL template for a creator's profile page
    composer_url = 'http://www.xinpianchang.com/u%s?from=articleList'

    def parse(self, response):
        post_list = response.xpath('//ul[@class="video-list"]/li')
        # URL template for each video's detail page (the second level)
        url = 'http://www.xinpianchang.com/a%s'
        # Scrape the list page and request every detail page
        for post in post_list:
            # Each video's data-articleid identifies its detail page
            pid = post.xpath('./@data-articleid').extract_first()
            # Request the detail page; parse_post() scrapes the second level
            request = Request(url % pid, callback=self.parse_post)
            request.meta['pid'] = pid
            # Thumbnail image shown on the list page
            request.meta['thumbnail'] = post.xpath('.//img/@_src').get()
            # Duration string shown on the list page
            request.meta['duration'] = post.xpath('./a/span/text()').get()
            yield request
        # Link to the next list page; following it walks through all pages
        next_page = response.xpath("//div[@class='page']/a[last()]/@href").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_post(self, response):
        # Extract the video's information
        post = PostItem()
        post['pid'] = response.meta['pid']
        # The duration string looks like "3'30", i.e. 3 minutes 30 seconds;
        # convert it to seconds
        duration = response.meta['duration']
        if duration:
            minutes, seconds = [int(i) for i in duration.split("'")[:2]]
            duration = minutes * 60 + seconds
        post['duration'] = duration
        # Thumbnail: the small image from the list page
        post['thumbnail'] = response.meta['thumbnail']
        post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').get()
        # Large preview image on the video page
        post['preview'] = response.xpath('//div[@class="filmplay"]//img/@src').extract_first()
        post['video'] = response.xpath('//video[@id="xpc_video"]/@src').get()
        cates = response.xpath('//span[@class="cate v-center"]/a/text()').extract()
        post['category'] = '-'.join([cate.strip() for cate in cates])
        # Publish time
        post['created_at'] = response.xpath('//span[@class="update-time v-center"]/i/text()').get()
        # Play count
        post['play_counts'] = response.xpath('//i[contains(@class, "play-counts")]/text()').get().replace(',', '')
        # Like count
        post['like_counts'] = response.xpath('//span[contains(@class, "like-counts")]/text()').get()
        # Description
        desc = response.xpath('//p[contains(@class, "desc")]/text()').get()
        post['description'] = desc.strip() if desc else ''
        yield post

        # Copyright info: the creators credited on this video
        creator_list = response.xpath('//div[@class="user-team"]//ul[@class="creator-list"]/li')
        for creator in creator_list:
            cr = CopyrightItem()
            # Creator id
            cr['cid'] = creator.xpath('./a/@data-userid').get()
            cr['pid'] = post['pid']
            cr['pcid'] = '%s_%s' % (cr['cid'], cr['pid'])
            # The same creator can hold different roles in different videos
            cr['roles'] = creator.xpath('./div[@class="creator-info"]/span/text()').get()
            yield cr
            request = Request(self.composer_url % cr['cid'], callback=self.parse_composer)
            request.meta['cid'] = cr['cid']
            yield request

        # Request the comment API for this video
        comment_api = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1' % post['pid']
        yield response.follow(comment_api, callback=self.parse_comment)

    def parse_composer(self, response):
        """Scrape the creator's profile information."""
        composer = ComposerItem()
        # The creator's id
        composer['cid'] = response.meta['cid']
        # Background image of the profile page; the style attribute looks
        # like background-image:url(...), so slice out the url itself
        banner = response.xpath('//div[@class="banner-wrap"]/@style').get()
        composer['banner'] = banner[21:-1]
        # The creator's user name
        composer['name'] = response.xpath('//p[contains(@class, "creator-name")]/text()').get()
        # The creator's avatar
        composer['avatar'] = response.xpath('//span[@class="avator-wrap-s"]/img/@src').get()
        # The creator's self-introduction
        composer['intro'] = response.xpath('//p[contains(@class, "creator-desc")]/text()').get()
        # Number of people who liked this creator
        composer['like_counts'] = response.xpath('//span[contains(@class, "like-counts")]/text()').get().replace(',', '')
        # Number of fans
        composer['fans_counts'] = response.xpath('//span[contains(@class, "fans-counts")]/@data-counts').get()
        # Number of people this creator follows
        composer['follow_counts'] = response.xpath('//span[@class="follow-wrap"]/span[2]/text()').get()
        # Location
        composer['location'] = response.xpath('//span[contains(@class, "icon-location")]/following-sibling::span[1]/text()').get()
        # Career
        composer['career'] = response.xpath('//span[contains(@class, "icon-career")]/following-sibling::span[1]/text()').get()
        yield composer

    def parse_comment(self, response):
        """Scrape the comments returned by the comment API."""
        # The API responds with JSON
        result = json.loads(response.text)
        next_page = result['data']['next_page_url']
        if next_page:
            yield response.follow(next_page, callback=self.parse_comment)
        comments = result['data']['list']
        for c in comments:
            comment = CommentItem()
            # Comment id
            comment['commentid'] = c['commentid']
            comment['pid'] = c['articleid']
            # Comment body
            comment['content'] = c['content']
            # Number of likes on this comment
            comment['like_counts'] = c['count_approve']
            comment['created_at'] = c['addtime']
            comment['cid'] = c['userInfo']['userid']
            comment['uname'] = c['userInfo']['username']
            comment['avatar'] = c['userInfo']['face']
            # If this comment is a reply, record the id of its parent comment
            if c['reply']:
                comment['reply'] = c['reply']['commentid']
            yield comment
            # Also scrape the commenter's profile
            request = Request(self.composer_url % comment['cid'], callback=self.parse_composer)
            request.meta['cid'] = comment['cid']
            yield request
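The post shows only the spider; persistence to MySQL happens in the project's pipeline, which is included in the download link below but not printed here. The following is a minimal sketch of such a pipeline with pymysql — the table names and the one-table-per-item mapping are illustrative assumptions, not the project's actual db.sql schema.

# pipelines.py (sketch): writes items to MySQL via pymysql; table and
# column names are assumptions for illustration, not the real schema
import pymysql


class MysqlPipeline:
    def open_spider(self, spider):
        # Connection settings would normally be read from settings.py
        self.conn = pymysql.connect(
            host='localhost', port=3306, user='root',
            password='secret', database='xpc', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        # Assumes one table per item class, e.g. PostItem -> posts
        table = {
            'PostItem': 'posts',
            'CommentItem': 'comments',
            'ComposerItem': 'composers',
            'CopyrightItem': 'copyrights',
        }[type(item).__name__]
        # Build the statement from the fields this item actually has
        keys = list(item.keys())
        values = [item[k] for k in keys]
        sql = 'REPLACE INTO %s (%s) VALUES (%s)' % (
            table, ', '.join(keys), ', '.join(['%s'] * len(keys)))
        self.cursor.execute(sql, values)
        self.conn.commit()
        return item

The pipeline would be enabled through ITEM_PIPELINES in settings.py. The sketch uses REPLACE INTO rather than INSERT so that rows scraped again on a re-run overwrite their earlier versions instead of raising duplicate-key errors.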
The complete implementation can be downloaded here:
Full project code: https://pan.baidu.com/s/1vhbbZk0nWsPMF-6VLZ7ksg (password: tiju)
Database schema (db.sql): https://pan.baidu.com/s/1nrutxRlziCGI6CXyEmdI4Q (password: nonz)
To run the project, update the database configuration accordingly and execute the provided SQL file against your database. The Scrapy framework and the pymysql third-party library must be installed first; installation works as follows:
From the command line (cmd):
pip install scrapy
pip install pymysql
In PyCharm:
File -> Settings -> Project: <project name> -> Project Interpreter, click the + button, type the name of the package, and click Install Package.
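For context, the database configuration mentioned above would normally live in the project's settings.py. The setting names below are illustrative assumptions and must match whatever the project's pipeline actually reads:

# settings.py (sketch): setting names are assumptions for illustration;
# adjust them to match what the project's pipeline expects
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'your-password'
MYSQL_DB = 'xpc'

ITEM_PIPELINES = {
    'xpc.pipelines.MysqlPipeline': 300,
}

Once the tables from db.sql exist and the configuration is in place, start the crawler from the project root with: scrapy crawl xpc (xpc being the name declared in the spider class).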