Scrapy+Mysql实现的一个小项目,爬取新片场中的小视频的内容

本项目爬取的是新片场的小视频,起始的url为: http://www.xinpianchang.com/channel/index/sort-like

总共有三层页面,分别是:

1、首页面,如下:

Scrapy+Mysql实现的一个小项目,爬取新片场中的小视频的内容

2、视频的详细页面,如下:

Scrapy+Mysql实现的一个小项目,爬取新片场中的小视频的内容

3、创作者的页面,如下:

Scrapy+Mysql实现的一个小项目,爬取新片场中的小视频的内容

该项目实现的就是把这三层页面中重要的信息都爬取下来,然后保存到mysql数据库,代码如下(呈现的主要是spider类中的代码):

import json
import scrapy
from scrapy import Request
from xpc.items import PostItem,CommentItem,ComposerItem,CopyrightItem


class DiscoverySpider(scrapy.Spider):
    """Crawl short videos from xinpianchang.com across three page levels.

    Level 1: the "sort by likes" listing page (start_urls) — one detail
             request is scheduled per video, plus the next listing page.
    Level 2: each video detail page — yields a PostItem, one CopyrightItem
             per credited creator (plus a profile request for each), and a
             request to the comment JSON API.
    Level 3: creator profile pages yield ComposerItem; the paginated
             comment API yields CommentItem objects.
    """

    name = 'xpc'
    allowed_domains = ['www.xinpianchang.com']
    start_urls = ['http://www.xinpianchang.com/channel/index/sort-like']

    # URL template for a creator's profile page; %s is the creator id.
    composer_url = 'http://www.xinpianchang.com/u%s?from=articleList'

    def parse(self, response):
        """Parse a listing page: schedule one detail request per video,
        then follow the pagination link so every page is crawled."""
        post_list = response.xpath('//ul[@class="video-list"]/li')
        # URL template for a video's detail page; %s is the article id.
        url = 'http://www.xinpianchang.com/a%s'
        for post in post_list:
            pid = post.xpath('./@data-articleid').extract_first()
            # Request level 2; parse_post() extracts the detail fields.
            request = Request(url % pid, callback=self.parse_post)
            request.meta['pid'] = pid
            # Thumbnail image shown on the listing page.
            request.meta['thumbnail'] = post.xpath('.//img/@_src').get()
            # Duration string shown on the listing page (e.g. "3'30").
            request.meta['duration'] = post.xpath('./a/span/text()').get()
            yield request
        # Follow the "next page" link, if present.
        next_page = response.xpath("//div[@class='page']/a[last()]/@href").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_post(self, response):
        """Parse a video detail page into a PostItem, emit one
        CopyrightItem per credited creator, and schedule the creator
        profile and comment-API follow-up requests."""
        post = PostItem()
        post['pid'] = response.meta['pid']
        # BUG FIX: this assignment had been merged into the comment line
        # above it, leaving `duration` undefined (NameError).
        duration = response.meta['duration']
        # Convert "minutes'seconds" (e.g. "3'30") to total seconds.
        # BUG FIX: the original deleted the apostrophe before splitting,
        # collapsing "3'30" into the single token "330" and raising
        # IndexError below; replacing it with a space yields two parts.
        if duration:
            parts = [int(p) for p in duration.replace("'", " ").split()]
            duration = parts[0] * 60 + parts[1] if len(parts) > 1 else parts[0]
        post['duration'] = duration
        # Small thumbnail carried over from the listing page.
        post['thumbnail'] = response.meta['thumbnail']
        post['title'] = response.xpath('//div\
        [@class="title-wrap"]/h3/text()').get()
        # Large preview image on the video page.
        post['preview'] = response.xpath(
            '//div[@class="filmplay"]//img/@src').extract_first()
        post['video'] = response.xpath('//video[@id="xpc_video"]/@src').get()
        cates = response.xpath('//span[@class="cate v-center"]/a/text()').extract()
        post['category'] = '-'.join([cate.strip() for cate in cates])
        # Publication time.
        post['created_at'] = response.xpath('//span[@class="update-time v-center"]/i/text()').get()
        # Play count; guard against a missing node before stripping commas.
        play_counts = response.xpath('//i[contains(@class, "play-counts")]/text()').get()
        post['play_counts'] = play_counts.replace(",", "") if play_counts else play_counts
        # Like count.
        post['like_counts'] = response.xpath('//span[contains(@class, "like-counts")]/text()').get()
        # Description text.
        desc = response.xpath('//p[contains(@class, "desc")]/text()').get()
        post['description'] = desc.strip() if desc else ''
        yield post
        # Credited creators: one CopyrightItem each, plus a profile request.
        creator_list = response.xpath('//div[@class="user-team"]//ul[@class="creator-list"]/li')
        for creator in creator_list:
            cr = CopyrightItem()
            # Creator id.
            cr['cid'] = creator.xpath('./a/@data-userid').get()
            cr['pid'] = post['pid']
            cr['pcid'] = '%s_%s' % (cr['cid'], cr['pid'])
            # A creator may play a different role in each film.
            cr['roles'] = creator.xpath('./div[@class="creator-info"]/span/text()').get()
            yield cr
            request = Request(self.composer_url % cr['cid'], callback=self.parse_composer)
            request.meta['cid'] = cr['cid']
            yield request
        # First page of the comment API for this video.
        comment_api = 'http://www.xinpianchang.com/article/filmplay/ts-getCommentApi?id=%s&ajax=0&page=1' % post['pid']
        yield response.follow(comment_api, callback=self.parse_comment)

    def parse_composer(self, response):
        """Parse a creator profile page into a ComposerItem."""
        composer = ComposerItem()
        # Creator id carried over via request.meta.
        composer['cid'] = response.meta['cid']
        # The banner URL is embedded in the style attribute; slice off the
        # surrounding CSS (first 21 chars and the trailing ")").
        # Guarded: get() returns None when the node is missing.
        banner = response.xpath('//div[@class="banner-wrap"]/@style').get()
        composer['banner'] = banner[21:-1] if banner else banner
        # Display name.
        composer['name'] = response.xpath('//p[contains(@class, "creator-name")]/text()').get()
        # Avatar image.
        composer['avatar'] = response.xpath('//span[@class="avator-wrap-s"]/img/@src').get()
        # Self-introduction text.
        composer['intro'] = response.xpath('//p[contains(@class, "creator-desc")]/text()').get()
        # Like count; guard against a missing node before stripping commas.
        like_counts = response.xpath('//span[contains(@class, \
        "like-counts")]/text()').get()
        composer['like_counts'] = like_counts.replace(',', '') if like_counts else like_counts
        # Follower count.
        composer['fans_counts'] = response.xpath('//span[contains(@class, "fans-counts")]/@data-counts').get()
        # Number of accounts this creator follows.
        composer['follow_counts'] = response.xpath('//span[@class="follow-wrap"]/span[2]/text()').get()
        # Location.
        composer['location'] = response.xpath('//span[contains(@class, "icon-location")]/following-sibling::span[1]/text()').get()
        # Occupation.
        composer['career'] = response.xpath('//span[contains(@class, "icon-career")]/following-sibling::span[1]/text()').get()
        yield composer

    def parse_comment(self, response):
        """Parse one page of the comment API (JSON) into CommentItems,
        following pagination and scheduling a profile request for each
        commenter."""
        result = json.loads(response.text)
        # Follow the next page of comments, if any.
        next_page = result['data']['next_page_url']
        if next_page:
            yield response.follow(next_page, callback=self.parse_comment)
        comments = result['data']['list']
        for c in comments:
            comment = CommentItem()
            # Comment id and the article it belongs to.
            comment['commentid'] = c['commentid']
            comment['pid'] = c['articleid']
            # Comment body.
            comment['content'] = c['content']
            # Number of approvals (likes) on this comment.
            comment['like_counts'] = c['count_approve']
            comment['created_at'] = c['addtime']
            comment['cid'] = c['userInfo']['userid']
            comment['uname'] = c['userInfo']['username']
            comment['avatar'] = c['userInfo']['face']
            # Only set when this comment replies to another comment.
            if c['reply']:
                comment['reply'] = c['reply']['commentid']
            yield comment
            # Also fetch the commenter's profile page.
            request = Request(self.composer_url % comment['cid'], callback=self.parse_composer)
            request.meta['cid'] = comment['cid']
            yield request

整体实现的代码如下:

项目所有代码链接:https://pan.baidu.com/s/1vhbbZk0nWsPMF-6VLZ7ksg 密码:tiju

数据库db.sql链接:https://pan.baidu.com/s/1nrutxRlziCGI6CXyEmdI4Q 密码:nonz

只要修改相应的数据库配置,并在数据库中运行相应的 sql 文件就可以执行了。前提是必须安装 Scrapy 框架和 pymysql 第三方库,安装方式如下:

在cmd中的安装:

pip install scrapy

pip install pymysql

 在pycharm中的安装:

File—>Settings—>Project 工程名—>Project Interpreter 中点击 + ,输入相应的第三方库的名字,点击 Install Package 便可以进行安装。