Python Practice: Scraping the Maoyan Movies TOP100

Task:

Scrape http://maoyan.com/board/4?offset=0 for the TOP 100 movies: ① title, ② actors, ③ release date, ④ poster image, and write the results to a database.

The code:

#!/usr/bin/env python
# coding:utf-8

import re
import urllib.request  # keeps urllib.request.urlretrieve usable in the commented-out code below
from urllib import request

import pymysql

url = r'http://maoyan.com/board/4?offset='


def get_content(url):
    # Fetch the page source and strip ALL spaces, which is why none of
    # the regex patterns below contain whitespace.
    with request.urlopen(url) as f:
        content = f.read().decode('utf-8').replace(' ', '')
        return content


def create_url(url):
    # The 100 movies sit on 10 pages: offset = 0, 10, ..., 90.
    url_li = []
    for i in range(0, 100, 10):
        newurl = url + '%d' % i
        url_li.append(newurl)
    return url_li


def get_film(content):
    # Movie titles (no spaces in the pattern: get_content stripped them).
    pattern = r'<pclass="name"><ahref=".*"title="(.*)"data-act="boarditem-click"data-val="{movieId:.*}">'
    return re.findall(pattern, content)


def get_date(content):
    # Release dates; an optional "(country)" suffix is cleaned up later.
    pattern = r'<pclass="releasetime">上映时间:(.*)\(?.*\)?</p></div>'
    return re.findall(pattern, content)


def get_act(content):
    # Actor list.
    pattern = r'主演:(.+)'
    return re.findall(pattern, content)


def get_purl(content):
    # Poster image URLs.
    pattern = r'<imgdata-src="(.+)"alt=".*"class="board-img"/>'
    return re.findall(pattern, content)


if __name__ == '__main__':
    # urls = create_url(url)
    # purls = []
    # with open('E:\\wenjian.txt','a+') as f:
    #     for url in urls:
    #         f.write(get_content(url))


    with open('E:\\wenjian.txt', 'r') as f:
        neirong = f.read()
        films = get_film(neirong)
        acts = get_act(neirong)
        dates = get_date(neirong)
        print(dates)
        # Strip the "(country)" suffix from dates that have one.
        for i in range(len(dates)):
            if '(' in dates[i]:
                dates[i] = dates[i].replace(re.findall(r'\(.+\)', dates[i])[0], '')
        print(dates[12] + '-01-01')  # debug check
        # Pad year-only dates out to a full YYYY-01-01 date.
        for i in range(len(dates)):
            if len(dates[i]) == 4:
                dates[i] = dates[i] + '-01-01'
        print(dates)

        conn = pymysql.connect(host='localhost', user='root',
                               passwd='Linux', db='test',
                               charset='utf8')
        cur = conn.cursor()

        ins = 'INSERT INTO movie(actor,date,img) VALUES(%s,%s,%s)'
        # Posters were saved as 1.jpg ... 100.jpg, so image i+1 belongs
        # to acts[i] and dates[i].
        for i in range(100):
            print(i)  # shows which record failed if an error occurs
            with open('E:\\pp\\%s.jpg' % (i + 1), 'rb') as fimg:
                img = fimg.read()
                cur.execute(ins, (acts[i], dates[i], img))

        conn.commit()
        cur.close()
        conn.close()


#     for url in urls:
#         Uep = get_purl(url)
#         for urll in Uep:
#             purls.append(urll)
#     films = []
#     dates = []
#     acts = []
#
#     for i in urls:
#         a = get_film(i)
#         for j in a:
#             films.append(j)
#         b = get_date(i)
#         for j2 in b:
#             dates.append(j2)
#         c = get_act(i)
#         for j3 in c:
#             acts.append(j3)
#
#     print('films = %s, actors = %s,dates = %s' % (len(films), len(acts), len(dates)))
#
# zhenli = list(zip(films,acts,dates))
# # Clean up the dates
# for i in range(0,len(dates)-1):
#     if '(' in dates[i]:
#         dates[i] = dates[i].replace(re.findall(r'\(.+\)',dates[i])[0],'')
#
# x = 1
# for i in purls:
#     urllib.request.urlretrieve(i,filename='E:\\pp\\%s.jpg' % (x))
#     x += 1
#
# conn = pymysql.connect(host = 'localhost',user = 'root',
#                        passwd = 'Linux',db = 'test',
#                        charset = 'utf8')
# cur = conn.cursor()
#
# ins = 'INSERT INTO movie(actor,date,img) VALUES(%s,%s,%s)'
# for i in range(1,100):
#     print(i)
#     with open('E:\\pp\\%s.jpg'%i,'rb') as f :
#         img = f.read()
#         cur.execute(ins,(acts[i],dates[i],img))
#
# conn.commit()
#
# cur.close()

The code is messy: my IP kept getting flagged as bot traffic and banned, so I finished in a rush without tidying up. The commented-out parts are the original code, i.e. the version that scraped the site directly on every run instead of caching the page source to a file first, which is exactly what got my IP banned.

Function descriptions:

def get_content(url):

Fetches the page source and strips all spaces from it (which is why the regex patterns contain no whitespace).

def create_url(url):

The 100 movies are spread across 10 pages. The offset parameter follows an obvious pattern, so this builds the 10 URLs and returns them in a list.
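
As a quick illustration (a standalone sketch of the same loop), the ten generated URLs look like this:

url = r'http://maoyan.com/board/4?offset='

# Each page shows 10 movies; the offset advances in steps of 10.
for i in range(0, 100, 10):
    print(url + '%d' % i)
# http://maoyan.com/board/4?offset=0
# http://maoyan.com/board/4?offset=10
# ...
# http://maoyan.com/board/4?offset=90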

def get_film(content):

def get_date(content):

def get_act(content):

These three functions use regular expressions to pull out the movie titles, release dates, and actors respectively.
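
To see why the patterns contain no spaces, here is a minimal, hand-made approximation of one list entry after get_content() has stripped the source (an illustration, not a verbatim copy of Maoyan's markup):

import re

# One board entry with all spaces removed, as get_content() would leave it.
sample = ('<pclass="name"><ahref="/films/1203"title="霸王别姬"'
          'data-act="boarditem-click"data-val="{movieId:1203}">')

pattern = r'<pclass="name"><ahref=".*"title="(.*)"data-act="boarditem-click"data-val="{movieId:.*}">'
print(re.findall(pattern, sample))  # ['霸王别姬']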

def get_purl(content):

Also regex-based; it pulls out the poster image URLs.

    # with open('E:\\wenjian.txt','a+') as f:
    #     for url in urls:
    #         f.write(get_content(url))

This block scrapes the source of all 10 pages and appends it to the file E:\\wenjian.txt; note it is opened in 'a+' mode. Run it once, then comment it out so repeated runs don't get the IP banned again.
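
If the pages do have to be re-fetched, sending a browser-like User-Agent and pausing between requests makes the bot detection far less likely to trigger. A minimal sketch (the header string and the 2-second delay are my own choices, not anything the site documents):

import time
from urllib import request

def get_content_polite(url):
    # urllib's default User-Agent is an easy flag; mimic a browser instead.
    req = request.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
    with request.urlopen(req) as f:
        return f.read().decode('utf-8').replace(' ', '')

with open('E:\\wenjian.txt', 'a+') as f:
    for offset in range(0, 100, 10):
        f.write(get_content_polite('http://maoyan.com/board/4?offset=%d' % offset))
        time.sleep(2)  # arbitrary delay between requests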

    with open('E:\\wenjian.txt','r') as f:
        neirong = f.read()
        films = get_film(neirong)
        acts = get_act(neirong)
        dates = get_date(neirong)
        print(dates)

This block recovers all the movie titles, actors, and dates from the cached file (the poster images were downloaded earlier, so they are not fetched again here). The print is only a sanity check and can be removed.
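
Since the three lists are later paired up by index, it is worth confirming they are the same length before writing anything to the database; one missed regex match would silently misalign every following row. A short check (my addition, not in the original):

# All three lists should hold exactly 100 entries, one per movie.
assert len(films) == len(acts) == len(dates) == 100, \
    (len(films), len(acts), len(dates))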

for i in range(len(dates)):
    if '(' in dates[i]:
        dates[i] = dates[i].replace(re.findall(r'\(.+\)',dates[i])[0],'')
print(dates[12]+'-01-01')
for i in range(len(dates)):
    if len(dates[i]) == 4:
        dates[i] = dates[i] + '-01-01'
print(dates)

This block normalizes the dates. The reason I kept re-scraping the site (and getting banned) was that I hadn't noticed the release dates actually come in three different formats,

namely:

1996-11-04(美国)

1996-11-04

1996

which annoyed me enough to want a few choice words with whoever built the site.

The processing takes three steps. ① Scrape all the dates into one list (so all three formats are mixed together), then:

for i in range(len(dates)):
    if '(' in dates[i]:
        dates[i] = dates[i].replace(re.findall(r'\(.+\)',dates[i])[0],'')

② A regex replacement strips the parenthesized part, leaving only the date.

for i in range(len(dates)):
    if len(dates[i]) == 4:
        dates[i] = dates[i] + '-01-01'

③ Entries that are only a year get a default month and day of January 1 appended.

After this, every date is in a standard format. The two prints in this block are again only there to check the result and can be removed.
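
The two loops can also be folded into a single helper that makes the three formats explicit. A sketch of the same logic:

import re

def normalize_date(d):
    # '1996-11-04(美国)' -> '1996-11-04'; '1996' -> '1996-01-01';
    # '1996-11-04' passes through unchanged.
    d = re.sub(r'\(.+\)', '', d)  # drop the "(country)" suffix
    if len(d) == 4:               # year only
        d += '-01-01'
    return d

print([normalize_date(d) for d in ['1996-11-04(美国)', '1996-11-04', '1996']])
# ['1996-11-04', '1996-11-04', '1996-01-01']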


        conn = pymysql.connect(host='localhost', user='root',
                               passwd='Linux', db='test',
                               charset='utf8')
        cur = conn.cursor()

        ins = 'INSERT INTO movie(actor,date,img) VALUES(%s,%s,%s)'
        for i in range(100):
            print(i)
            with open('E:\\pp\\%s.jpg' % (i + 1), 'rb') as fimg:
                img = fimg.read()
                cur.execute(ins, (acts[i], dates[i], img))

        conn.commit()
        cur.close()
        conn.close()

This block writes everything into the database. The print(i) is there so that, if something fails, you can see which record it choked on; it can be removed.
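
The script assumes the movie table already exists; its schema is never shown in the post, so here is a guess that matches the INSERT statement (all column types are assumptions; MEDIUMBLOB because a JPEG can exceed the 64 KB limit of a plain BLOB):

import pymysql

conn = pymysql.connect(host='localhost', user='root',
                       passwd='Linux', db='test', charset='utf8')
cur = conn.cursor()
# Assumed schema matching INSERT INTO movie(actor,date,img).
cur.execute('''
    CREATE TABLE IF NOT EXISTS movie (
        id INT AUTO_INCREMENT PRIMARY KEY,
        actor VARCHAR(255),
        date DATE,
        img MEDIUMBLOB
    ) DEFAULT CHARSET=utf8
''')
conn.commit()
cur.close()
conn.close()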


# x = 1
# for i in purls:
#     urllib.request.urlretrieve(i,filename='E:\\pp\\%s.jpg' % (x))
#     x += 1

Among the commented-out code, this block is the original poster-downloading code: it pulled the images straight from each page, which meant hitting the site on every run.
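
One reason those downloads kept getting blocked is that urlretrieve also sends urllib's default User-Agent. A hedged alternative using the same browser-like header as before (purls is the list of poster URLs collected by get_purl):

from urllib import request

def save_image(img_url, path):
    # Fetch with a browser-like UA instead of urlretrieve's default.
    req = request.Request(img_url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
    with request.urlopen(req) as resp, open(path, 'wb') as out:
        out.write(resp.read())

# for x, purl in enumerate(purls, start=1):
#     save_image(purl, 'E:\\pp\\%d.jpg' % x)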


Result screenshots:

The downloaded movie posters.


The date-cleanup output (the first line is before processing, the third line after).


The data in the database:


Row 198 is a leftover from running the script several times just now for the demo, which inserted duplicate data....
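
The duplicates could be avoided by checking for an existing row before inserting. A sketch reusing the script's cur and conn (it assumes an actor/date pair identifies a movie, which is my assumption, not something the schema enforces):

# Skip rows that are already in the table.
check = 'SELECT 1 FROM movie WHERE actor=%s AND date=%s'
ins = 'INSERT INTO movie(actor,date,img) VALUES(%s,%s,%s)'
for i in range(100):
    with open('E:\\pp\\%s.jpg' % (i + 1), 'rb') as fimg:
        img = fimg.read()
    if not cur.execute(check, (acts[i], dates[i])):  # 0 rows found
        cur.execute(ins, (acts[i], dates[i], img))
conn.commit()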