Python Practice Exercise: Scraping the Maoyan Top 100 Movies
Task requirements:
Scrape this site: http://maoyan.com/board/4?offset=0 and collect, for each of the TOP 100 movies, ① the film title, ② the actors, ③ the release date, and ④ the poster image, then write them into a database.
The code is as follows:
#!/usr/bin/env python
# coding:utf-8
import re
import urllib
from urllib import request
import pymysql

url = r'http://maoyan.com/board/4?offset='


def get_content(url):
    # fetch a page and strip out every space; the regexes below match the squashed HTML
    with request.urlopen(url) as f:
        content = f.read().decode('utf-8').replace(' ', '')
    return content


def create_url(url):
    # build the 10 page URLs (offset = 0, 10, ..., 90)
    url_li = []
    for i in range(0, 100, 10):
        newurl = url + '%d' % i
        url_li.append(newurl)
    return url_li


def get_film(content):
    # content = get_content(url)
    pattern = r'<pclass="name"><ahref=".*"title="(.*)"data-act="boarditem-click"data-val="{movieId:.*}">'
    return re.findall(pattern, content)


def get_date(content):
    # content = get_content(url)
    pattern = r'<pclass="releasetime">上映时间:(.*)\(?.*\)?</p></div>'
    return re.findall(pattern, content)


def get_act(content):
    # content = get_content(url)
    pattern = r'主演:(.+)'
    return re.findall(pattern, content)


def get_purl(content):
    # content = get_content(url)
    pattern = r'<imgdata-src="(.+)"alt=".*"class="board-img"/>'
    return re.findall(pattern, content)


if __name__ == '__main__':
    # urls = create_url(url)
    # purls = []
    # with open('E:\\wenjian.txt','a+') as f:
    #     for url in urls:
    #         f.write(get_content(url))

    # read the cached page source back in instead of hitting the site again
    with open('E:\\wenjian.txt', 'r') as f:
        neirong = f.read()
    films = get_film(neirong)
    acts = get_act(neirong)
    dates = get_date(neirong)
    print(dates)

    # normalize the dates: strip the parenthesized country, then pad year-only entries
    for i in range(len(dates)):
        if '(' in dates[i]:
            dates[i] = dates[i].replace(re.findall(r'\(.+\)', dates[i])[0], '')
    print(dates[12] + '-01-01')
    for i in range(len(dates)):
        if len(dates[i]) == 4:
            dates[i] = dates[i] + '-01-01'
    print(dates)

    conn = pymysql.connect(host='localhost', user='root',
                           passwd='Linux', db='test',
                           charset='utf8')
    cur = conn.cursor()
    ins = 'INSERT INTO movie(actor,date,img) VALUES(%s,%s,%s)'
    for i in range(len(acts)):
        print(i)
        # the posters were saved as 1.jpg .. 100.jpg, so file i+1 belongs to movie i
        with open('E:\\pp\\%s.jpg' % (i + 1), 'rb') as f:
            img = f.read()
        cur.execute(ins, (acts[i], dates[i], img))
    conn.commit()
    cur.close()

    # for url in urls:
    #     Uep = get_purl(url)
    #     for urll in Uep:
    #         purls.append(urll)
    # films = []
    # dates = []
    # acts = []
    #
    # for i in urls:
    #     a = get_film(i)
    #     for j in a:
    #         films.append(j)
    #     b = get_date(i)
    #     for j2 in b:
    #         dates.append(j2)
    #     c = get_act(i)
    #     for j3 in c:
    #         acts.append(j3)
    #
    # print('films = %s, actors = %s,dates = %s' % (len(films), len(acts), len(dates)))
    #
    # zhenli = list(zip(films,acts,dates))
    # # normalize the dates
    # for i in range(0,len(dates)-1):
    #     if '(' in dates[i]:
    #         dates[i] = dates[i].replace(re.findall(r'\(.+\)',dates[i])[0],'')
    #
    # x = 1
    # for i in purls:
    #     urllib.request.urlretrieve(i, filename='E:\\pp\\%s.jpg' % (x))
    #     x += 1
    #
    # conn = pymysql.connect(host = 'localhost', user = 'root',
    #                        passwd = 'Linux', db = 'test',
    #                        charset = 'utf8')
    # cur = conn.cursor()
    #
    # ins = 'INSERT INTO movie(actor,date,img) VALUES(%s,%s,%s)'
    # for i in range(1,100):
    #     print(i)
    #     with open('E:\\pp\\%s.jpg' % i, 'rb') as f:
    #         img = f.read()
    #     cur.execute(ins, (acts[i], dates[i], img))
    #
    # conn.commit()
    #
    # cur.close()
The code is fairly messy: my IP kept getting banned for bot-like behavior, so I finished in a rush and never cleaned it up. The commented-out parts are the original code — the version that fetched straight from the website every run instead of caching the page source to a file, which is what got my IP banned.
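Aside from caching the pages to a file, a common way to soften the bot detection is to send a browser-like User-Agent header and pause between requests. A minimal sketch of how get_content could do that (the header string is just a generic example, not something the site specifically requires):

import time
from urllib import request

def get_content(url):
    # pretend to be a normal browser; a bare urllib request is easy to flag as a bot
    req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with request.urlopen(req) as f:
        content = f.read().decode('utf-8').replace(' ', '')
    time.sleep(2)  # wait a bit between pages instead of hammering the site
    return content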
What each function does:
def get_content(url):
Fetches the page source and strips out all the spaces.
def create_url(url):
The 100 movies are spread over 10 pages; find the pattern in the URLs and return the 10 page URLs in a list.
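For example, called on the board URL above it returns (the middle entries elided):

>>> create_url('http://maoyan.com/board/4?offset=')
['http://maoyan.com/board/4?offset=0',
 'http://maoyan.com/board/4?offset=10',
 ...
 'http://maoyan.com/board/4?offset=90']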
def get_film(content):
def get_date(content):
def get_act(content):
These three functions use regular expressions to pull out the film title, the release date, and the actors respectively.
def get_purl(content):
Also uses a regular expression, to pull out the poster image URLs.
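Note that none of these patterns contain spaces: get_content() already strips every space from the page source, so the patterns have to match the squashed HTML. A quick self-test against a hand-written snippet (the snippet only imitates the real markup):

import re

sample = '<pclass="releasetime">上映时间:1994-09-10(加拿大)</p></div>'
print(re.findall(r'<pclass="releasetime">上映时间:(.*)\(?.*\)?</p></div>', sample))
# ['1994-09-10(加拿大)'] -- the parenthesized country is still captured,
# which is why the dates need the clean-up step further below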
# with open('E:\\wenjian.txt','a+') as f:
#     for url in urls:
#         f.write(get_content(url))
This block scrapes the source of all 10 pages and writes it into the file E:\\wenjian.txt; note that the file is opened in 'a+' mode. Run it once and then comment it out, so the script doesn't get the IP banned and error out.
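If you'd rather not comment the block out by hand after the first run, one option is to guard it with a file-existence check, roughly like this (same cache path assumed):

import os

if not os.path.exists('E:\\wenjian.txt'):   # only hit the site when there is no cached copy yet
    urls = create_url(url)
    with open('E:\\wenjian.txt', 'a+') as f:
        for u in urls:
            f.write(get_content(u))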
with open('E:\\wenjian.txt','r') as f:
    neirong = f.read()
films = get_film(neirong)
acts = get_act(neirong)
dates = get_date(neirong)
print(dates)
This block extracts all the film titles, actors, and dates; the images were already downloaded earlier, so that isn't repeated here. The print just dumps the result for inspection and can be removed.
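Before moving on it's worth making sure the three lists really line up one entry per movie, otherwise the database rows pair the wrong actors with the wrong dates; a quick check could be:

assert len(films) == len(acts) == len(dates) == 100, \
    (len(films), len(acts), len(dates))   # any regex miss shows up here immediately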
for i in range(len(dates)):
    if '(' in dates[i]:
        dates[i] = dates[i].replace(re.findall(r'\(.+\)', dates[i])[0], '')
print(dates[12] + '-01-01')
for i in range(len(dates)):
    if len(dates[i]) == 4:
        dates[i] = dates[i] + '-01-01'
print(dates)
This block normalizes the dates. The reason my IP kept getting banned from scraping the pages over and over was that I hadn't noticed the release time actually comes in three different formats,
namely:
1996-11-04(美国)
1996-11-04
1996
which annoyed me enough that I wanted to have a few choice words with whoever built the site.
The processing takes three steps: ① scrape all the dates into one list (so all three formats are mixed together),
②
for i in range(len(dates)):
    if '(' in dates[i]:
        dates[i] = dates[i].replace(re.findall(r'\(.+\)', dates[i])[0], '')
uses a regex to strip the parenthesized part from the entries that have one, leaving only the date;
③
for i in range(len(dates)):
    if len(dates[i]) == 4:
        dates[i] = dates[i] + '-01-01'
pads the year-only entries with a default month and day of January 1st.
After this, every date is in a standard format. The two prints in this block are also just for checking the result and can be removed.
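As a side note, the two loops could also be folded into a single helper that handles all three formats in one pass; a sketch (not what the submitted code does):

import re

def normalize_date(d):
    d = re.sub(r'\(.*\)$', '', d)               # drop a trailing "(country)" if present
    return d + '-01-01' if len(d) == 4 else d    # pad bare years to YYYY-01-01

dates = [normalize_date(d) for d in dates]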
conn = pymysql.connect(host='localhost', user='root',
                       passwd='Linux', db='test',
                       charset='utf8')
cur = conn.cursor()
ins = 'INSERT INTO movie(actor,date,img) VALUES(%s,%s,%s)'
for i in range(len(acts)):
    print(i)
    # the posters were saved as 1.jpg .. 100.jpg, so file i+1 belongs to movie i
    with open('E:\\pp\\%s.jpg' % (i + 1), 'rb') as f:
        img = f.read()
    cur.execute(ins, (acts[i], dates[i], img))
conn.commit()
cur.close()
This block writes the data into the database. The print(i) is there so that if something fails you can see which record it was writing when the error occurred; it can be removed.
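The INSERT assumes a movie table already exists; the post never shows the actual table definition, but a schema along these lines would accept the three columns (the column types here are my assumption), created once with the same cursor:

# assumed schema -- the original post does not show the real table definition
cur.execute('''
    CREATE TABLE IF NOT EXISTS movie (
        id    INT AUTO_INCREMENT PRIMARY KEY,
        actor VARCHAR(255),
        date  DATE,
        img   MEDIUMBLOB
    )
''')

MEDIUMBLOB (rather than BLOB) leaves room for posters larger than 64 KB.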
# x = 1
# for i in purls:
#     urllib.request.urlretrieve(i, filename='E:\\pp\\%s.jpg' % (x))
#     x += 1
Among the commented-out code, this block is the earlier image-downloading code; it pulls the images from each page directly, which means hitting the website every time.
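urlretrieve doesn't let you set request headers, so it tends to trip the same bot detection. If the poster downloads also get blocked, one workaround is to fetch each image URL with a Request carrying a User-Agent (same caveat as above, this is only a sketch; purls is assumed to be the list collected with get_purl):

import urllib.request

for x, purl in enumerate(purls, start=1):
    req = urllib.request.Request(purl, headers={'User-Agent': 'Mozilla/5.0'})
    # save as 1.jpg .. 100.jpg, matching the naming used by the insert loop
    with urllib.request.urlopen(req) as resp, open('E:\\pp\\%s.jpg' % x, 'wb') as out:
        out.write(resp.read())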
Result screenshots:
The downloaded movie posters.
Screenshot of the date processing (the first row is before processing, the third row is after).
Data in the database:
There are 198 rows because I just ran it several times for the demo, which inserted duplicate data....