Python Practice Exercise: Scraping the Maoyan Top 100 Movies
Task requirements:
Scrape this site: http://maoyan.com/board/4?offset=0 and collect, for each of the TOP 100 movies, ① the film title, ② the actors, ③ the release date, and ④ the poster image, then write them into a database.
The code is as follows:
#!/usr/bin/env python
# coding:utf-8
import re
import urllib
from urllib import request
import pymysql

url = r'http://maoyan.com/board/4?offset='


def get_content(url):
    # fetch a page and strip out every space; the regexes below match the squashed HTML
    with request.urlopen(url) as f:
        content = f.read().decode('utf-8').replace(' ', '')
    return content


def create_url(url):
    # build the 10 page URLs (offset = 0, 10, ..., 90)
    url_li = []
    for i in range(0, 100, 10):
        newurl = url + '%d' % i
        url_li.append(newurl)
    return url_li


def get_film(content):
    # content = get_content(url)
    pattern = r'<pclass="name"><ahref=".*"title="(.*)"data-act="boarditem-click"data-val="{movieId:.*}">'
    return re.findall(pattern, content)


def get_date(content):
    # content = get_content(url)
    pattern = r'<pclass="releasetime">上映时间:(.*)\(?.*\)?</p></div>'
    return re.findall(pattern, content)


def get_act(content):
    # content = get_content(url)
    pattern = r'主演:(.+)'
    return re.findall(pattern, content)


def get_purl(content):
    # content = get_content(url)
    pattern = r'<imgdata-src="(.+)"alt=".*"class="board-img"/>'
    return re.findall(pattern, content)


if __name__ == '__main__':
    # urls = create_url(url)
    # purls = []
    # with open('E:\\wenjian.txt','a+') as f:
    #     for url in urls:
    #         f.write(get_content(url))

    # read the cached page source back in instead of hitting the site again
    with open('E:\\wenjian.txt', 'r') as f:
        neirong = f.read()
    films = get_film(neirong)
    acts = get_act(neirong)
    dates = get_date(neirong)
    print(dates)

    # normalize the dates: strip the parenthesized country, then pad year-only entries
    for i in range(len(dates)):
        if '(' in dates[i]:
            dates[i] = dates[i].replace(re.findall(r'\(.+\)', dates[i])[0], '')
    print(dates[12] + '-01-01')
    for i in range(len(dates)):
        if len(dates[i]) == 4:
            dates[i] = dates[i] + '-01-01'
    print(dates)

    conn = pymysql.connect(host='localhost', user='root',
                           passwd='Linux', db='test',
                           charset='utf8')
    cur = conn.cursor()
    ins = 'INSERT INTO movie(actor,date,img) VALUES(%s,%s,%s)'
    for i in range(len(acts)):
        print(i)
        # the posters were saved as 1.jpg .. 100.jpg, so file i+1 belongs to movie i
        with open('E:\\pp\\%s.jpg' % (i + 1), 'rb') as f:
            img = f.read()
        cur.execute(ins, (acts[i], dates[i], img))
    conn.commit()
    cur.close()

    # for url in urls:
    #     Uep = get_purl(url)
    #     for urll in Uep:
    #         purls.append(urll)
    # films = []
    # dates = []
    # acts = []
    #
    # for i in urls:
    #     a = get_film(i)
    #     for j in a:
    #         films.append(j)
    #     b = get_date(i)
    #     for j2 in b:
    #         dates.append(j2)
    #     c = get_act(i)
    #     for j3 in c:
    #         acts.append(j3)
    #
    # print('films = %s, actors = %s,dates = %s' % (len(films), len(acts), len(dates)))
    #
    # zhenli = list(zip(films,acts,dates))
    # # normalize the dates
    # for i in range(0,len(dates)-1):
    #     if '(' in dates[i]:
    #         dates[i] = dates[i].replace(re.findall(r'\(.+\)',dates[i])[0],'')
    #
    # x = 1
    # for i in purls:
    #     urllib.request.urlretrieve(i, filename='E:\\pp\\%s.jpg' % (x))
    #     x += 1
    #
    # conn = pymysql.connect(host = 'localhost', user = 'root',
    #                        passwd = 'Linux', db = 'test',
    #                        charset = 'utf8')
    # cur = conn.cursor()
    #
    # ins = 'INSERT INTO movie(actor,date,img) VALUES(%s,%s,%s)'
    # for i in range(1,100):
    #     print(i)
    #     with open('E:\\pp\\%s.jpg' % i, 'rb') as f:
    #         img = f.read()
    #     cur.execute(ins, (acts[i], dates[i], img))
    #
    # conn.commit()
    #
    # cur.close()
The code is fairly messy: my IP kept getting banned for bot-like behavior, so I finished in a rush and never cleaned it up. The commented-out parts are the original code — the version that fetched straight from the website every run instead of caching the page source to a file, which is what got my IP banned.
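Aside from caching the pages to a file, a common way to soften the bot detection is to send a browser-like User-Agent header and pause between requests. A minimal sketch of how get_content could do that (the header string is just a generic example, not something the site specifically requires):

import time
from urllib import request

def get_content(url):
    # pretend to be a normal browser; a bare urllib request is easy to flag as a bot
    req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with request.urlopen(req) as f:
        content = f.read().decode('utf-8').replace(' ', '')
    time.sleep(2)  # wait a bit between pages instead of hammering the site
    return content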
What each function does:
def get_content(url):
Fetches the page source and strips out all the spaces.
def create_url(url):
The 100 movies are spread over 10 pages; find the pattern in the URLs and return the 10 page URLs in a list.
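For example, called on the board URL above it returns (the middle entries elided):

>>> create_url('http://maoyan.com/board/4?offset=')
['http://maoyan.com/board/4?offset=0',
 'http://maoyan.com/board/4?offset=10',
 ...
 'http://maoyan.com/board/4?offset=90']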
def get_film(content):
def get_date(content):
def get_act(content):
These three functions use regular expressions to pull out the film title, the release date, and the actors respectively.
def get_purl(content):
Also uses a regular expression, to pull out the poster image URLs.
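Note that none of these patterns contain spaces: get_content() already strips every space from the page source, so the patterns have to match the squashed HTML. A quick self-test against a hand-written snippet (the snippet only imitates the real markup):

import re

sample = '<pclass="releasetime">上映时间:1994-09-10(加拿大)</p></div>'
print(re.findall(r'<pclass="releasetime">上映时间:(.*)\(?.*\)?</p></div>', sample))
# ['1994-09-10(加拿大)'] -- the parenthesized country is still captured,
# which is why the dates need the clean-up step further below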
# with open('E:\\wenjian.txt','a+') as f:
#     for url in urls:
#         f.write(get_content(url))
This block scrapes the source of all 10 pages and writes it into the file E:\\wenjian.txt; note that the file is opened in 'a+' mode. Run it once and then comment it out, so the script doesn't get the IP banned and error out.
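If you'd rather not comment the block out by hand after the first run, one option is to guard it with a file-existence check, roughly like this (same cache path assumed):

import os

if not os.path.exists('E:\\wenjian.txt'):   # only hit the site when there is no cached copy yet
    urls = create_url(url)
    with open('E:\\wenjian.txt', 'a+') as f:
        for u in urls:
            f.write(get_content(u))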
with open('E:\\wenjian.txt','r') as f:
    neirong = f.read()
films = get_film(neirong)
acts = get_act(neirong)
dates = get_date(neirong)
print(dates)
This block extracts all the film titles, actors, and dates; the images were already downloaded earlier, so that isn't repeated here. The print just dumps the result for inspection and can be removed.
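Before moving on it's worth making sure the three lists really line up one entry per movie, otherwise the database rows pair the wrong actors with the wrong dates; a quick check could be:

assert len(films) == len(acts) == len(dates) == 100, \
    (len(films), len(acts), len(dates))   # any regex miss shows up here immediately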
for i in range(len(dates)):
    if '(' in dates[i]:
        dates[i] = dates[i].replace(re.findall(r'\(.+\)', dates[i])[0], '')
print(dates[12] + '-01-01')
for i in range(len(dates)):
    if len(dates[i]) == 4:
        dates[i] = dates[i] + '-01-01'
print(dates)
This block normalizes the dates. The reason my IP kept getting banned from scraping the pages over and over was that I hadn't noticed the release time actually comes in three different formats,
namely:
1996-11-04(美国)
1996-11-04
1996
which annoyed me enough that I wanted to have a few choice words with whoever built the site.
The processing takes three steps: ① scrape all the dates into one list (so all three formats are mixed together),
②
for i in range(len(dates)):
    if '(' in dates[i]:
        dates[i] = dates[i].replace(re.findall(r'\(.+\)', dates[i])[0], '')
uses a regex to strip the parenthesized part from the entries that have one, leaving only the date;
③
for i in range(len(dates)):
    if len(dates[i]) == 4:
        dates[i] = dates[i] + '-01-01'
pads the year-only entries with a default month and day of January 1st.
After this, every date is in a standard format. The two prints in this block are also just for checking the result and can be removed.
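As a side note, the two loops could also be folded into a single helper that handles all three formats in one pass; a sketch (not what the submitted code does):

import re

def normalize_date(d):
    d = re.sub(r'\(.*\)$', '', d)               # drop a trailing "(country)" if present
    return d + '-01-01' if len(d) == 4 else d    # pad bare years to YYYY-01-01

dates = [normalize_date(d) for d in dates]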
conn = pymysql.connect(host='localhost', user='root',
                       passwd='Linux', db='test',
                       charset='utf8')
cur = conn.cursor()
ins = 'INSERT INTO movie(actor,date,img) VALUES(%s,%s,%s)'
for i in range(len(acts)):
    print(i)
    # the posters were saved as 1.jpg .. 100.jpg, so file i+1 belongs to movie i
    with open('E:\\pp\\%s.jpg' % (i + 1), 'rb') as f:
        img = f.read()
    cur.execute(ins, (acts[i], dates[i], img))
conn.commit()
cur.close()
This block writes the data into the database. The print(i) is there so that if something fails you can see which record it was writing when the error occurred; it can be removed.
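The INSERT assumes a movie table already exists; the post never shows the actual table definition, but a schema along these lines would accept the three columns (the column types here are my assumption), created once with the same cursor:

# assumed schema -- the original post does not show the real table definition
cur.execute('''
    CREATE TABLE IF NOT EXISTS movie (
        id    INT AUTO_INCREMENT PRIMARY KEY,
        actor VARCHAR(255),
        date  DATE,
        img   MEDIUMBLOB
    )
''')

MEDIUMBLOB (rather than BLOB) leaves room for posters larger than 64 KB.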
# x = 1
# for i in purls:
#     urllib.request.urlretrieve(i, filename='E:\\pp\\%s.jpg' % (x))
#     x += 1
Among the commented-out code, this block is the earlier image-downloading code; it pulls the images from each page directly, which means hitting the website every time.
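urlretrieve doesn't let you set request headers, so it tends to trip the same bot detection. If the poster downloads also get blocked, one workaround is to fetch each image URL with a Request carrying a User-Agent (same caveat as above, this is only a sketch; purls is assumed to be the list collected with get_purl):

import urllib.request

for x, purl in enumerate(purls, start=1):
    req = urllib.request.Request(purl, headers={'User-Agent': 'Mozilla/5.0'})
    # save as 1.jpg .. 100.jpg, matching the naming used by the insert loop
    with urllib.request.urlopen(req) as resp, open('E:\\pp\\%s.jpg' % x, 'wb') as out:
        out.write(resp.read())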
Result screenshots:
The downloaded movie posters.
Screenshot of the date processing (the first row is before processing, the third row is after).
Data in the database:
There are 198 rows because I just ran it several times for the demo, which inserted duplicate data....