Python 爬取 Top100 电影缩略图

代码如下:

import os
import re
from urllib import request


# Base address of the board page; create_url() appends an ?offset=N query
# string to it to build the address of each result page.
url = 'http://maoyan.com/board/4'


def get_content(url):
    """Download *url* and return the response body as UTF-8 decoded text."""
    with request.urlopen(url) as response:
        raw_body = response.read()
    # Decoding happens after the response is closed; the bytes are already read.
    return raw_body.decode('utf-8')

def get_name(url):
    """Return the movie titles found on the board page at *url*.

    The page is downloaded with ``get_content`` and searched for
    ``<p class="name">`` entries; the text of the anchor inside each entry
    is collected.

    Args:
        url: Address of one board page.

    Returns:
        A list of title strings in the order they appear in the page, or an
        empty list when nothing matches.
    """
    content = get_content(url)
    # Non-greedy quantifiers keep every match inside a single entry. With a
    # greedy `.*`, two entries on the same line would be swallowed into one
    # match and only the last title on that line would be returned.
    pattern = (
        r'<p class="name"><a href="/films/.*?" title=".*?" '
        r'data-act="boarditem-click" data-val="{movieId:.*?}">(.*?)</a></p>'
    )
    return re.findall(pattern, content)

def get_pattern(url):
    """Return the thumbnail image URLs found on the board page at *url*.

    The page is downloaded with ``get_content`` and searched for
    ``<img ... class="board-img" />`` tags; the value of each tag's
    ``data-src`` attribute is collected.

    Args:
        url: Address of one board page.

    Returns:
        A list of image URL strings in the order they appear in the page, or
        an empty list when nothing matches.
    """
    content = get_content(url)
    # Non-greedy quantifiers stop the capture at the first closing quote. With
    # a greedy `.*`, several <img> tags on one line would merge into a single
    # bogus "URL" spanning all of them.
    pattern = r'<img data-src="(.*?)" alt=".*?" class="board-img" />'
    return re.findall(pattern, content)

def create_url(url, pages=10, page_size=10):
    """Build the list of paginated addresses derived from *url*.

    Each address is *url* followed by ``?offset=N``, where N advances by
    *page_size* from one page to the next, starting at 0.

    Args:
        url: Base address to which the query string is appended.
        pages: Number of page addresses to generate (default 10).
        page_size: Offset increment between consecutive pages (default 10).

    Returns:
        A list of *pages* address strings; empty when *pages* is 0.
    """
    return [url + '?offset=%d' % (page * page_size) for page in range(pages)]


url_li = create_url(url)

# Directory that receives the downloaded thumbnails.
img_dir = '/opt/img'
# urlretrieve() opens the target file directly and does not create missing
# directories, so make sure the destination exists first.
os.makedirs(img_dir, exist_ok=True)

for page_url in url_li:
    # NOTE: get_pattern() and get_name() each download the page themselves,
    # so every page is fetched twice.
    img_urls = get_pattern(page_url)
    names = get_name(page_url)
    # zip() pairs each thumbnail with its title and stops at the shorter list,
    # so a page where fewer titles than images were matched no longer raises
    # IndexError.
    for img_url, name in zip(img_urls, names):
        # A '/' inside a title would otherwise be read as a path separator.
        safe_name = name.replace('/', '_')
        request.urlretrieve(img_url, os.path.join(img_dir, '%s.jpg' % safe_name))

爬取到的缩略图保存在 /opt/img/ 目录下,并以电影名作为图片文件名。

结果如下:

python爬取top100电影缩略图

********   end   *********