Python 爬取 Top 100 电影缩略图
代码如下:
"""Download the poster thumbnails listed on the Maoyan board pages.

The board is paged in steps of ten entries via the ``offset`` query
parameter; ten pages are visited.  Each thumbnail is saved under
``IMG_DIR`` using the movie title as the file name.
"""
import os
import re
from urllib import request

url = 'http://maoyan.com/board/4'

# Directory the thumbnails are written to.
IMG_DIR = '/opt/img'

# Negated character classes and a lazy capture keep every match inside a
# single tag.  With greedy ``.*`` the match runs to the LAST closing
# quote/tag on the line, so several entries sharing one line of HTML
# collapse into a single (wrong) result.
NAME_PATTERN = re.compile(
    r'<p class="name"><a href="/films/[^"]*" title="[^"]*" '
    r'data-act="boarditem-click" data-val="\{movieId:[^}]*\}">(.*?)</a></p>'
)
IMG_PATTERN = re.compile(
    r'<img data-src="([^"]*)" alt="[^"]*" class="board-img" />'
)


def get_content(url):
    """Fetch *url* and return the response body decoded as UTF-8."""
    with request.urlopen(url) as f:
        return f.read().decode('utf-8')


def get_name(url, content=None):
    """Return the movie titles found on a board page, in page order.

    Args:
        url: Page to fetch when *content* is not supplied.
        content: Already-downloaded HTML of the page.  Passing it avoids
            a second request for a page that was fetched elsewhere.

    Returns:
        A list of title strings (empty when nothing matches).
    """
    if content is None:
        content = get_content(url)
    return NAME_PATTERN.findall(content)


def get_pattern(url, content=None):
    """Return the thumbnail URLs found on a board page, in page order.

    Args:
        url: Page to fetch when *content* is not supplied.
        content: Already-downloaded HTML of the page.  Passing it avoids
            a second request for a page that was fetched elsewhere.

    Returns:
        A list of image URL strings (empty when nothing matches).
    """
    if content is None:
        content = get_content(url)
    return IMG_PATTERN.findall(content)


def create_url(url):
    """Return the ten paged URLs (``?offset=0`` .. ``?offset=90``) for *url*."""
    return ['%s?offset=%d' % (url, page * 10) for page in range(10)]


def _safe_filename(name):
    """Replace path separators and NUL in *name* so it is a plain file name."""
    for bad in ('/', os.sep, '\0'):
        name = name.replace(bad, '_')
    return name


def main():
    """Download every thumbnail from every board page into ``IMG_DIR``."""
    os.makedirs(IMG_DIR, exist_ok=True)
    for page_url in url_li:
        # Fetch each page once and hand the HTML to both extractors.
        content = get_content(page_url)
        names = get_name(page_url, content)
        images = get_pattern(page_url, content)
        # zip() pairs titles with images positionally and stops at the
        # shorter list instead of raising IndexError on a mismatch.
        for movie_name, img_url in zip(names, images):
            target = os.path.join(
                IMG_DIR, '%s.jpg' % _safe_filename(movie_name))
            request.urlretrieve(img_url, target)


url_li = create_url(url)

if __name__ == '__main__':
    main()
爬取到的缩略图保存在 /opt/img/ 目录下,并以电影名作为图片文件名。
结果如下:
******** end *********