Scraping girl photos with Python

Program notes:

    Function: scrape the girl photos on pages 1-14 of www.mzitu.com

    Note: as the page structure of www.mzitu.com changes, this code needs matching updates; the quick check sketched below can tell you whether the regex patterns still match
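For example, the following rough check (not part of the crawler itself; the gallery URL is just a sample taken from the Referer used later) fetches one listing page and one gallery page and prints what two of the regular expressions used below capture. Empty output means the site markup has changed and the patterns need updating.

from urllib import request
import re

def check_patterns():
    # A browser-like User-Agent, since the default urllib one may be rejected
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    # Listing page: the gallery-link regex should capture gallery URLs
    req = request.Request(url="http://www.mzitu.com/page/1/", headers=headers)
    html = request.urlopen(req).read().decode("utf-8")
    print(re.findall(r'href="(.*?)"\starget="_blank"><', html)[:3])
    # Gallery page: the pagination regex should capture the page numbers
    req = request.Request(url="http://www.mzitu.com/119965", headers=headers)
    html = request.urlopen(req).read().decode("utf-8")
    print(re.findall(r'<span>(\d*)</span>', html))

check_patterns()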

Scraping results:

(result screenshot omitted)

Source code:

from urllib import request,parse
import re
import os
import time

# Get the number of image pages in a gallery
def getPicNum(links):
    headers = {
        "Host": "www.mzitu.com",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9",
    }
    req=request.Request(url=links,headers=headers)
    response=request.urlopen(req)
    html=response.read().decode("utf-8")
    pattern=re.compile(r'<span>(\d*)</span>')   # page numbers in the gallery's pagination bar
    res=pattern.findall(html)
    # print(res)
    # exit()
    pageNum=int(res[4])   # the fifth <span> match holds the total page count; fragile if the pagination markup changes
    return pageNum




# Get the list of gallery links from the listing pages
def getList():
    base_url="http://www.mzitu.com/page/{0}/"
    headers={
        "Host": "www.mzitu.com",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Referer": "http://www.mzitu.com/119965",
        "Accept-Language": "zh-CN,zh;q=0.9",
    }
    allLinks=list()
    for one in range(1,15):   # listing pages 1-14
        time.sleep(0.5)
        fullPath=base_url.format(one)
        req=request.Request(url=fullPath,headers=headers)
        response=request.urlopen(req)
        html=response.read().decode("utf-8")
        pattern=re.compile(r'href="(.*?)"\starget="_blank"><')   # gallery links on the listing page
        res=pattern.findall(html)
        allLinks=allLinks+res
        # print(res)
        # print("{0}---------------------------------------------".format(one))
    return allLinks

# Download a single image
def downloadPic(picPath, fileName,dirName):
    print("downloading...."+fileName)
    headers={
        "Host": "i.meizitu.net",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
        # The image host appears to check the Referer; requests without it may be rejected
        "Referer": "http://www.mzitu.com/120012",
        "Accept-Language": "zh-CN,zh;q=0.9",
    }
    req=request.Request(url=picPath,headers=headers)
    response=request.urlopen(req)
    context=response.read()
    with open("c:/Users/Michael/Desktop/爬取内容/{dirName}/{fileName}".format(dirName=dirName,fileName=fileName),"wb") as fp:
        fp.write(context)

# Get all image links in a gallery and download them
def getPic(base):
    base_url=base+"/%d"
    headers={
        "Host": "www.mzitu.com",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9",
        # "Cookie": " Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1518425004; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1518425648",
    }
    dirName=base_url.split("/")[-2]   # gallery id taken from the URL, e.g. "119965"
    if not os.path.exists("c:/Users/Michael/Desktop/爬取内容/{dirName}/".format(dirName=dirName)):
        os.makedirs("c:/Users/Michael/Desktop/爬取内容/{dirName}/".format(dirName=dirName))
    count=getPicNum(base)
    for one in range(1,count+1):   # pages run from 1 to count; +1 so the last image page is included
        time.sleep(0.5)
        fullUrl=base_url% one
        req=request.Request(url=fullUrl,headers=headers)
        response=request.urlopen(req)
        html=response.read().decode("utf-8")
        pattern=re.compile('img.*?src="(.*?)"')
        res=pattern.search(html)
        # print(res.group(1))
        if res is not None:
            picPath=res.group(1)
            fileName=picPath.split("/")[-1]
            # print("c:/Users/Michael/Desktop/{dirName}/fileName".format(dirName=dirName))
            # exit()
            downloadPic(picPath,fileName,dirName)
            # request.urlretrieve(picPath,"c:/Users/Michael/Desktop/{dirName}/{fileName}".format(dirName=dirName,fileName=fileName),headers)
        else:
            print("No image found on page {0}".format(one))
        # print(res)

        print(one,"------------------------------------------------------------------------")
    print("Download finished")



if __name__ == '__main__':
    allLinks=getList()    # gallery links collected from listing pages 1-14
    for one in allLinks:
        getPic(one)       # download every image in the gallery
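One caveat: a single failed request (a gallery that was removed, or an image the host refuses to serve) raises urllib.error.HTTPError or URLError and aborts the whole run. A minimal sketch of a more forgiving main block, reusing getList and getPic from above and simply skipping galleries that fail, could look like this; it is not part of the original script.

from urllib import error

if __name__ == '__main__':
    allLinks = getList()
    for one in allLinks:
        try:
            getPic(one)
        except (error.HTTPError, error.URLError) as exc:
            # Skip this gallery instead of aborting the whole crawl
            print("skipping {0}: {1}".format(one, exc))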