IPpool及爬取搜狗搜索下的微信页面

# 代理ip
# 默认爬虫时使用的是真实IP,为了伪装真实IP,使用代理IP
# redis数据库是内存型数据库,既支持将数据存储到内存,也可以将数据进行持久化的本地存储。
# redis非常适合做数据的缓存。

先拿  知乎  练手, 模仿用户正常登陆知乎

# 直接将登陆成功之后的Cookie放在headers中,向页面发送请求。
# Target page: the Zhihu home page; needs a logged-in session cookie to load.
url = 'https://www.zhihu.com/'
需要请求头headers

访问zhihu.com , 以正常身份登陆知乎,打开网页,F12

IPpool及爬取搜狗搜索下的微信页面

# Request headers copied from a real logged-in browser session (via F12 dev
# tools).  The Cookie value carries the authenticated Zhihu session, so the
# server treats this request as coming from a logged-in user.
headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/signup?next=%2F",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0",
    "Cookie": 'd_c0="AGAku45jtw2PTu09Dpvhke4klei4JarodIE=|1528425319"; q_c1=a46ba33124a2403cb907a2d9105a7bd4|1528425319000|1528425319000; capsion_ticket="2|1:0|10:1528705722|14:capsion_ticket|44:NzE1YzNhZmJiZjIxNDA1MTg4ZTdkN2YyMTFiNWQwNTk=|42d9339a55b21206f1cae511940cab2468b6e201c1adce2c958be61edfadb1a0"; _zap=cc087957-a74b-43fc-a1d9-f7bd685897b7; _xsrf=8e9717b7-05a4-481c-8e82-3ca140d5b266; tgw_l7_route=156dfd931a77f9586c0da07030f2df36; z_c0="2|1:0|10:1528705732|4:z_c0|92:Mi4xRHBTMkJRQUFBQUFBWUNTN2ptTzNEU1lBQUFCZ0FsVk54SUFMWEFEMGtfeUowbzNNeXlQRjcwYXVSNV9zMHV1UXZn|27f3bcbcc0b8271658d88009ef05d80b3da8b6df2fb156a7c186a5d076047a63'
}
# allow_redirects=False disables redirect following; otherwise the 302
# status code could not be observed directly.
response = requests.get(url, headers=headers, allow_redirects=False)
print(response.status_code)  # 302
print(response.text)
# 以下获得随机IP

import requests
from requests.exceptions import ConnectionError

# Endpoint of the locally running proxy-pool service; a GET request to it
# returns one proxy address as plain text (see get_proxy below).
PROXY_POOL_URL = 'http://localhost:5000/get'


def get_proxy():
    """Fetch one proxy address (``host:port``) from the local proxy pool.

    Returns:
        The proxy string on success, or ``None`` when the pool service is
        unreachable, times out, or responds with a non-200 status.
    """
    try:
        # Without a timeout the call could block forever if the pool hangs.
        response = requests.get(PROXY_POOL_URL, timeout=5)
    except requests.RequestException:
        # Covers ConnectionError as well as Timeout raised by timeout=5.
        return None
    if response.status_code == 200:
        # The pool returns plain text; strip a possible trailing newline so
        # the value can be concatenated into a proxy URL safely.
        return response.text.strip()
    return None

def get_html():
    """Fetch http://www.baidu.com through a proxy drawn from the pool.

    Prints the proxy used and the response status code; prints ``----``
    when no proxy is available or the request fails.
    """
    proxy = get_proxy()
    print(proxy)
    if proxy is None:
        # Guard: the original code crashed with TypeError on
        # 'http://' + None before the try block was even entered.
        print('----')
        return
    proxies = {
        'http': 'http://' + proxy
    }
    try:
        # timeout prevents hanging indefinitely on a dead proxy.
        response = requests.get('http://www.baidu.com', proxies=proxies,
                                timeout=10)
        print(response.status_code)
    except requests.RequestException:
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; keep the best-effort marker output.
        print('----')


if __name__ == '__main__':
    get_html()