# Spider example: scrape WeChat official-account articles.
# Site: http://weixin.sogou.com/weixin?type=2&query=python&page=1
# This example uses an IP proxy pool!
"""Crawl WeChat official-account articles via Sogou's WeChat search.

Flow: fetch a search-result list page through a rotating proxy pool
(served locally at http://127.0.0.1:5010/get/), extract the article
links, fetch each detail page, parse it, and upsert the record into
MongoDB (database ``weixin``, collection ``article``).
"""
import re
import time
from urllib.parse import urlencode

import pymongo
import requests
from fake_useragent import UserAgent
from pyquery import PyQuery
from requests.exceptions import ConnectionError

client = pymongo.MongoClient('localhost')
db = client['weixin']
key_word = 'python开发'

# Consecutive-failure counters; each fetcher gives up after 3 ConnectionErrors.
connection_count = 0         # list-page connection failures
connection_detail_count = 0  # detail-page connection failures

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Cookie': 'CXID=161A70BF2483DEF017E035BBBACD2A81; [email protected]@@@@@@@@@@; SUID=57A70FAB5D68860A5B1E1053000BC731; IPLOC=CN4101; SUV=1528705320668261; pgv_pvi=5303946240; ABTEST=5|1528705329|v1; SNUID=EF1FB713B7B2D9EE6E2A6351B8B3F072; weixinIndexVisited=1; sct=2; SUIR=F607AE0BA0A5CFF9D287956DA129A225; pgv_si=s260076544; JSESSIONID=aaaILWONRn9wK_OiUhlnw; PHPSESSID=1i38a2ium8e5th2ukhnufua6r1; ppinf=5|1528783576|1529993176|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxODolRTklQUQlOTQlRTklOTUlOUN8Y3J0OjEwOjE1Mjg3ODM1NzZ8cmVmbmljazoxODolRTklQUQlOTQlRTklOTUlOUN8dXNlcmlkOjQ0Om85dDJsdUtPQzE0d05mQkJFeUI2d1VJVkhZUE1Ad2VpeGluLnNvaHUuY29tfA; pprdig=ENOZrtvLfoIOct75SgASWxBJb8HJQztLgFbyhRHBfeqrzcirg5WQkKZU2GDCFZ5wLI93Wej3P0hCr_rST0AlvGpF6MY9h24P267oHdqJvgP2DmCHDr2-nYvkLqKs8bjA7PLM1IEHNaH4zK-q2Shcz2A8V5IDw0qEcEuasGxIZQk; sgid=23-35378887-AVsfYtgBzV8cQricMOyk9icd0; ppmdig=15287871390000007b5820bd451c2057a94d31d05d2afff0',
}


def get_proxy():
    """Fetch one proxy address ("ip:port") from the local proxy pool.

    :return: proxy string, or None when the pool is unreachable or
        answers with a non-200 status.
    """
    try:
        response = requests.get("http://127.0.0.1:5010/get/")
        if response.status_code == 200:
            return response.text
        return None
    except Exception as e:
        print('获取代理异常:', e)
        return None


def get_page_list(url):
    """Request a search-result list page and return its HTML.

    A 302 answer means the proxy was flagged by the anti-spider layer,
    so we switch to a fresh proxy and retry; after 3 connection
    failures we give up.

    :param url: list-page URL
    :return: page source, or None on failure
    """
    global connection_count
    proxies = get_proxy()
    print('列表页代理:', proxies)
    if proxies is not None:
        proxies = {'http': 'http://' + proxies}
    try:
        # allow_redirects=False so the anti-spider 302 stays visible
        # instead of being silently followed.
        response = requests.get(url, allow_redirects=False, headers=headers,
                                proxies=proxies)
        if response.status_code == 200:
            print('列表页{}请求成功'.format(url))
            return response.text
        print('状态码:', response.status_code)
        if response.status_code == 302:
            # Switch proxy and retry. Bugfix: the original dropped the
            # recursive result, so a successful retry still returned None.
            return get_page_list(url)
    except ConnectionError as e:
        print('连接对方主机{}失败: {}'.format(url, e))
        connection_count += 1
        # Bugfix: '>=' instead of '==' — once the counter passed 3 the
        # original guard could never fire again (unbounded recursion).
        if connection_count >= 3:
            return None
        return get_page_list(url)


def parse_page_list(html):
    """Yield article detail-page URLs found in list-page HTML."""
    obj = PyQuery(html)
    for a in obj('.txt-box > h3 > a').items():
        yield a.attr('href')


def get_page_detail(url):
    """Request an article detail page and return its HTML.

    Unlike the list page, detail pages redirect normally, so redirects
    stay enabled; they are https, hence verify=False (NOTE: this skips
    certificate validation).

    :param url: detail-page URL
    :return: page source, or None on failure
    """
    global connection_detail_count
    proxies = get_proxy()
    print('详情页代理:', proxies)
    if proxies is not None:
        proxies = {'http': 'http://' + proxies}
    try:
        response = requests.get(url, headers=headers, verify=False,
                                proxies=proxies)
        if response.status_code == 200:
            print('详情页{}请求成功'.format(url))
            return response.text
        print('状态码:', response.status_code, url)
        # Switch proxy and retry (bugfix: propagate the result with return).
        return get_page_detail(url)
    except ConnectionError as e:
        print('连接对方主机{}失败: {}'.format(url, e))
        connection_detail_count += 1
        # Bugfix: '>=' so the give-up guard still fires past 3 failures.
        if connection_detail_count >= 3:
            return None
        return get_page_detail(url)


def parse_page_detail(html):
    """Extract author info and WeChat id from a detail page.

    :return: dict with keys 'info' and 'weixin'
    """
    obj = PyQuery(html)
    info = obj('.profile_inner').text()
    weixin = obj('.xmteditor').text()
    # Bugfix: the original printed the literal string 'info'.
    print(info)
    return {
        'info': info,
        'weixin': weixin,
    }


def save_to_mongodb(data):
    """Upsert one article record, keyed on its 'info' field.

    update_one(filter, update, upsert=True): update the matching
    document, or insert ``data`` when none matches ($set is the
    standard update operator).

    Bugfix: the original also did an unconditional insert_one(data)
    right before the upsert, storing every record twice.
    """
    db['article'].update_one({'info': data['info']}, {'$set': data},
                             upsert=True)
    time.sleep(1)  # throttle the crawl between writes


def main():
    """Crawl result pages 1..100 and persist every parsed article."""
    for page in range(1, 101):
        # Bugfix: the original hard-coded page 1 here, so all 100
        # iterations fetched the same page.
        url = 'http://weixin.sogou.com/weixin?query={}&type=2&page={}'.format(
            key_word, page)
        html = get_page_list(url)
        if html is None:
            continue
        for detail_url in parse_page_list(html):
            detail_html = get_page_detail(detail_url)
            if detail_html is None:
                continue
            data = parse_page_detail(detail_html)
            if data is not None:
                save_to_mongodb(data)


if __name__ == '__main__':
    main()