【Python爬虫_7】爬取免费代理IP并验证是否可用(西刺代理)
import requests
from lxml import etree
import pymysql

# Scrape proxy ip/port pairs from the Xici proxy site and verify each one
# by routing a real request through it; working proxies are stored in the
# MySQL table XCIP and returned as "ip:port" strings.

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}


def parse_data(url):
    """Fetch one listing page, extract ip/port pairs, and keep the usable ones.

    Each candidate proxy is tested with a 5-second GET against baidu.com;
    proxies that answer 200 are inserted into the XCIP table and collected.

    Args:
        url: URL of one xicidaili.com listing page.

    Returns:
        list[str]: working proxies formatted as "ip:port".
    """
    conn = pymysql.Connect(
        host='127.0.0.1',
        port=3306,
        db='pyproject',
        user='root',
        passwd='root',
        charset='utf8',
    )
    all_ip = []  # working "ip:port" strings
    try:
        with conn.cursor() as curr:
            response = requests.get(url, headers=header)
            html = etree.HTML(response.content.decode('utf-8'))
            all_rows = html.xpath('//table[@id="ip_list"]//tr')
            # The first rows of the table are headers, hence the [2:] slice.
            for row in all_rows[2:]:
                ip = row.xpath('./td[2]/text()')[0]
                port = row.xpath('./td[3]/text()')[0]
                addr = ip + ':' + port
                proxies = {
                    'http': 'http://' + addr,
                    'https': 'https://' + addr,
                }
                # Probe the proxy; anything from a refused connection to a
                # timeout raises a RequestException subclass.
                try:
                    status = requests.get(
                        url='http://www.baidu.com',
                        proxies=proxies,
                        headers=header,
                        timeout=5,
                    ).status_code
                except requests.RequestException:
                    print('该ip不可用')
                    continue
                if status == 200:
                    print(addr)
                    # Parameterized query: scraped text never reaches the SQL
                    # string, preventing injection/quoting bugs.
                    curr.execute(
                        'insert into XCIP(ip, port) values(%s, %s)',
                        (ip, port),
                    )
                    conn.commit()
                    all_ip.append(addr)
    finally:
        # Always release the DB connection, even if the request or parse fails.
        conn.close()
    return all_ip


def main():
    """Crawl listing pages 1-4 and validate the proxies found on each."""
    base_url = 'https://www.xicidaili.com/nn/{}'
    for page in range(1, 5):
        parse_data(base_url.format(page))


if __name__ == '__main__':
    main()
爬取了前四页,就只有7个可用,这个网页太不靠谱了;