Python糗百爬虫
# -*- coding:utf-8 -*-
"""Fetch one listing page from qiushibaike.com and print the posts found on it.

The page at ``URL`` is downloaded with a browser-like User-Agent header,
decoded as UTF-8 and scanned with ``ITEM_PATTERN``.  Every match is printed
on its own line as four space-separated fields.

The script runs on both Python 2.6+ and Python 3.
"""
import re
import urllib  # not used below; kept because the original script imported it
from contextlib import closing

try:  # Python 2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 split urllib2 into urllib.request / urllib.error
    from urllib.error import URLError
    from urllib.request import Request, urlopen

URL = 'https://www.qiushibaike.com/8hr/page/1/'
HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

# Captures, in order, for every post block:
#   1. the text of the <h2> that follows '<div class="author'
#   2. the text of the first <span> after '<div class="content">'
#   3. the text inside the '<i class="number...' element
#   4. whatever follows that </i> up to the next </span>
# re.S lets '.' cross newlines, since one post spans many lines of HTML.
ITEM_PATTERN = re.compile(
    '<div class="author.*?<h2>(.*?)</h2>.*?'
    '<div class="content">.*?<span>(.*?)</span>.*?'
    '<i class="number.*?>(.*?)</i>(.*?)</span>.*?',
    re.S
)


def fetch_page(url, headers):
    """Return the body of *url* decoded as UTF-8.

    Args:
        url: Address of the page to download.
        headers: Mapping of HTTP request headers to send.

    Raises:
        URLError: If the request fails (HTTPError is a subclass).
    """
    request = Request(url, headers=headers)
    # closing() guarantees the connection is released on both Python 2 and 3,
    # even if read() or decode() raises.
    with closing(urlopen(request)) as response:
        return response.read().decode('utf-8')


def parse_items(content):
    """Return a list of 4-tuples, one per ``ITEM_PATTERN`` match in *content*.

    The captured text is returned untouched (surrounding whitespace and any
    inner markup are preserved).  An empty list means nothing matched.
    """
    return ITEM_PATTERN.findall(content)


def main():
    """Download the page and print one line per post, or the error details."""
    try:
        content = fetch_page(URL, HEADERS)
    except URLError as e:
        # HTTPError carries both attributes; a plain URLError only 'reason'.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return

    for item in parse_items(content):
        # A single-argument print call emits the same text on Python 2 and 3.
        print(" ".join(item))


if __name__ == "__main__":
    main()
爬取结果: