介绍requests+threading多线程爬虫,提取采用xpath 和正则两种,介绍线程锁

爬虫专业的都喜欢scrapy框架,但scrapy上手需要时间,对初学者不太适合。

本文介绍使用requets爬虫,为了利于演示学习,使用了xpath解析html和完全使用正则来提取两种方法,仅供参考。

代码是爬取http://esf.sz.fang.com/,房天下网站的深圳二手房信息

import requests,json,random
import re,threading
from lxml import etree

lock=threading.Lock()

user_agent_list = [ \
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" ,\
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
count=0

def fang_com(page_url):    ##列表页

    header={}

    header['User-Agent']=random.choice(user_agent_list)
    header.update({
        "Host":"esf.sz.fang.com",
        #"Cookie":"global_cookie=fb1g6d0w64d2cmu86sv4g9n3va0j137sk48; vh_newhouse=3_1491312022_2816%5B%3A%7C%40%7C%3A%5D833300ee3177d88529c7aa418942ece9; newhouse_user_guid=2F163DE7-8201-7FA9-2FB6-E507FE6F03B1; SoufunSessionID_Esf=3_1495389730_232; sf_source=; s=; showAdsh=1; hlist_xfadhq_SZ=0%7c2017%2f5%2f25+1%3a21%3a47%7c; city=sz; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; logGuid=a768dd46-b85b-47f4-a7a0-0a6596cab4cd; __utma=147393320.1111837171.1491290389.1495646208.1495650134.9; __utmb=147393320.12.10.1495650134; __utmc=147393320; __utmz=147393320.1495650134.9.4.utmcsr=esf.sz.fang.com|utmccn=(referral)|utmcmd=referral|utmcct=/; unique_cookie=U_cqyov4ut5vv1al8e2858qhzgt17j2z06mph*14"
        })
    while(1):                     ###这个主要是,fang.com会随机返回几个10054或者10053,如果连页面都没读取到,提取就是后话了,这网站没有封杀,即使使用单ip只会很少时候随机来几个10054 ,('Connection aborted.', error(10054, ''))
        text=''
        try:
            text=requests.get(page_url,headers=header,timeout=10).text
            #print text
        except Exception as e:
            print e
        if text!='':
            break

    se = etree.HTML(text)                                 ###########为了利于大家学习,这段演示xpath提取信息
    all_dl=se.xpath('//dl[@class="list rel"]')
    print len(all_dl)
    for dl in all_dl:
        title=dl.xpath('.//dd[@class="info rel floatr"]/p[@class="title"]/a/text()')[0]

        url=dl.xpath('.//dd[@class="info rel floatr"]/p[@class="title"]/a/@href')[0]
        url='http://esf.sz.fang.com'+url

        info_list=dl.xpath('.//dd[@class="info rel floatr"]/p[@class="mt12"]/text()')
        #print json.dumps(info,ensure_ascii=False)    #py2显示汉字,py3可以直接print mt12
        info=''
        for l in info_list:
            l2= re.findall('\S*',l)[0]          ###消除空白和换行
            #print m_str
            info+=l2+'|'

        time.sleep(1)
        total_price,price_squere,huxin,cankao_shoufu,shiyong_mianji,jianzhu_mianji,years,discription=get_detail(url)


        lock.acquire()                  ###这里叫锁,一是保证count计数准确,而是不会导致多个线程乱print,导致看不清楚。加锁的目的是别的线程不能运行这段代码了。但我之前看到有的人乱加锁,把消耗时间很长的代码加锁,那样导致多线程就基本个废物
        global count
        count+=1
        print time.strftime('%H:%M:%S',time.localtime(time.time())),'    ',count
        print '列表页:'
        print ' title: %s\n url: %s\n info: %s\n'%(title,url,info)

        print '详情页:'
        print ' total_price: %s\n price_squere: %s\n huxin: %s\n cankao_shoufu: %s\n shiyong_mianji: %s\n jianzhu_mianji: %s\n years: %s \n'%(total_price,price_squere,huxin,cankao_shoufu,shiyong_mianji,jianzhu_mianji,years)
        print '**************************************************************'
        lock.release()



def get_detail(url):    ###详情页

    header={'User-Agent':random.choice(user_agent_list)}
    header.update({"Host":"esf.sz.fang.com"})

    while(1):
        content=''
        try:
            content=requests.get(url,headers=header,timeout=10).content
        except Exception as  e:
            print e
            pass
        if content!='':
            break

    content=content.decode('gbk').encode('utf8')  ##查看网页源代码可看到是gbk编码,直接print的话,如果你在pycharm设置控制台是utf8编码,那么控制台的中文则会乱码,cmd是gbk的恰好可以显示。如果你在pycharm设置控制台是utf8编码,需要这样做
    #print content

    inforTxt=getlist0(re.findall('(<div class="inforTxt">[\s\S]*?)<ul class="tool">',content))       ###########为了利于大家学习,这段演示正则表达式提取信息,某些信息可能在有的房子界面没有,要做好判断
    #print inforTxt

    total_price=getlist0(re.findall('</span>价:<span class="red20b">(.*?)</span>',inforTxt))

    price_squere=getlist0(re.findall('class="black">万</span>\((\d+?)元[\s\S]*?<a id="agantesfxq_B02_03"',inforTxt))
    huxin=getlist0(re.findall('<dd class="gray6"><span class="gray6">户<span class="padl27"></span>型:</span>(.*?)</dd>',inforTxt))
    cankao_shoufu=getlist0(re.findall('参考首付:</span><span class="black floatl">(.*?万)</span> </dd>',inforTxt))
    shiyong_mianji=getlist0(re.findall('>使用面积:<span class="black ">(.*?)</span></dd>',inforTxt))
    shiyong_mianji=getlist0(re.findall('\d+',shiyong_mianji))
    jianzhu_mianji=getlist0(re.findall('建筑面积:<span class="black ">(.*?)</span></dd>',inforTxt))
    jianzhu_mianji=getlist0(re.findall('\d+',jianzhu_mianji))
    years=getlist0(re.findall('<span class="gray6">年<span class="padl27"></span>代:</span>(.*?)</dd>',inforTxt))

    discription=getlist0(re.findall('style="-moz-user-select: none;">([\s\S]*?)<div class="leftBox"',content))
    #print discription
    #print total_price,price_squere,huxin,cankao_shoufu,shiyong_mianji,jianzhu_mianji,years

    return total_price,price_squere,huxin,cankao_shoufu,shiyong_mianji,jianzhu_mianji,years,discription




#get_detail('http://esf.sz.fang.com/chushou/3_193928457.htm')
def getlist0(list):
    if list:
        return list[0]
    else:
        return ''

if __name__=='__main__':
    '''                                       ##这个是单线程,单线程爬很慢,3000个房子信息,一个5秒,那也得15000秒了,很耽误时间
    for i in range(1,101):
        page_url='http://esf.sz.fang.com/house/i3%s'%i
        fang_com(page_url)
    '''
    threads=[]                                            ###这个是演示多线程爬取
    for i in range(1,101):                                             #开了100线程,这样开100线程去爬100页面的详情页面,因为fang.com只能看100页
        t=threading.Thread(target=fang_com,args=('http://esf.sz.fang.com/house/i3%s'%i,))           ###这样做没问题,但如果你是爬取1000页面,也这样做就不合适了,python开多了线程会导致线程创建失败,100线程已经很快了,网速是瓶颈了这时候,我开100线程时候网速是800KB左右的网速,我宽带才4M,运营商还算比较良心了,4M宽带400k

        threads.append(t)

        t.start()

    for t in threads:
        t.join()

    print 'over'

 

多线程不到3分钟就爬取完3000个房源的详细信息了,发下多线程的运行结果:

介绍requests+threading多线程爬虫,提取采用xpath 和正则两种,介绍线程锁