Python链家租房信息爬虫和高德地图展示

Python链家租房信息爬虫和高德地图展示

工具:Pycharm,Win10,Python3.6.4,高德API

1.数据爬取

首先我们明确要获取的信息:我们要获取北京的东城、西城、朝阳、海淀、丰台这5个地区的租房信息。打开链家租房网站,选择东城地区的第二页,我们发现网址有如下规律:https://bj.lianjia.com/zufang/<地区拼音>/pg<页码>(例如丰台第2页为 https://bj.lianjia.com/zufang/fengtai/pg2)。

Python链家租房信息爬虫和高德地图展示

下面就是分析页面,以东城为例,我们发现数据直接在源代码中,很简单,直接正则或者xpath获取即可。

import csv
import requests
import re
from lxml import etree
import csv
import urllib3
urllib3.disable_warnings()
# Request headers sent with every HTTP call made by this script: a desktop
# Chrome User-Agent, plus "Connection: close" to ask the server to close the
# connection after each response.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
    'Connection': 'close',
}
def get_index_html(url, retries=3):
    """Download a listing index page and return its HTML text.

    Args:
        url: Address of the index page.
        retries: Maximum number of download attempts before giving up.

    Returns:
        The decoded page source, or None when every attempt failed.
    """
    for _ in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=60)
        except requests.RequestException:
            # Network-level failure: try again until the attempts run out.
            continue
        # Let requests guess the charset from the body so Chinese text
        # decodes correctly even when the HTTP header is missing or wrong.
        response.encoding = response.apparent_encoding
        return response.text
    return None
def parse_index_html(html):
    """Parse one listing index page.

    Args:
        html: Page source of an index page. May be None or empty.

    Returns:
        A tuple ``(advantages, all_data)``.

        advantages: one string per probed listing slot (slots 1-30). Each
            string holds the slot's tag texts, every tag followed by a
            comma; it is empty when the slot has no tags.
        all_data: one list per regex match, with the fields in this order:
            detail URL, description, district, local area, square metres,
            direction, type, floor, price.

        Both lists are empty when ``html`` is None or empty.
    """
    if not html:
        return [], []

    html_xpath = etree.HTML(html)
    advantages = []
    for j in range(1, 31):
        try:
            tags = html_xpath.xpath(
                '//*[@id="content"]/div[1]/div[1]/div[' + str(j)
                + ']/div/p[5]//i/text()')
        except Exception:
            # Best effort: a slot that cannot be queried simply has no tags.
            tags = []
        advantages.append(''.join(tag + ',' for tag in tags))

    infos_pattern = re.compile(
        'twoline">.*?href="(.*?)">'                      # url
        '(.*?)</a>'                                      # describe
        '.*?a target.*?">(.*?)</a>'                      # district
        '.*?target="_blank">(.*?)</a>'                   # local
        '.*?</i>(.*?)㎡'                                  # square
        '.*?</i>(.*?)<i>'                                # direction
        '.*?</i>(.*?)<span'                              # type
        '.*?</i>(.*?)</span>'                            # floor
        '.*?content__list--item-price"><em>(.*?)</em',   # money
        re.S)
    all_data = [list(info) for info in infos_pattern.findall(html)]
    return advantages, all_data

def get_detail_url(url, retries=3):
    """Download a listing detail page and return its HTML text.

    Args:
        url: Address of the detail page.
        retries: Maximum number of download attempts before giving up.

    Returns:
        The decoded page source, or None when every attempt failed.
    """
    for _ in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=60)
        except requests.RequestException:
            # Network-level failure: try again until the attempts run out.
            continue
        # Let requests guess the charset from the body so Chinese text
        # decodes correctly even when the HTTP header is missing or wrong.
        response.encoding = response.apparent_encoding
        return response.text
    return None

def parse_detail_html(html):
    """Extract the location fields embedded in a detail page.

    Args:
        html: Page source of a listing detail page.

    Returns:
        A tuple ``(longitude, latitude, name)`` holding the first match of
        each field in the page, all as strings.

    Raises:
        IndexError: If any of the three fields is not present in ``html``.
    """
    field_patterns = (
        'longitude: \'(.*?)\',',
        'latitude: \'(.*?)\'',
        'g_conf.name = \'(.*?)\';',
    )
    # Run every search first, then take the first hit of each.
    hits = [re.findall(pattern, html, re.S) for pattern in field_patterns]
    longitude, latitude, name = (found[0] for found in hits)
    return longitude, latitude, name


def write2csv(info,local):
    """Append one row to the CSV file named ``<local>.csv``.

    Args:
        info: Sequence of field values forming the row.
        local: File name without the ``.csv`` extension.
    """
    csv_path = local + '.csv'
    # Append mode keeps rows from earlier calls; newline='' leaves line
    # endings to the csv module.
    with open(csv_path, mode='a', encoding='utf-8-sig', newline='') as csv_file:
        csv.writer(csv_file).writerow(info)
if __name__ == '__main__':
    # District slugs as they appear in the listing URL. Only 'fengtai' is
    # crawled here; each district is written to its own "<slug>.csv".
    local_list = ['fengtai']
    for local in local_list:
        # Index pages 1 through 29.
        for page in range(1,30):
            print(local,page)
            url = 'https://bj.lianjia.com/zufang/'+str(local)+'/pg'+str(page)
            html = get_index_html(url)
            if not html:
                # The page could not be downloaded; move on to the next one.
                continue
            advantages,all_data = parse_index_html(html)
            for j, row in enumerate(all_data):
                detail_html = get_detail_url('https://bj.lianjia.com' + row[0])
                try:
                    longitude, latitude, name = parse_detail_html(detail_html)
                except (IndexError, TypeError):
                    # Detail page missing (None) or without coordinates:
                    # skip this listing instead of aborting the crawl.
                    continue
                # Guard against more regex matches than probed tag slots.
                row.append(advantages[j] if j < len(advantages) else '')
                row.append(longitude)
                row.append(latitude)
                # The listing name becomes the second CSV column.
                row.insert(1, name)
                cleaned = [field.strip().replace(' ', '') for field in row]
                write2csv(cleaned, local)

我们获取了5个地区数据之后合并在一起获得如下数据

Python链家租房信息爬虫和高德地图展示

2.数据分析

数据获取之后我们要进行数据分析,主要分析三个方面:一是价格和面积的散点图以及面积的直方图;二是5个地区平均租价(元/平方米)的柱状图;三是房源描述的词云图。

from matplotlib import  pyplot as plt
import pandas as pd
import jieba
import wordcloud
from scipy.misc import imread

# Global plot settings: a font that can render Chinese labels, and a larger
# default font size.
plt.rcParams.update({'font.family': 'SimHei', 'font.size': 15})

# Load the merged listing data.
data = pd.read_csv('all_data.csv', encoding='utf-8-sig')

# Average rent per square metre for each district: column 9 (price) divided
# by column 5 (area), stored as a new 'average' column and averaged per
# value of the 'local' column.
unit_price = data.iloc[:, 9] / data.iloc[:, 5]
data.insert(12, 'average', unit_price)
mean_by_local = data['average'].groupby(data['local']).mean()
print(mean_by_local.keys())
plt.bar(list(mean_by_local.keys()), list(mean_by_local.values))
plt.show()


# Area versus price: a scatter plot on the left and an area histogram on the
# right, drawn side by side in one wide figure.
size = data.iloc[:, 5]
price = data.iloc[:, 9]
fig = plt.figure(figsize=(30, 10))

# Left plot (row 1, column 1): price against area.
scatter_ax = fig.add_subplot(1, 2, 1)
scatter_ax.scatter(size, price)
scatter_ax.set_xlabel('房屋面积')
scatter_ax.set_ylabel('价格')

# Right plot (row 1, column 2): distribution of area in 15 bins.
hist_ax = fig.add_subplot(1, 2, 2)
hist_ax.set_title('面积统计', fontsize=20)
hist_ax.hist(size, bins=15)
hist_ax.set_xlabel('房屋面积')

plt.show()




# Word cloud built from the text in the third CSV column.

# Background image whose shape is used as the word-cloud mask.
# NOTE(review): scipy.misc.imread was removed in SciPy 1.2; on newer SciPy
# versions imread has to be imported from another package (e.g. imageio).
color_mask = imread("123.jpg")

# Characters removed before segmentation. The literal is split in two so the
# embedded double quote and backslash can be escaped properly.
punctuation = (
    "'\n !?。\"#$%&()*+,-/:;<=>@[\\]^_`{|}~"
    "⦅⦆「」、〃》『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
)
strip_table = str.maketrans('', '', punctuation)

# Segment every title on its own. Calling str() on the whole column would
# return pandas' display text (row indices, "..." truncation and a dtype
# footer) rather than the actual titles.
words = []
for title in data.iloc[:, 2].dropna().astype(str):
    words.extend(jieba.lcut(title.translate(strip_table)))
txt = " ".join(words)

# Render the cloud and save it as an image.
a = wordcloud.WordCloud(
    font_path="msyh.ttc",
    width=1000,
    height=700,
    background_color="black",
    mask=color_mask,
)
a.generate(txt)
a.to_file("title.png")

Python链家租房信息爬虫和高德地图展示

Python链家租房信息爬虫和高德地图展示

Python链家租房信息爬虫和高德地图展示

3.高德地图展示

这部分去高德API示例中心找一找就有相应的示例,这里我直接贴出代码。

<html>
    <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta name="viewport" content="initial-scale=1.0, user-scalable=no, width=device-width">
        <title>毕业生租房</title>
        <!-- Stylesheets and helper scripts loaded from the AMap static host. -->
        <link rel="stylesheet" href="http://cache.amap.com/lbs/static/main1119.css" />
        <link rel="stylesheet" href="http://cache.amap.com/lbs/static/jquery.range.css" />
        <script src="http://cache.amap.com/lbs/static/jquery-1.9.1.js"></script>
        <script src="http://cache.amap.com/lbs/static/es5.min.js"></script>
        <!-- AMap JS API v1.3 with the plugins the page script instantiates:
             ArrivalRange, Scale, Geocoder, Transfer and Autocomplete.
             NOTE(review): the API key is embedded in the page source; confirm
             it is meant to be public. -->
        <script src="http://webapi.amap.com/maps?v=1.3&key=22d3816e107f199992666d6412fa0691&plugin=AMap.ArrivalRange,AMap.Scale,AMap.Geocoder,AMap.Transfer,AMap.Autocomplete"></script>
        <script src="http://cache.amap.com/lbs/static/jquery.range.js"></script>
        <style>
        /* Control panel: absolutely positioned near the top-right corner. */
        .control-panel {
            position: absolute;
            top: 30px;
            right: 20px;
        }

        /* One translucent box per control row. */
        .control-entry {
            width: 280px;
            background-color: rgba(119, 136, 153, 0.8);
            font-family: fantasy, sans-serif;
            text-align: left;
            color: white;
            overflow: auto;
            padding: 10px;
            margin-bottom: 10px;
        }

        /* Inputs start after the 120px wide label floated on the left. */
        .control-input {
            margin-left: 120px;
        }

        .control-input input[type="text"] {
            width: 160px;
        }

        /* Labels float left and take the 120px reserved by .control-input. */
        .control-panel label {
            float: left;
            width: 120px;
        }

        /* Transfer result panel: absolutely positioned near the top-left
           corner, scrolling vertically once it exceeds 80% of the height. */
        #transfer-panel {
            position: absolute;
            background-color: white;
            max-height: 80%;
            overflow-y: auto;
            top: 30px;
            left: 20px;
            width: 250px;
        }
        </style>
    </head>

    <body>
        <!-- Map canvas: the page script creates the AMap.Map on this id. -->
        <div id="container"></div>
        <div class="control-panel">
            <div class="control-entry">
                <label>选择工作地点:</label>
                <div class="control-input">
                    <!-- The script binds AMap.Autocomplete to this input id. -->
                    <input id="work-location" type="text">
                </div>
            </div>
            <div class="control-entry">
                <label>选择通勤方式:</label>
                <div class="control-input">
                    <!-- Each radio's value is stored by its handler as the
                         policy used for the arrival-range search. -->
                    <input type="radio" name="vehicle" value="SUBWAY,BUS" onClick="takeBus(this)" checked/> 公交+地铁
                    <input type="radio" name="vehicle" value="SUBWAY" onClick="takeSubway(this)" /> 地铁
                </div>
            </div>
            <div class="control-entry">
                <label>导入房源文件:</label>
                <div class="control-input">
                    <!-- Only the chosen file's name is used: the script
                         requests that name with $.get. -->
                    <input type="file" name="file" onChange="importRentInfo(this)" />
                </div>
            </div>
        </div>
        <!-- Route results are rendered here (passed as the Transfer panel). -->
        <div id="transfer-panel"></div>
        <script>
        // Map instance attached to the #container element.
        var map = new AMap.Map("container", {
            resizeEnable: true,
            zoomEnable: true,
            center: [116.397428, 39.90923], // initial centre as [lng, lat]
            zoom: 11
        });

        // Scale control shown on the map.
        var scale = new AMap.Scale();
        map.addControl(scale);

        // Service that computes the area reachable from a point.
        var arrivalRange = new AMap.ArrivalRange();

        // Shared state used by the functions below.
        var x;                       // longitude of the work location
        var y;                       // latitude of the work location
        var t;
        var vehicle = "SUBWAY,BUS";  // commute policy chosen via the radios
        var workAddress;             // name of the selected work location
        var workMarker;              // marker drawn for the work location
        var rentMarkerArray = [];    // markers of the imported listings
        var polygonArray = [];       // polygons of the reachable area
        var amapTransfer;            // current route search, if any

        // Popup opened when a listing marker is clicked.
        var infoWindow = new AMap.InfoWindow({
            offset: new AMap.Pixel(0, -30)
        });

        // Autocomplete on the work-location input; a selection triggers
        // workLocationSelected.
        var auto = new AMap.Autocomplete({
            input: "work-location"
        });
        AMap.event.addListener(auto, "select", workLocationSelected);


/**
 * Click handler of the "bus + subway" radio button: stores the radio's
 * value as the commute policy and reloads the work location.
 */
function takeBus(radio) {
    var selectedPolicy = radio.value;
    vehicle = selectedPolicy;
    loadWorkLocation();
}

/**
 * Click handler of the "subway" radio button: stores the radio's value as
 * the commute policy and reloads the work location.
 */
function takeSubway(radio) {
    var selectedPolicy = radio.value;
    vehicle = selectedPolicy;
    loadWorkLocation();
}

/**
 * Change handler of the file input: loads the listings from the chosen file.
 *
 * Only the file's name is used; the loader requests that name with $.get,
 * so the file must be reachable at that relative URL.
 *
 * @param {HTMLInputElement} fileInfo - The file input element.
 */
function importRentInfo(fileInfo) {
    var files = fileInfo.files;
    // No file is selected when the dialog is cancelled or the input cleared.
    if (!files || files.length === 0) {
        return;
    }
    loadRentLocationByFile(files[0].name);
}

/**
 * Handler of the autocomplete "select" event: remembers the name of the
 * chosen place as the work address and reloads the work location.
 */
function workLocationSelected(e) {
    var chosenPlace = e.poi;
    workAddress = chosenPlace.name;
    loadWorkLocation();
}

/**
 * Puts the red work-location marker on the map and keeps it in workMarker.
 *
 * @param x - Longitude of the marker.
 * @param y - Latitude of the marker.
 * @param locationName - Text used as the marker title.
 */
function loadWorkMarker(x, y, locationName) {
    var markerOptions = {
        map: map,
        title: locationName,
        icon: 'http://webapi.amap.com/theme/v1.3/markers/n/mark_r.png',
        position: [x, y]
    };
    workMarker = new AMap.Marker(markerOptions);
}


/**
 * Draws the area reachable from a point as polygons on the map.
 *
 * @param x - Longitude of the start point.
 * @param y - Latitude of the start point.
 * @param t - Second argument of arrivalRange.search (60 at the call site).
 * @param color - Fill and stroke colour of the polygons.
 * @param v - Policy passed to the search.
 */
function loadWorkRange(x, y, t, color, v) {
    var searchOptions = {
        policy: v
    };

    function drawBounds(status, result) {
        if (!result.bounds) {
            return;
        }
        for (var i = 0; i < result.bounds.length; i++) {
            var polygon = new AMap.Polygon({
                map: map,
                fillColor: color,
                fillOpacity: "0.4",
                strokeColor: color,
                strokeOpacity: "0.8",
                strokeWeight: 1
            });
            polygon.setPath(result.bounds[i]);
            // Kept so delWorkLocation can remove the polygons later.
            polygonArray.push(polygon);
        }
    }

    arrivalRange.search([x, y], t, drawBounds, searchOptions);
}

/**
 * Geocodes a listing address and adds a clickable blue marker for it.
 *
 * Clicking the marker opens the info window and searches a transit route
 * from the work address to the listing, rendered in #transfer-panel.
 *
 * @param {string} address - Listing address or name; empty values are ignored.
 */
function addMarkerByAddress(address) {
    // Nothing to geocode for blank CSV lines or missing columns.
    if (!address) {
        return;
    }
    var geocoder = new AMap.Geocoder({
        city: "北京",
        radius: 1000
    });
    geocoder.getLocation(address, function(status, result) {
        if (status !== "complete" || result.info !== 'OK') {
            return;
        }
        var geocode = result.geocodes[0];
        // Declared locally; the original leaked it as an implicit global.
        var rentMarker = new AMap.Marker({
            map: map,
            title: address,
            icon: 'http://webapi.amap.com/theme/v1.3/markers/n/mark_b.png',
            position: [geocode.location.getLng(), geocode.location.getLat()]
        });
        rentMarkerArray.push(rentMarker);

        // NOTE(review): the link appends the address itself to the site
        // root; confirm this is the intended target for the listing.
        rentMarker.content = "<div>房源:<a target = '_blank' href='https://bj.lianjia.com" + address + "'>" + address + "</a></div>";
        rentMarker.on('click', function(e) {
            infoWindow.setContent(e.target.content);
            infoWindow.open(map, e.target.getPosition());
            // Drop the previous route before searching a new one.
            if (amapTransfer) amapTransfer.clear();
            amapTransfer = new AMap.Transfer({
                map: map,
                policy: AMap.TransferPolicy.LEAST_TIME,
                city: "北京市",
                panel: 'transfer-panel'
            });
            amapTransfer.search([{
                keyword: workAddress
            }, {
                keyword: address
            }], function(status, result) {});
        });
    });
}

/**
 * Removes the reachable-area polygons and the work marker from the map,
 * then resets the polygon list.
 */
function delWorkLocation() {
    if (polygonArray) {
        map.remove(polygonArray);
    }
    if (workMarker) {
        map.remove(workMarker);
    }
    polygonArray = [];
}

/**
 * Removes every listing marker from the map and resets the marker list.
 */
function delRentLocation() {
    if (rentMarkerArray) {
        map.remove(rentMarkerArray);
    }
    rentMarkerArray = [];
}

/**
 * Redraws the work location: clears the previous marker and polygons,
 * geocodes workAddress, then draws the marker and the reachable area for
 * the current commute policy and centres the map on it.
 */
function loadWorkLocation() {
    delWorkLocation();
    // A commute radio can be clicked before any address has been chosen.
    if (!workAddress) {
        return;
    }
    var geocoder = new AMap.Geocoder({
        city: "北京",
        radius: 1000
    });

    geocoder.getLocation(workAddress, function(status, result) {
        if (status === "complete" && result.info === 'OK') {
            var geocode = result.geocodes[0];
            x = geocode.location.getLng();
            y = geocode.location.getLat();
            // Pass the address so the marker gets a title.
            loadWorkMarker(x, y, workAddress);
            loadWorkRange(x, y, 60, "#3f67a5", vehicle);
            map.setZoomAndCenter(12, [x, y]);
        }
    });
}

/**
 * Loads listings from a CSV file and adds one marker per distinct value of
 * the second column.
 *
 * @param {string} fileName - URL of the CSV file, requested with $.get.
 */
function loadRentLocationByFile(fileName) {
    delRentLocation();
    var rent_locations = new Set();
    $.get(fileName, function(data) {
        // Accept both "\n" and "\r\n" line endings.
        data.split(/\r?\n/).forEach(function(line) {
            // NOTE(review): a plain comma split does not understand quoted
            // CSV fields; confirm the first two columns never hold commas.
            var location = line.split(",")[1];
            // Skip blank lines and rows without a second column.
            if (location && location.trim()) {
                rent_locations.add(location.trim());
            }
        });
        rent_locations.forEach(function(element) {
            addMarkerByAddress(element);
        });
    });
}

        </script>
    </body>
</html>

Python链家租房信息爬虫和高德地图展示