Analysis and Mining of Second-Hand Housing Data (1) -- Data Collection
Experiment Environment
Windows 7
Python 3.6
PyCharm 2018
Experiment Content
Write a crawler that scrapes second-hand housing listings for Zhengzhou from Fang.com (房天下).
Each listing is captured along sixteen dimensions: layout (户型), floor area (建筑面积), unit price (单价), total price (总价), orientation (朝向), floor (楼层), decoration (装修), community (小区), district (区域), year built (建筑年代), elevator availability (有无电梯), property rights type (产权性质), residence type (住宅类型), building structure (建筑结构), building type (建筑类型), and listing date (挂牌时间).
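For reference, the crawler below writes these fields in the fixed order shown here. This is a documentation sketch of the output column layout; the English names are informal glosses, not identifiers from the original code.

FIELDS = [
    "direction",     # 朝向
    "housetype",     # 户型
    "floor",         # 楼层
    "conarea",       # 建筑面积
    "unitprice",     # 单价
    "decoration",    # 装修
    "totalprice",    # 总价
    "xiaoqu",        # 小区
    "region",        # 区域
    "build_age",     # 建筑年代
    "lift",          # 有无电梯
    "chanquan",      # 产权性质
    "zhuzhai_type",  # 住宅类别
    "build_struct",  # 建筑结构
    "build_type",    # 建筑类别
    "guapai_time",   # 挂牌时间
]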
Experiment Steps
1. Analyzing the page structure
Open the Fang.com second-hand housing page for Zhengzhou; by default it lists every second-hand home in the city. The listing is paginated: result page n lives at http://esf.zz.fang.com/house/i3{n}/, and each entry on a result page links to a detail page that carries the sixteen fields above.
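As a quick sanity check of that URL pattern (a minimal sketch: the i3{n} suffix is taken from the crawler in the next step, and the User-Agent value is an arbitrary browser string):

import requests

# Fetch result page 1 of the Zhengzhou listings and confirm it responds.
url = "http://esf.zz.fang.com/house/i31/"
header = {"user-agent": "Mozilla/5.0"}
resp = requests.get(url, headers=header, allow_redirects=False)
print(resp.status_code, len(resp.text))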
2. Crawler source code
The crawler has three layers: download_html walks the 100 result pages, parse_html extracts each listing's detail link from a result page, and parse_html_detail scrapes the sixteen fields from a detail page and appends them to a txt file via write_to_file.
# -*- coding:utf8 -*-
__author__ = 'liulei'
import re

import requests
from bs4 import BeautifulSoup


def write_to_file(oneinfo):
    # Append one 16-field record to the output file, comma-separated.
    with open("d://data.txt", "a", encoding="utf8") as f:
        f.write(", ".join(oneinfo) + "\n")
def parse_html_detail(html):
    # Parse one listing detail page and append a 16-field record to the file.
    detail_info = []
    soupDetail = BeautifulSoup(html.text, "html.parser")

    # Orientation (朝向) and layout (户型) share the "trl-item1 w146" blocks.
    direction = "null"
    housetype = "null"
    for item in soupDetail.find_all("div", attrs={"class": "trl-item1 w146"}):
        labels = item.find_all("div", attrs={"class": "font14"})
        values = item.find_all("div", attrs={"class": "tt"})
        if labels and values:
            if "朝向" in labels[0].get_text():
                direction = values[0].get_text()
            elif "户型" in labels[0].get_text():
                housetype = values[0].get_text().strip()
    detail_info.append(direction)
    detail_info.append(housetype)

    # Floor (楼层) and floor area (建筑面积) share the "trl-item1 w182" blocks.
    floor = "null"
    conarea = "null"
    for item in soupDetail.find_all("div", attrs={"class": "trl-item1 w182"}):
        labels = item.find_all("div", attrs={"class": "font14"})
        values = item.find_all("div", attrs={"class": "tt"})
        if labels and values:
            if "楼层" in labels[0].get_text():
                floor = values[0].get_text()
            elif "建筑面积" in labels[0].get_text():
                nums = re.findall(r"\d+", values[0].get_text())
                if nums:
                    conarea = nums[0]
    detail_info.append(floor)
    detail_info.append(conarea)

    # Unit price (单价) and decoration (装修) share the "trl-item1 w132" blocks.
    unitprice = "null"
    decoration = "null"
    for item in soupDetail.find_all("div", attrs={"class": "trl-item1 w132"}):
        labels = item.find_all("div", attrs={"class": "font14"})
        values = item.find_all("div", attrs={"class": "tt"})
        if labels and values:
            if "装修" in labels[0].get_text():
                decorate1 = values[0].get_text()
                if "暂无" not in decorate1:  # skip "not available"
                    decoration = decorate1
            elif "单价" in labels[0].get_text():
                nums = re.findall(r"\d+", values[0].get_text())
                if nums:
                    unitprice = nums[0]
    detail_info.append(unitprice)
    detail_info.append(decoration)

    # Total price (总价) sits in the "trl-item_top" block.
    totalprice = "null"
    total = soupDetail.find_all("div", attrs={"class": "trl-item_top"})
    if total:
        nums = re.findall(r"\d+", total[0].get_text())
        if nums:
            totalprice = nums[0]
    detail_info.append(totalprice)

    # Community (小区) and district (区域) share the "trl-item2 clearfix" blocks.
    xiaoQu = "null"
    region = "null"
    for item in soupDetail.find_all("div", attrs={"class": "trl-item2 clearfix"}):
        leibie = item.find_all("div", attrs={"class": "lab"})
        if leibie:
            if "小" in leibie[0].get_text():
                xiaoQu1 = item.find_all("a", attrs={"class": "blue"})
                if len(xiaoQu1) == 1:
                    xiaoQu = xiaoQu1[0].get_text()
            elif "域" in leibie[0].get_text():
                region1 = item.find_all("a")
                if region1:
                    region = " ".join(a.get_text().strip() for a in region1)
    detail_info.append(xiaoQu)
    detail_info.append(region)

    # The remaining fields live in "text-item clearfix" label/value span pairs.
    buildAge1 = "null"
    lift1 = "null"
    chanQuan1 = "null"
    zhuzaiLeiBie1 = "null"
    buildStruct1 = "null"
    buildLeiBie1 = "null"
    guaPaiTime1 = "null"
    for item in soupDetail.find_all("div", attrs={"class": "text-item clearfix"}):
        labs = item.find_all("span", attrs={"class": "lab"})
        rconts = item.find_all("span", attrs={"class": "rcont"})
        if not labs or not rconts:
            continue
        lab = labs[0].get_text()
        value = rconts[0].get_text()
        if "建筑年代" in lab:        # year built
            nums = re.findall(r"\d+", value)
            if nums:
                buildAge1 = nums[0]
        elif "有无电梯" in lab:      # elevator availability
            lift1 = value.strip()
        elif "产权性质" in lab:      # property rights type
            chanQuan1 = value
        elif "住宅类别" in lab:      # residence type
            zhuzaiLeiBie1 = value
        elif "建筑结构" in lab:      # building structure
            buildStruct1 = value
        elif "建筑类别" in lab:      # building type
            buildLeiBie1 = value
        elif "挂牌时间" in lab:      # listing date
            guaPaiTime1 = value.strip()
    detail_info.extend([buildAge1, lift1, chanQuan1, zhuzaiLeiBie1,
                        buildStruct1, buildLeiBie1, guaPaiTime1])
    print(detail_info)
    write_to_file(detail_info)
def parse_html(html, i, header):
    # Walk one result page and fetch every listing's detail page.
    soup = BeautifulSoup(html, "html.parser")
    for li in soup.find_all("dd", attrs={"class": "info rel floatr"}):
        detailHref = ""
        p = li.find_all("p", attrs={"class": "title"})
        if p:
            a = p[0].find_all("a")
            if a:
                detailHref = a[0]['href']
        if detailHref != "":
            detailHref = "http://esf.zz.fang.com" + detailHref
            detail = requests.request("GET", detailHref, headers=header,
                                      allow_redirects=False)
            parse_html_detail(detail)
def download_html():
    # Crawl result pages 1-100; page n is at /house/i3{n}/.
    base_url = "http://esf.zz.fang.com/house/i3"
    header = {'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4620.400 QQBrowser/9.7.13014.400'}
    for i in range(1, 101):
        url = base_url + str(i) + "/"
        try:
            html = requests.request("GET", url, headers=header,
                                    allow_redirects=False)
            parse_html(html.text, i, header)
            print(url + " scraped.")
        except Exception as err:
            print("Error: " + str(err))
            print(url)


if __name__ == '__main__':
    download_html()
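The loop above fires one request per result page plus one per listing with no pause in between. If the site starts rejecting requests, a small delay before each fetch may help. This is a hedged sketch: polite_get and the one-second default are my own additions, not part of the original crawler.

import time
import requests


def polite_get(url, headers, delay=1.0):
    # Sleep briefly before each request to reduce the chance of being blocked.
    time.sleep(delay)  # 1 s is an arbitrary starting point; tune as needed
    return requests.get(url, headers=headers, allow_redirects=False)

Swapping this in for the requests.request calls in download_html and parse_html leaves the rest of the code unchanged.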
3. The collected data
The crawler appends each scraped record directly to a txt file, one listing per line.
The raw data can then be imported into Excel for inspection.
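One way to do that import programmatically (a minimal sketch, assuming pandas and openpyxl are installed; the column names are the informal glosses listed earlier):

import pandas as pd

# The crawler writes comma-plus-space separated values, one listing per line.
columns = ["direction", "housetype", "floor", "conarea", "unitprice",
           "decoration", "totalprice", "xiaoqu", "region", "build_age",
           "lift", "chanquan", "zhuzhai_type", "build_struct",
           "build_type", "guapai_time"]
df = pd.read_csv("d://data.txt", sep=", ", engine="python", names=columns)
df.to_excel("d://data.xlsx", index=False)  # to_excel requires openpyxl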
txt data download: https://download.****.net/download/qq_35809147/11176115
Excel data download: https://download.****.net/download/qq_35809147/11176115