爬虫:通过百度map的api爬取地图上搜索结果的信息
爬取雁塔区的宾馆信息
准备工作:
1.新建city.txt 文件,里面保存城市或者区域的名称(必须是百度地图上可以搜索到的),多个名称之间用空格分隔,文件使用GBK编码保存
2.以下内容保存在.py文件中,city.txt与.py文件保存在同一目录下
# -*- coding: utf-8 -*-
"""
Created on Sat May 26 17:31:46 2018
@author: yuehua
"""
# 测试AK=vOqsgq4tHrHDQ23C6lAWb0hNkYeHOyvk
#ak码需要在百度注册后获取
#请求地址:http://api.map.baidu.com/place/v2/search?q=星巴克&region=西安&output=json&ak=8BB7F0E5C9C77BD6B9B655DB928B74B6E2D838FA
import requests
import json
import re
import time
def getjson(address_name, city, page_size, page_num,
            ak='vOqsgq4tHrHDQ23C6lAWb0hNkYeHOyvk', timeout=30):
    """Query the Baidu Map Place search endpoint and return the decoded JSON.

    Args:
        address_name: Search keyword, sent as the ``q`` parameter.
        city: Region to search in, sent as the ``region`` parameter.
            Surrounding whitespace (e.g. a trailing newline read from the
            city file) is stripped before sending.
        page_size: Number of results per page, sent as ``page_size``.
        page_num: Page index, sent as ``page_num``.
        ak: API key sent as ``ak``. Defaults to the key that was previously
            hard-coded in the URL, so existing callers are unaffected.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the script forever.

    Returns:
        The object produced by ``json.loads`` on the response body.

    Raises:
        requests.exceptions.RequestException: On connection errors/timeouts.
        ValueError: If the response body is not valid JSON.
    """
    # The original URL contained the literal text '®ion=' ('&reg' had been
    # turned into the registered-trademark sign), so the region was glued
    # onto the ``q`` value and never sent as its own parameter.
    # Passing a dict lets requests percent-encode every value correctly.
    params = {
        'q': address_name,
        'region': city.strip(),
        'page_size': page_size,
        'page_num': page_num,
        'output': 'json',
        'ak': ak,
    }
    r = requests.get('http://api.map.baidu.com/place/v2/search',
                     params=params, timeout=timeout)
    print ("*****获取结束,解析数据中****")
    return json.loads(r.text)
def write_to_file(address_name, city_name, page_num, result=None):
    """Append one page of search results to ``<city_name>_<address_name>.txt``.

    Each record becomes one line; every field is followed by a single space
    (the layout the original script produced). The header row is written
    only when the output file is still empty, so it is no longer repeated
    for every page appended to the same file.

    Args:
        address_name: Search keyword; used in the output file name.
        city_name: City/region name; used as the output file name prefix.
        page_num: Accepted for backward compatibility; not used.
        result: Decoded API response (a dict with a ``"results"`` list).
            When omitted, the module-level ``result_json`` set by the main
            script is used, which is what the original function always did.
    """
    if result is None:
        # Backward-compatible fallback to the global written by the main loop.
        result = result_json
    # A response without "results" is treated as an empty page instead of
    # raising KeyError.
    results = result.get("results") or []
    title = ["name","location","address","province","city","area","street_id","telephone","detail","uid"]
    add_file_name = city_name + "_" + address_name + ".txt"  # output file name
    # NOTE(review): the platform default encoding is kept so that files
    # produced by earlier runs stay consistent; consider an explicit encoding.
    with open(add_file_name, "a") as f:
        f.seek(0, 2)  # make sure the position reflects the current file size
        if f.tell() == 0:
            f.write(" ".join(title) + " \n")
        for record in results:
            cells = [_format_field(record.get(key_name)) for key_name in title]
            f.write(" ".join(cells) + " \n")


def _format_field(value):
    """Render one result field as text; a missing/null value becomes "NULL"."""
    if value is None:
        return "NULL"
    if isinstance(value, dict):
        # e.g. a nested coordinate object: emit its values comma-separated.
        # Previously f.write() raised TypeError on such values and the bare
        # except silently replaced them with "NULL".
        return ",".join(str(part) for part in value.values())
    return str(value)
# Read the city names from a file
def get_city_name(file, encoding='GBK'):
    """Return the whitespace-separated city/region names stored in ``file``.

    Args:
        file: Path of the text file holding the names; a line may contain
            several names.
        encoding: Text encoding of the file. Defaults to ``'GBK'``, the
            value that was previously hard-coded.

    Returns:
        A list of names in file order. Names carry no trailing newline and
        blank entries are dropped (the earlier ``split(' ')`` kept the
        newline on the last name of each line and produced empty strings
        for consecutive spaces).
    """
    city_name_list = []
    with open(file, 'r', encoding=encoding) as txt_file:
        for eachline in txt_file:
            # split() with no argument splits on any run of whitespace and
            # never yields empty strings, so blank lines contribute nothing.
            city_name_list.extend(eachline.split())
    return city_name_list
def get_city_name2(file):
    """Read ``file`` and split its whole content with ``re.split('', ...)``.

    Args:
        file: Path of the text file to read (opened with the platform
            default encoding, as before).

    Returns:
        The list produced by ``re.split('', content)``. On Python 3.7+ an
        empty pattern matches between every character, so this is the
        individual characters with an empty string at each end.
    """
    # Use a context manager: the original opened the file and never closed it.
    with open(file) as f:
        s = f.read()
    # NOTE(review): the empty pattern looks like a delimiter that was lost
    # when the code was pasted; confirm the intended separator. Behaviour is
    # kept unchanged here.
    s1 = re.split('', s)  # split using a regular expression
    return s1
#------------------------User-configurable settings------------------------------------------------
address_name="宾馆"      # what to search for
city_name_list=[]        # holds every city name read from the file
page_size="30"           # number of results requested per page
page_num_max=30          # maximum number of pages requested per city
file="city.txt"          # text file that lists the cities
#------------------------Main program------------------------------------------------
city_name_list=get_city_name(file)
for raw_city_name in city_name_list:
    # Strip once and use the clean name everywhere. Previously the raw name
    # (possibly ending in a newline) was sent to the API and only the
    # display/file name was stripped.
    city_name=raw_city_name.strip()
    if not city_name:
        continue  # skip blank entries instead of querying an empty region
    for page_num in range(0,page_num_max):
        print ("------------城市名:"+city_name+"--"+address_name+'---第'+str(page_num+1)+"页数据下载数据中------")
        # write_to_file reads this module-level name, so it must stay global.
        result_json=getjson(address_name,city_name,page_size,str(page_num))
        has_results=isinstance(result_json,dict) and bool(result_json.get("results"))
        if has_results:
            write_to_file(address_name,city_name,str(page_num+1))
        time.sleep(3)  # pause between requests, as before
        if not has_results:
            # An empty or error response means there is nothing more to
            # fetch for this city; stop paging rather than spending the
            # remaining requests on empty pages.
            print ("------------城市名:"+city_name+"--没有更多数据,停止翻页------")
            break
print ("数据查询完毕end******")
运行过程:
保存结果: