使用selenium爬取餐厅信息
一 工具及平台介绍
使用python语言爬取
使用BeautifulSoup解析
爬取餐厅信息——大众点评某个地区的餐厅列表
导出到CSV文件中
使用谷歌浏览器
二 代码主要部分解析
1.使用的库:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import csv
2.根据url获取到网页的函数
def get_page(myurl):
    """Load *myurl* in a Selenium-driven Chrome and return the page parsed as a BeautifulSoup tree.

    Images and JavaScript are disabled via Chrome content-setting prefs to
    speed up page loads during bulk crawling.
    """
    caps = webdriver.DesiredCapabilities().CHROME
    caps["marionette"] = False
    # Content-setting value 2 == "block" for both images and JavaScript.
    options = webdriver.ChromeOptions()
    prefers = {
        'profile.default_content_setting_values': {
            'images': 2,
            'javascript': 2
        }
    }
    options.add_experimental_option('prefs', prefers)
    browser = webdriver.Chrome(chrome_options=options, desired_capabilities=caps)
    try:
        browser.get(myurl)
        # Parse while the browser is still alive, then always release it.
        soup = BeautifulSoup(browser.page_source, "lxml")
    finally:
        # Fix: the driver was never closed, leaking one Chrome process per
        # call (the crawl loop calls this 29 times).
        browser.quit()
    return soup
3.解析获取网页的内容并存到csv文件中
def get_infors(soup):
    """Extract name, link, review count, price and address from one listing page.

    Appends the scraped values to the module-level accumulator lists
    (name_list, link_list, comment_list, price_list, addr_list) and extends
    output_list with one [name, link, comments, price, address] row per
    restaurant found on this page. Returns output_list (cumulative).
    """
    # Remember how many rows we had before this page, so the output rows
    # below cover exactly this page's restaurants. (The original used
    # range(1, len(title_list)), which skipped the first restaurant and,
    # on every page after the first, re-emitted rows from earlier pages.)
    start = len(name_list)
    title_list = soup.find_all('div', class_='tit')
    for item in title_list:
        name_list.append(item.find('h4').text)
        link_list.append(item.find('a')['href'])
    for it in soup.find_all('div', class_='comment'):
        # Some restaurants have no reviews, so the review-num anchor is
        # missing and .find() returns None -> AttributeError.
        try:
            comment = it.find('a', class_='review-num').b.text
        except AttributeError:
            comment = ''
        comment_list.append(comment)
        # Likewise, the average price is not always displayed.
        try:
            price = it.find('a', class_='mean-price').b.text
        except AttributeError:
            price = ''
        price_list.append(price)
    for addr in soup.find_all('div', class_='tag-addr'):
        addr_list.append(addr.find('span', class_='addr').text)
    for i in range(start, len(name_list)):
        output_list.append([name_list[i], link_list[i], comment_list[i], price_list[i], addr_list[i]])
    return output_list
三 网页完整代码
from selenium import webdriver
from bs4 import BeautifulSoup
from pymongo import MongoClient
import time
import csv
# Base listing URL: Dianping food category (ch10) for Shenzhen.
url = "http://www.dianping.com/shenzhen/ch10"
link_list = []# restaurant detail-page links
name_list = []# restaurant names
addr_list = []# restaurant addresses
comment_list = []# review counts
price_list = []# average price per person
output_list = []# accumulated rows: [name, link, comments, price, address]
#获取餐厅网页
def get_page(myurl):
caps = webdriver.DesiredCapabilities().CHROME
caps["marionette"] = False
options = webdriver.ChromeOptions()
prefers = {
'profile.default_content_setting_values': {
'images': 2,
'javascript': 2
}
}
options.add_experimental_option('prefs', prefers)
browser = webdriver.Chrome(chrome_options=options,desired_capabilities=caps)
browser.get(myurl)
soup = BeautifulSoup(browser.page_source, "lxml")
return soup
# 爬取餐厅的信息
def get_infors(soup):
title_list = soup.find_all('div', class_='tit')
for item in title_list:
title = item.find('h4').text
link = item.find('a')['href']
name_list.append(title)
link_list.append(link)
infor_list = soup.find_all('div', class_='comment')
for it in infor_list:
try:
comment = it.find('a', class_='review-num').b.text
except:
comment = ''
comment_list.append(comment)
try:
price = it.find('a',class_='mean-price').b.text
except:
price = ''
price_list.append(price)
for addr in (soup.find_all('div', class_='tag-addr')):
addr_list.append(addr.find('span', class_='addr').text)
for i in range(1, len(title_list)):
output_list.append([name_list[i], link_list[i], comment_list[i],price_list[i], addr_list[i]])
return output_list
# Crawl listing pages 1-29 and flush each page's rows to the CSV as it is
# scraped. (Note: range(1, 30) stops at page 29, despite the original
# comment saying 30; kept as-is to preserve behavior.)
rows_written = 0  # rows of output_list already flushed to the CSV
for i in range(1, 30):
    # Page 1 has no suffix; later pages are url/p2, url/p3, ...
    if i == 1:
        url_link = url
    else:
        url_link = url + '/p' + str(i)
    mysoup = get_page(url_link)
    output_list = get_infors(mysoup)
    # output_list is cumulative across pages; take only this page's rows.
    # Fix: the original wrote the WHOLE cumulative list in append mode on
    # every iteration, duplicating page 1 twenty-nine times, page 2
    # twenty-eight times, and so on.
    new_rows = output_list[rows_written:]
    print(new_rows)
    # CSV path is hard-coded; change as needed. utf-8-sig keeps Chinese
    # text intact and readable in Excel on Windows.
    with open('F://restaurant.csv', 'a+', newline='', encoding='utf-8-sig') as csvfile:
        spamwriter = csv.writer(csvfile, dialect='excel')
        spamwriter.writerows(new_rows)
    rows_written = len(output_list)
    print("successfully")
    time.sleep(3)  # throttle requests to avoid being blocked
结果
表格头部是爬取后自己加进去的,便于理解