抓取淘宝美食数据
1. 得到所有页的数量
2. 解析页面得到产品信息
3. 实现翻页,得到所有关于美食的数据
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import re
from bs4 import BeautifulSoup
# Do not load images: the scraper only reads image URLs from the markup, so
# image loading is blocked through a Chrome content-setting preference
# (2 = block).
CHROME_PREFS = {"profile.managed_default_content_settings.images": 2}

# NOTE(review): these look like PhantomJS command-line switches. They are
# still forwarded to the driver service exactly as before, but image blocking
# no longer depends on them -- confirm whether chromedriver honours them.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=false']

chrome_options = Options()
chrome_options.add_experimental_option("prefs", CHROME_PREFS)
# Uncomment to run without a visible browser window.
# chrome_options.add_argument("--headless")

driver = webdriver.Chrome(service_args=SERVICE_ARGS, chrome_options=chrome_options)
# Shared explicit wait with a 10-second timeout.
wait = WebDriverWait(driver, 10)
driver.get("https://www.taobao.com/")
driver.set_window_size(width=1500, height=800)
# Step 1: find out how many result pages the search has.
def get_page_num():
    """Search Taobao for "美食" and return the total number of result pages.

    Types the keyword into the search box, submits the search form, reads the
    text of the element with class ``total`` and extracts the first run of
    digits from it. The first result page is then parsed through
    ``get_product_into`` before returning.

    Returns:
        str: The page count as a string of digits (not converted to ``int``).

    Raises:
        ValueError: If the pager text contains no digits.
        selenium.common.exceptions.TimeoutException: If an awaited element
            does not show up within the shared ``wait`` timeout.
    """
    print("搜索>>>美食")
    search_box = wait.until(EC.presence_of_element_located((By.ID, "q")))
    search_box.send_keys("美食")
    submit_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_TSearchForm > div > button"))
    )
    submit_button.click()
    # The click navigates to the result page, so wait for the pager label
    # instead of looking it up immediately.
    total_label = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "total")))
    text = total_label.text
    print(text)
    match = re.search(r"(\d+)", text)
    if match is None:
        raise ValueError("no page count found in pager text: %r" % text)
    page_num1 = match.group(0)
    get_product_into()
    return page_num1
def _with_https(url):
    """Prefix a protocol-relative URL (``//host/path``) with ``https:``.

    Any other value, including an empty string or an already absolute URL,
    is returned unchanged.
    """
    if url.startswith("//"):
        return "https:" + url
    return url


def get_product_into():
    """Parse the currently loaded result page and print every product on it.

    Waits until at least one product card is present, parses
    ``driver.page_source`` with BeautifulSoup and, for each card, prints a
    separator line followed by a dict holding the product's fields.

    Returns:
        list[dict]: One dict per product, with the keys ``image``, ``price``,
        ``location``, ``title``, ``product_link`` and ``shopname``. Callers
        that ignore the return value keep working as before.

    Raises:
        IndexError: If a product card lacks one of the expected elements.
        selenium.common.exceptions.TimeoutException: If no product card
            appears within the shared ``wait`` timeout.
    """
    item_selector = "#mainsrp-itemlist .items .item"
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    soup = BeautifulSoup(driver.page_source, "lxml")

    products = []
    for item in soup.select(item_selector):
        print("*" * 100)
        location = item.select(".location")[0].text.strip()
        price = item.select(".price")[0].text.strip()
        shopname = item.select(".shopname")[0].text.strip()
        title = item.select('a[class="J_ClickStat"]')[0].text.strip()
        product_link = item.select(".J_ClickStat")[0].attrs.get("href", "")

        # The image URL is stored in "data-src" or, failing that, in
        # "data-ks-lazyload"; .get() makes the fallback work when the first
        # attribute is missing as well as when it is empty.
        picture = item.select('.J_ItemPic.img')[0]
        image = picture.attrs.get("data-src") or picture.attrs.get("data-ks-lazyload") or ""

        # Key order is kept stable so the printed dict looks the same.
        item_dict = {
            "image": _with_https(image),
            "price": price,
            "location": location,
            "title": title,
            "product_link": _with_https(product_link),
            "shopname": shopname,
        }
        print(item_dict)
        products.append(item_dict)
    return products
def next_page(page, max_retries=3):
    """Jump to result page ``page`` through the pager form and parse it.

    Fills the pager's page-number input, clicks its submit button and waits
    until the active pager item shows the requested number. A failed attempt
    is printed and retried; once the page is confirmed, its products are
    parsed exactly once with ``get_product_into``.

    Args:
        page: Page number to load.
        max_retries: Maximum number of navigation attempts (at least one
            attempt is always made).

    Raises:
        Exception: Whatever the last navigation attempt raised, once all
            attempts have failed.
    """
    print("当前正在加载第%s页的数据------------------------------------"%page)
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            page_box = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
            page_box.clear()
            page_box.send_keys(str(page))
            submit_button = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            submit_button.click()
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active"), str(page)))
            break
        except Exception as e:
            print(e)
            # Bounded retry: give up with the real error instead of recursing
            # without limit.
            if attempt == attempts:
                raise
    # Parse after the retry loop so a retried page is not scraped twice.
    get_product_into()
def main():
    """Run the whole crawl: search, then walk pages 2 through the last page.

    Any exception is printed rather than propagated, and the browser is
    closed in every case.
    """
    try:
        total_pages = get_page_num()
        print("总页数是:", total_pages)
        last_page = int(total_pages)
        current = 2  # page 1 was already parsed by get_page_num()
        while current <= last_page:
            next_page(current)
            current += 1
    except Exception as err:
        print(err)
    finally:
        driver.quit()


if __name__ == "__main__":
    main()
结果: