1. 得到所有页的数量

2. 解析页面得到产品信息

3. 实现翻页，得到所有关于美食的数据

from selenium import webdriver

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.by import By

from selenium.webdriver.chrome.options import Options

import re

from bs4 import BeautifulSoup

# Do not load images.
# NOTE(review): these two flags look like PhantomJS command-line options;
# confirm that ChromeDriver actually honours them when passed as service_args.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=false']

# Headless mode (currently disabled):
# chrome_options=Options()
# chrome_options.add_argument("--headless")

# Start Chrome with the service arguments above; this runs at import time.
driver=webdriver.Chrome(service_args=SERVICE_ARGS)
# driver=webdriver.Chrome(service_args=SERVICE_ARGS,chrome_options=chrome_options)

# Shared explicit wait: every wait.until(...) below uses this 10-second timeout.
wait=WebDriverWait(driver,10)

# Open the Taobao home page, then set the browser window size.
driver.get("https://www.taobao.com/")
driver.set_window_size(width=1500,height=800)

# Step 1: get the total number of result pages for the search keyword.

def get_page_num():
    """Search Taobao for "美食" and return the total number of result pages.

    Types the keyword into the search box (id ``q``), submits the search
    form, reads the text of the element with class ``total`` and extracts the
    first run of digits from it.  The first results page is on screen at that
    point, so its products are parsed here through ``get_product_into()``.

    Returns:
        str: The page count as a digit string (callers convert with ``int``).

    Raises:
        ValueError: If the ``total`` text contains no digits.
        selenium.common.exceptions.TimeoutException: If the search box or the
            ``total`` element does not appear within the shared wait timeout.
    """
    print("搜索>>>美食")
    search_box = wait.until(EC.presence_of_element_located((By.ID, "q")))
    search_box.send_keys("美食")
    driver.find_element(By.CSS_SELECTOR, "#J_TSearchForm > div > button").click()

    # The results page is loaded after the click; wait for the pager summary
    # instead of looking it up immediately, which races with the page load.
    total = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "total")))
    text = total.text
    print(text)

    match = re.search(r"(\d+)", text)
    if match is None:
        # Fail with a message that names the problem rather than an
        # AttributeError on None.
        raise ValueError("no page count found in pager text: %r" % text)
    page_num1 = match.group(0)

    get_product_into()
    return page_num1

def get_product_into():
    """Parse and print the products on the currently loaded results page.

    Waits until at least one product card is present, parses
    ``driver.page_source`` with BeautifulSoup and, for every card, builds and
    prints a dict with the keys ``image``, ``price``, ``location``,
    ``title``, ``product_link`` and ``shopname``.

    Returns:
        list[dict]: One dict per product card, in page order.  Fields that a
        card does not provide are returned as empty strings.

    Raises:
        selenium.common.exceptions.TimeoutException: If no product card
            appears within the shared wait timeout.
    """
    item_selector = "#mainsrp-itemlist .items .item"
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    soup = BeautifulSoup(driver.page_source, "lxml")

    def first_text(node, selector):
        # A card may lack a field; report "" instead of aborting the page.
        found = node.select_one(selector)
        return found.text.strip() if found is not None else ""

    def with_scheme(url):
        # Only protocol-relative URLs ("//host/path") need a scheme added;
        # prefixing an already absolute URL would corrupt it.
        return "https:" + url if url.startswith("//") else url

    products = []
    for item in soup.select(item_selector):
        print("*" * 100)

        link = item.select_one(".J_ClickStat")
        href = link.get("href", "") if link is not None else ""

        # Fall back to data-ks-lazyload when data-src is absent or empty.
        picture = item.select_one(".J_ItemPic.img")
        image = ""
        if picture is not None:
            image = picture.get("data-src") or picture.get("data-ks-lazyload") or ""

        # Key order matches the order in which the dict is printed.
        item_dict = {
            "image": with_scheme(image),
            "price": first_text(item, ".price"),
            "location": first_text(item, ".location"),
            "title": first_text(item, 'a[class="J_ClickStat"]'),
            "product_link": with_scheme(href),
            "shopname": first_text(item, ".shopname"),
        }
        print(item_dict)
        products.append(item_dict)
    return products

def next_page(page, max_retries=3):
    """Jump to results page ``page`` and parse its products.

    Types the page number into the pager's jump box, clicks the submit
    button and waits until the pager's active item shows that number, then
    calls ``get_product_into()`` once for the loaded page.

    Args:
        page: Page number to jump to (int or digit string).
        max_retries: Maximum number of attempts before giving up; values
            below 1 are treated as 1.

    Raises:
        selenium.common.exceptions.WebDriverException: Re-raised from the
            last attempt when every attempt failed.
    """
    from selenium.common.exceptions import WebDriverException

    pager = "#mainsrp-pager > div > div > div"
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        print("当前正在加载第%s页的数据------------------------------------"%page)
        try:
            page_box = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, pager + " > div.form > input")))
            page_box.clear()
            page_box.send_keys(str(page))
            driver.find_element(
                By.CSS_SELECTOR, pager + " > div.form > span.btn.J_Submit").click()
            # The jump has succeeded once the highlighted pager item shows
            # the requested page number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, pager + " > ul > li.item.active"), str(page)))
            break
        except WebDriverException as e:
            print(e)
            # Bounded retry: give up after the last attempt instead of
            # retrying without limit.
            if attempt == attempts:
                raise

    # Parse exactly once, after the page is confirmed loaded.
    get_product_into()

抓取淘宝美食数据

1. 得到所有页的数量

2. 解析页面得到产品信息

3. 实现翻页，得到所有关于美食的数据

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.by import By

from selenium.webdriver.chrome.options import Options

import re

from bs4 import BeautifulSoup

# Do not load images.
# NOTE(review): these two flags look like PhantomJS command-line options;
# confirm that ChromeDriver actually honours them when passed as service_args.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=false']

# Headless mode (currently disabled):
# chrome_options=Options()
# chrome_options.add_argument("--headless")

# Start Chrome with the service arguments above; this runs at import time.
driver=webdriver.Chrome(service_args=SERVICE_ARGS)
# driver=webdriver.Chrome(service_args=SERVICE_ARGS,chrome_options=chrome_options)

# Shared explicit wait: every wait.until(...) below uses this 10-second timeout.
wait=WebDriverWait(driver,10)

# Open the Taobao home page, then set the browser window size.
driver.get("https://www.taobao.com/")
driver.set_window_size(width=1500,height=800)

# Step 1: get the total number of result pages for the search keyword.

def get_page_num():
    """Search Taobao for "美食" and return the total number of result pages.

    Types the keyword into the search box (id ``q``), submits the search
    form, reads the text of the element with class ``total`` and extracts the
    first run of digits from it.  The first results page is on screen at that
    point, so its products are parsed here through ``get_product_into()``.

    Returns:
        str: The page count as a digit string (callers convert with ``int``).

    Raises:
        ValueError: If the ``total`` text contains no digits.
        selenium.common.exceptions.TimeoutException: If the search box or the
            ``total`` element does not appear within the shared wait timeout.
    """
    print("搜索>>>美食")
    search_box = wait.until(EC.presence_of_element_located((By.ID, "q")))
    search_box.send_keys("美食")
    driver.find_element(By.CSS_SELECTOR, "#J_TSearchForm > div > button").click()

    # The results page is loaded after the click; wait for the pager summary
    # instead of looking it up immediately, which races with the page load.
    total = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "total")))
    text = total.text
    print(text)

    match = re.search(r"(\d+)", text)
    if match is None:
        # Fail with a message that names the problem rather than an
        # AttributeError on None.
        raise ValueError("no page count found in pager text: %r" % text)
    page_num1 = match.group(0)

    get_product_into()
    return page_num1

def get_product_into():
    """Parse and print the products on the currently loaded results page.

    Waits until at least one product card is present, parses
    ``driver.page_source`` with BeautifulSoup and, for every card, builds and
    prints a dict with the keys ``image``, ``price``, ``location``,
    ``title``, ``product_link`` and ``shopname``.

    Returns:
        list[dict]: One dict per product card, in page order.  Fields that a
        card does not provide are returned as empty strings.

    Raises:
        selenium.common.exceptions.TimeoutException: If no product card
            appears within the shared wait timeout.
    """
    item_selector = "#mainsrp-itemlist .items .item"
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    soup = BeautifulSoup(driver.page_source, "lxml")

    def first_text(node, selector):
        # A card may lack a field; report "" instead of aborting the page.
        found = node.select_one(selector)
        return found.text.strip() if found is not None else ""

    def with_scheme(url):
        # Only protocol-relative URLs ("//host/path") need a scheme added;
        # prefixing an already absolute URL would corrupt it.
        return "https:" + url if url.startswith("//") else url

    products = []
    for item in soup.select(item_selector):
        print("*" * 100)

        link = item.select_one(".J_ClickStat")
        href = link.get("href", "") if link is not None else ""

        # Fall back to data-ks-lazyload when data-src is absent or empty.
        picture = item.select_one(".J_ItemPic.img")
        image = ""
        if picture is not None:
            image = picture.get("data-src") or picture.get("data-ks-lazyload") or ""

        # Key order matches the order in which the dict is printed.
        item_dict = {
            "image": with_scheme(image),
            "price": first_text(item, ".price"),
            "location": first_text(item, ".location"),
            "title": first_text(item, 'a[class="J_ClickStat"]'),
            "product_link": with_scheme(href),
            "shopname": first_text(item, ".shopname"),
        }
        print(item_dict)
        products.append(item_dict)
    return products

def next_page(page, max_retries=3):
    """Jump to results page ``page`` and parse its products.

    Types the page number into the pager's jump box, clicks the submit
    button and waits until the pager's active item shows that number, then
    calls ``get_product_into()`` once for the loaded page.

    Args:
        page: Page number to jump to (int or digit string).
        max_retries: Maximum number of attempts before giving up; values
            below 1 are treated as 1.

    Raises:
        selenium.common.exceptions.WebDriverException: Re-raised from the
            last attempt when every attempt failed.
    """
    from selenium.common.exceptions import WebDriverException

    pager = "#mainsrp-pager > div > div > div"
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        print("当前正在加载第%s页的数据------------------------------------"%page)
        try:
            page_box = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, pager + " > div.form > input")))
            page_box.clear()
            page_box.send_keys(str(page))
            driver.find_element(
                By.CSS_SELECTOR, pager + " > div.form > span.btn.J_Submit").click()
            # The jump has succeeded once the highlighted pager item shows
            # the requested page number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, pager + " > ul > li.item.active"), str(page)))
            break
        except WebDriverException as e:
            print(e)
            # Bounded retry: the recursive retry never terminated on a
            # permanent failure and parsed the page once per retry level.
            if attempt == attempts:
                raise

    # Parse exactly once, after the page is confirmed loaded.
    get_product_into()

def main():
    """Run the crawl: search, then walk every remaining results page.

    ``get_page_num()`` performs the search and handles page 1, so paging
    starts at page 2.  Any exception is printed rather than propagated, and
    the browser is always closed on the way out.
    """
    try:
        total = get_page_num()
        print("总页数是：", total)
        last_page = int(total)
        current = 2
        while current <= last_page:
            next_page(current)
            current += 1
    except Exception as err:
        print(err)
    finally:
        driver.quit()


if __name__ == '__main__':
    main()

相关推荐

print("*"*100)

if __name__ == '__main__':