python[爬虫]运用selenium库抓取京东手机信息
1. 准备工作
IDE: pycharm
PY版本:python3.6
库:selenium pyquery等
浏览器:Chrome 及对应的 ChromeDriver(两者版本必须互相匹配才能使用)
数据库:MongoDB
2. 具体操作,上code
#-*-coding:utf-8-*-
import re
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from config import *
import pymongo
# Adding a request header would probably make this more robust
client = pymongo.MongoClient(MONGO_URL) # MongoDB client; MONGO_URL is expected to come from `from config import *`
db = client[MONGO_DB] # handle to the database named by MONGO_DB
broswer = webdriver.Chrome() # single Chrome driver instance used by the functions below
wait = WebDriverWait(broswer,50) # explicit wait bound to the driver, maximum wait time 50
def search(max_retries=3):
    """Open jd.com, search for '手机' and scrape the first result page.

    The whole sequence (load home page, type the keyword, submit, wait for
    the pager, scrape the products) is retried when any explicit wait times
    out. The retry is a bounded loop rather than unbounded recursion, so a
    page that never loads ends with the original TimeoutException instead
    of a RecursionError.

    Args:
        max_retries: Maximum number of attempts (values below 1 are treated
            as 1).

    Returns:
        The text of the pager element
        '#J_bottomPage > span.p-skip > em:nth-child(1)'; main() extracts the
        total page count from it.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            # Taobao needs cookies, so it is left for another time.
            broswer.get('https://www.jd.com/')
            # Selector obtained in DevTools (F12) via "Copy selector".
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#key'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#search > div > div.form > button')
                )
            )
            search_box.send_keys('手机')
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#J_bottomPage > span.p-skip > em:nth-child(1)')
                )
            )
            # Read the text right away, before more work is done on the page.
            total_text = total.text
            get_products()
            return total_text
        except TimeoutException:
            if attempt == attempts:
                raise
def next_page(page_number, max_retries=3, scroll_pause=10):
    """Jump to result page ``page_number`` and scrape its products.

    Scrolls to the bottom of the page first (the pager controls are only
    reliably reachable after that -- this was the main pitfall), types the
    page number into the pager's input box, clicks the jump button, waits
    until the highlighted "current page" link shows the requested number,
    then calls get_products().

    A timeout restarts the whole sequence, but in a bounded loop instead of
    unbounded recursion.

    Args:
        page_number: Page to jump to.
        max_retries: Maximum number of attempts (values below 1 are treated
            as 1).
        scroll_pause: Seconds to sleep after scrolling to the bottom; tune
            it to the actual page load speed.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(1, max_retries)
    page_text = str(page_number)
    for attempt in range(1, attempts + 1):
        try:
            # Scroll to the very bottom so the rest of the page gets loaded.
            broswer.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(scroll_pause)
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input')
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')
                )
            )
            page_box.clear()
            page_box.send_keys(page_text)
            submit.click()
            # The highlighted pager link tells us the jump has completed.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'),
                    page_text,
                )
            )
            get_products()
            return
        except TimeoutException:
            # Retry on timeout; give up after the last attempt.
            if attempt == attempts:
                raise
def get_products():
    """Parse the product cards on the current page and store each one.

    Waits until at least one product card is present, parses the driver's
    page source with PyQuery, and for every card builds a dict with the keys
    'price', 'commit', 'name' and 'shop', prints it and hands it to
    save_to_mongo().
    """
    card_selector = '#J_searchWrap #J_container .m-list .ml-wrap #J_goodsList .gl-i-wrap'
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, card_selector))
    )
    document = pq(broswer.page_source)
    for card in document(card_selector).items():
        price_text = card.find('.p-price').text()
        # Drop the last three characters (presumably a fixed text suffix
        # after the review count -- confirm against the live page).
        commit_text = card.find('.p-commit').text()[:-3]
        name_text = card.find('.p-name').text()
        shop_text = card.find('.p-shop').text()
        product = {
            'price': price_text,
            'commit': commit_text,
            'name': name_text,
            'shop': shop_text,
        }
        print(product)
        save_to_mongo(product)
    # NOTE: locating the product image with selenium failed, so images are
    # not scraped; XPath is worth trying next time.
def save_to_mongo(result):
    """Insert one scraped product document into MongoDB (best effort).

    Uses ``Collection.insert_one``; the legacy ``Collection.insert`` was
    deprecated in PyMongo 3 and removed in PyMongo 4. A failure is reported
    together with the exception instead of being silently discarded, and
    the crawl goes on.

    Args:
        result: Dict describing one product, as built by get_products().
    """
    try:
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Deliberately broad: a single failed insert must not stop the crawl.
        print('存储道MONGODB失败', result, exc)
    else:
        print('存储道MONGODB成功', result)
def main():
    """Crawl every result page and always shut the browser down.

    search() scrapes page 1 and returns the pager text; the first run of
    digits in that text is taken as the total page count, and pages
    2..total are then visited with next_page().

    Raises:
        ValueError: If the pager text contains no digits.
    """
    try:
        total_text = search()
        # Raw string: '\d' in a normal string literal is an invalid escape.
        match = re.search(r'(\d+)', total_text or '')
        if match is None:
            raise ValueError(
                'no page count found in pager text: %r' % (total_text,)
            )
        total = int(match.group(1))
        for i in range(2, total + 1):
            next_page(i)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window); doing it in finally avoids leaking the browser
        # when the crawl fails midway.
        broswer.quit()


if __name__ == '__main__':
    main()
3. MongoDB 配置(config.py,供上面代码中的 from config import * 读取)
MONGO_URL = 'localhost' # host passed to pymongo.MongoClient by the crawler
MONGO_DB = 'jd' # name of the database the crawler opens
MONGO_TABLE = 'product' # collection the scraped products are inserted into
-爬下的数据如图:
4. 遇到的坑
1. 定位失败:
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#J_searchWrap #J_container .m-list .ml-wrap #J_goodsList .gl-i-wrap'))) # 各级选择器之间用空格分隔,不能漏掉
(特别注意:#匹配ID .匹配class)
其实只要看仔细了应该没问题,不要把ID NAME搞错。
2. 爬到第三页就报错停止:
原因:网页没有滚动到底部,页面内容没有加载完整
解决:翻页前加上 broswer.execute_script("window.scrollTo(0, document.body.scrollHeight);"),先把页面滚动到底部
3. 自身问题:
语法问题以及细心程度。
2019.3.26 记录一个小白。
准备把下载下的数据拿spss powerBI玩一玩。