python [crawler]: scraping JD phone listings with the selenium library

1. Preparation

IDE: PyCharm
Python version: Python 3.6
Libraries: selenium, pyquery, etc.
Browser: Chrome + chromedriver (the two versions must match, or it won't work)
Database: MongoDB
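
Since Chrome and chromedriver have to match, a quick sanity check is to start a driver and print the versions both sides report (a minimal sketch; the capability keys differ slightly between Selenium/Chrome versions, hence the fallbacks):

from selenium import webdriver

driver = webdriver.Chrome()  # assumes chromedriver is on the PATH
caps = driver.capabilities
print('Chrome version:      ', caps.get('browserVersion') or caps.get('version'))
print('ChromeDriver version:', caps.get('chrome', {}).get('chromedriverVersion'))
driver.quit()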

2. Implementation: the code

# -*- coding: utf-8 -*-
import re
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from config import *
import pymongo

# Adding a request header (User-Agent) would probably make this more robust
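# A minimal sketch of what that could look like (assumption: set a custom
# User-Agent through ChromeOptions and pass it when creating the driver;
# on older selenium versions the keyword is chrome_options instead of options):
# opts = webdriver.ChromeOptions()
# opts.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
# browser = webdriver.Chrome(options=opts)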

client = pymongo.MongoClient(MONGO_URL)  # create the MongoDB client
db = client[MONGO_DB]

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 50)  # maximum wait time: 50 seconds
def search():
    try:
        browser.get('https://www.jd.com/')  # Taobao needs cookies; will try it next time
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR,'#key'))
        )  # F12, locate the element, then Copy -> Copy selector
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#search > div > div.form > button')))
        input.send_keys('手机')
        submit.click()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR,'#J_bottomPage > span.p-skip > em:nth-child(1)')))
        get_products()
        return total.text
    except TimeoutException:
        return search()

def next_page(page_number):  # jump to a given page
    try:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # scroll to the bottom so the whole page loads (key pitfall)
        time.sleep(10)  # adjust the wait time to your situation
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input'))
        )
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        wait.until(
            EC.text_to_be_present_in_element ((By.CSS_SELECTOR,'#J_bottomPage > span.p-num > a.curr'),str(page_number)))
        # check that the current page number is highlighted, i.e. the jump to that page succeeded
        get_products()
    except TimeoutException:
        next_page(page_number)  # retry on timeout

def get_products():
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#J_searchWrap #J_container .m-list .ml-wrap #J_goodsList .gl-i-wrap')))
    html = browser.page_source
    doc = pq(html)
    items = doc('#J_searchWrap #J_container .m-list .ml-wrap #J_goodsList .gl-i-wrap').items()
    for item in items:
        product = {
            'price': item.find('.p-price').text(),
            'commit': item.find('.p-commit').text()[:-3],  # drop the trailing three characters
            'name': item.find('.p-name').text(),
            'shop': item.find('.p-shop').text()
        }
        print(product)
        save_to_mongo(product)
# locating the img element with selenium failed, so the images were not scraped; will try XPath next time

def save_to_mongo(result):  # save the result to MongoDB
    try:
        if db[MONGO_TABLE].insert_one(result):
            print('saved to MongoDB', result)
    except Exception:
        print('failed to save to MongoDB', result)

def main():
    total = search()
    total = int(re.compile(r'(\d+)').search(total).group(1))
    for i in range(2, total + 1):
        next_page(i)
    browser.quit()  # close the browser and end the chromedriver session

if __name__ == '__main__':
    main()

3. MongoDB

- Create config.py:

MONGO_URL = 'localhost'
MONGO_DB = 'jd'
MONGO_TABLE = 'product'

- The scraped data, as shown below:
[screenshot: the scraped product records in MongoDB]
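
To double-check what actually landed in MongoDB, the collection can be queried directly (a minimal sketch reusing the same config values; count_documents needs pymongo 3.7+):

import pymongo
from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
collection = client[MONGO_DB][MONGO_TABLE]
print('documents stored:', collection.count_documents({}))
for doc in collection.find().limit(3):  # print a few sample records
    print(doc)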

4. Pitfalls encountered

1. Element location failing:

wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_searchWrap #J_container .m-list .ml-wrap #J_goodsList .gl-i-wrap')))  # note the spaces between the selectors

(Note in particular: # matches an ID, . matches a class.)

As long as you read the page markup carefully this should be fine; just don't mix up IDs and class names. A quick reference is sketched below.
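
A short illustrative sketch of the selector syntax (assumes the same browser object and By import as in the script above):

# '#J_goodsList'             -> the element with id="J_goodsList"
# '.gl-i-wrap'               -> elements with class="gl-i-wrap"
# '#J_goodsList .gl-i-wrap'  -> .gl-i-wrap elements anywhere inside #J_goodsList (note the space)
item = browser.find_element(By.CSS_SELECTOR, '#J_goodsList .gl-i-wrap')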

2. Crawling errored out and stopped at page three:

Cause: the page had not been scrolled to the bottom, so not everything had loaded.

Fix: add browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
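
If a single jump to the bottom still misses some lazily loaded items, one option (an assumption, not what the script above does) is to scroll down in several smaller steps so the lazy-loading JavaScript has time to fire, which can also let you shorten the fixed time.sleep(10):

import time

def scroll_in_steps(browser, steps=10, pause=0.5):
    # scroll down in increments so lazily loaded products get a chance to render
    for i in range(1, steps + 1):
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight * arguments[0] / arguments[1]);",
            i, steps)
        time.sleep(pause)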

3. My own issues:

Syntax slips and not being careful enough.

2019-03-26: notes from a beginner.
Next I plan to play with the scraped data in SPSS and Power BI.