从零开始学习--Python-爬虫 7月3日
Python
---小白121的记录笔记
用爬虫爬取煎蛋网妹纸图片
源码:
import os
import re

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

# Raw strings so the Windows backslashes are taken literally
# (the original relied on \P, \G, ... not being valid escapes).
CHROME_DRIVER = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
SAVE_ROOT = r"F:\python测试\data\煎蛋网妹纸爬取"
MAX_IMAGES = 60  # stop once this many images have been saved (original bound)

browser = webdriver.Chrome(CHROME_DRIVER)
wait = WebDriverWait(browser, 2)
num = 1  # running image counter, shared across pages


def get_source(url):
    """Load *url* in the Selenium-driven browser and return its page source.

    Returns None when the page fails to load or comes back empty.
    BUG FIX: the original caught EOFError, which ``browser.get`` never
    raises, so failures propagated anyway; catch broadly and degrade to
    None as the original clearly intended.
    """
    print('正在爬取 "%s"' % url)
    try:
        browser.get(url)
        html = browser.page_source
        return html or None
    except Exception:
        return None


def get_pares(html, num, url):
    """Download every image found in *html* into a per-page folder.

    ``url`` is the next-page URL; its page number (+1) names the folder,
    matching the original numbering scheme.  Returns the updated image
    counter (the original incremented a local and discarded the result).
    """
    url_num = analysis(url)
    # Fall back to the counter if the page number can't be parsed
    # (e.g. url is None on the last page) instead of crashing.
    folder = str(int(url_num[0]) + 1) if url_num else str(num)
    soup = BeautifulSoup(html, 'lxml')
    image_tags = soup.select('img')
    sources = re.findall(r'<img src="(.*?)" style="', '%s' % image_tags, re.S)
    target_dir = os.path.join(SAVE_ROOT, folder)
    # BUG FIX: os.mkdir raised FileExistsError on a re-run.
    os.makedirs(target_dir, exist_ok=True)
    for src in sources:
        response = requests.get(src)
        print('正在下载:%s' % src)
        # BUG FIX: use a context manager so the file handle is always closed.
        with open(os.path.join(target_dir, '%d.jpg' % num), 'wb') as fh:
            fh.write(response.content)
        num += 1
    return num


def pares_one(html):
    """Extract the "next page" link from a listing page.

    Returns the absolute URL, or None on the last page.
    BUG FIX: the original indexed ``find_url[0]`` unconditionally and
    crashed with IndexError when no further page exists.
    """
    soup = BeautifulSoup(html, 'lxml')
    nav_links = soup.select('#body #comments .comments .cp-pagenavi a')
    found = re.findall(
        r'<a class="previous-comment-page" href="(.*?)" title="Older Comments">下一页<',
        '%s' % nav_links, re.S)
    if not found:
        return None
    return 'http:' + '%s' % found[0]


def analysis(url):
    """Return the page-number strings embedded in *url* (may be empty)."""
    if not url:
        return []
    return re.findall(r'/page-(.*?)#comments', '%s' % url)


def next(url):  # name kept for interface compatibility (shadows builtins.next)
    """Walk the listing pages, downloading images until MAX_IMAGES is hit.

    BUG FIX: the original recursed inside a ``while`` loop whose condition
    read a global ``num`` that was never updated — unbounded recursion.
    Rewritten as a plain loop that advances the URL and the counter.
    """
    global num
    while url and 0 < num < MAX_IMAGES:
        html = get_source(url)
        if html is None:
            break
        next_url = pares_one(html)
        num = get_pares(html, num, next_url)
        url = next_url


def main():
    url = 'http://jandan.net/ooxx'
    next(url)


if __name__ == '__main__':
    main()
---源自公众号python那些事
主要运用了 webdriver 和 BeautifulSoup,可以先学会这2个第三方库再去运用
成品图
邪恶的马赛克~