Downloading e-books from the 51testing Software Testing Network
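The script below crawls the e-book download section of www.51testing.com: it walks every category in the navigation bar, pages through each category's listing, resolves the real file URL on every book's detail page, and saves the files under D:\51软件测试, one subfolder per category. Any failure is appended to an Exception.txt log instead of aborting the run.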
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
time: 2018-12-21
title: 51testing Software Testing Network e-book downloader
annotation:
author: pqx
email: [email protected]
"""
import threading
import time
import requests
import re
import os
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter

headers = {'User-Agent': 'Mozilla/5.0'}
def get_one_page(url, headers=headers, code='gbk'):  # fetch one page and return its HTML
    try:
        s = requests.Session()  # reuse one session for the request
        s.mount('http://', HTTPAdapter(max_retries=3))  # retry up to 3 times
        s.mount('https://', HTTPAdapter(max_retries=3))
        r = s.get(url, headers=headers, timeout=15)  # timeout in seconds
        r.raise_for_status()  # raise if the status code is not 200
        r.encoding = code  # the site serves GBK; r.apparent_encoding would also work
        return r.text  # page source
    except Exception as e:
        t = time.strftime('%Y/%m/%d %H:%M:%S %a')  # formatted timestamp
        with open(r'D:\51软件测试\Exception.txt', 'a+', encoding='utf-8') as f:
            f.write('time:{}\n\nurl:{}\n\n{}\n\n'.format(t, url, e))
        return ''  # give callers an empty page on failure instead of None
def create_folder(name):  # create the target folder if it does not exist
    try:
        if not os.path.exists(name):  # os.listdir() only sees the cwd, so test the full path
            os.makedirs(name)
    except OSError:
        pass
def clean_txt(title):  # strip characters that are illegal in Windows file names
    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # / \ : * ? " < > |
    title = re.sub(rstr, "_", title)  # replace each with an underscore
    return title
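# A quick sanity check (hypothetical title, not taken from the site):
# clean_txt('Selenium: Q&A / Tips?') -> 'Selenium_ Q&A _ Tips_'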
def get_classify_href(url):  # collect category names and links from the nav bar
    html_text = get_one_page(url)
    soup = BeautifulSoup(html_text, 'lxml')
    classify_href_list = soup.find('div', {'class': 'down_nav'}).find_all('a')
    classify_list = []  # renamed to avoid shadowing the built-in `list`
    for i in classify_href_list:
        name = i.text
        href = i['href']
        classify_list.append([name, href])
    return classify_list
def get_classify_page(url):  # read the total page count of a category
    html_text = get_one_page(url)
    soup = BeautifulSoup(html_text, 'lxml')
    try:
        classify_page = soup.find('span', {'class': 'xspace-totlepages'}).text.split('/')[1]
    except (AttributeError, IndexError):  # no pager element means a single page
        classify_page = '1'
    return classify_page
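# The pager text appears to have the form 'current/total' (e.g. '1/8'),
# so split('/')[1] yields the total page count as a string.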
def get_books_href(url):
    html_text = get_one_page(url)
    soup = BeautifulSoup(html_text, 'lxml')
    book_href_list = soup.find_all('div', {'class': 'column_js_yw'})
    book_list = []
    for i in book_href_list:
        name = i.h3.a.text
        href = i.h3.a['href']
        book_list.append([name, href])
    return book_list
def get_book_href_and_download(path, book):  # resolve the real file URL on a book page and download it
    try:
        name = clean_txt(book[0][0:25])  # cap the title at 25 characters for the file name
        book_html_text = get_one_page(book[1])
        book_id = book[1].split('-')[-1]  # article id, needed in the download cookie
        book_soup = BeautifulSoup(book_html_text, 'lxml')
        book_column = book_soup.find('div', {'id': 'articlebody'})
        try:
            # direct upload link under ddimg/uploadsoft; extension taken from the URL
            book_real_url = re.findall(r'href="(http://.*?\.51testing\.com/ddimg/uploadsoft/.*?)"', str(book_column), re.S)[0]
            file_mode = book_real_url.split('.')[-1]
        except IndexError:
            # fall back to the batch.download link, which serves rar archives
            book_real_url = re.findall(r'href="(http://www\.51testing\.com/batch\.download.*?)"', str(book_column), re.S)[0]
            file_mode = 'rar'
        # copy the shared headers so concurrent threads don't race on the global dict
        req_headers = dict(headers)
        req_headers['Cookie'] = 'UM_distinctid=167ce57f46030-083f3c246388ce-6b1b1279-13c680-167ce57f4615fd; looyu_id=1f38aaeccbc843153797fdcc939cd253_20001818%3A1; xscdb_cookietime=2592000; xscdb_auth=5e60nDqimLbCb87VIo4qDYgEFdJ3ERExrKT6D1FOoWL5V%2B7JZFpa063r4QPjPaAwKOyXpT4ws2fR9OMXj6YFYhwFFOmLcVc; supe_batch_html_refresh_items=0_3723610_3711522_4456225_3722305_221799_221593_220429_{}; xscdb_supe_refresh_items=0_3723610_3711522_4456225_3722305_221799_221593_220429_{}; looyu_20001818=v%3A1f38aaeccbc843153797fdcc939cd253%2Cref%3A%2Cr%3A%2Cmon%3Ahttp%3A//m2423.looyu.com/monitor%2Cp0%3Ahttp%253A//www.51testing.com/article_windows.htm'.format(book_id, book_id)
        content = requests.get(book_real_url, headers=req_headers, timeout=15).content
        with open(path + '\\' + name + '.' + file_mode, 'wb') as f:
            f.write(content)
        print(name, file_mode, book_real_url)
    except Exception as e:
        t = time.strftime('%Y/%m/%d %H:%M:%S %a')  # formatted timestamp
        with open(r'D:\51软件测试\Exception.txt', 'a+', encoding='utf-8') as f:
            f.write('time:{}\n\nurl:{}\n\n{}\n\n'.format(t, book, e))
def main():
    start_url = 'http://www.51testing.com/html/6/category-catid-6.html'
    classify_href_list = get_classify_href(start_url)
    for i in classify_href_list:
        page = get_classify_page(i[1])
        i.append(page)  # each entry becomes [name, href, total_pages]
    threads = []
    for i in classify_href_list:
        path = 'D:\\51软件测试\\%s' % i[0]
        create_folder(path)
        for j in range(1, int(i[2]) + 1):
            print('Page %s' % j)
            if j == 1:
                url = i[1]  # the first page has no -page- suffix
            else:
                url = i[1] + '-page-%s.html' % j
            book_list = get_books_href(url)
            for book in book_list:
                t = threading.Thread(target=get_book_href_and_download, args=(path, book))
                time.sleep(0.5)  # stagger requests to go easy on the server
                t.start()
                threads.append(t)
    for t in threads:  # wait for every download before reporting completion
        t.join()

if __name__ == '__main__':
    main()
    print('Download complete')
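To try the crawler on a smaller scale before a full run, the individual helpers can be exercised on their own. A minimal sketch (the start URL comes from main() above; the printed names and counts depend on the live site):

# List the categories and their page counts without downloading anything
start_url = 'http://www.51testing.com/html/6/category-catid-6.html'
for name, href in get_classify_href(start_url):
    print(name, href, get_classify_page(href))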