Downloading e-books from the 51testing Software Testing Network
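The script below crawls the e-book download section of 51testing.com: it reads the category links from the download index, walks every list page in each category, opens each book's detail page to extract the real file link, and saves the files under D:\51软件测试, one folder per category, with one download thread per book.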

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
    time: 2018-12-21
    title:51testing软件测试网电子书下载
    annotation:
    author: pqx
    email:[email protected]
"""
import threading
import time
import requests
import re
import os
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter

headers = {'User-Agent': 'Mozilla/5.0'}

def get_one_page(url, headers=headers, code='gbk'):  # fetch one page and return its HTML text
    try:
        s = requests.Session()  # reuse one session
        s.mount('http://', HTTPAdapter(max_retries=3))  # retry failed requests up to 3 times
        s.mount('https://', HTTPAdapter(max_retries=3))
        r = s.get(url, headers=headers, timeout=15)  # request timeout in seconds
        r.raise_for_status()  # raise if the status code is not 200
        r.encoding = code  # the site is GBK-encoded; r.apparent_encoding would also work
        return r.text  # return the page source
    except Exception as e:
        t = time.strftime('%Y/%m/%d %H:%M:%S %a')  # formatted timestamp
        with open(r'D:\51软件测试\Exception.txt', 'a+', encoding='utf-8') as f:
            f.write('time:{}\n\nurl:{}\n\n{}\n\n'.format(t, url, e))
        return ''  # return an empty string so callers do not choke on None

def create_folder(name):  # create the download folder for a category
    try:
        if not os.path.exists(name):  # only create it if it does not exist yet
            os.makedirs(name)
    except OSError:
        return ''

def clean_txt(title):  # strip characters that are illegal in Windows file names
    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # / \ : * ? " < > |
    title = re.sub(rstr, "_", title)  # replace them with underscores
    return title

def get_classify_href(url):  # collect the category names and links from the download index
    html_text = get_one_page(url)
    soup = BeautifulSoup(html_text, 'lxml')
    classify_href_list = soup.find('div', {'class': 'down_nav'}).find_all('a')
    classify_list = []
    for i in classify_href_list:
        name = i.text
        href = i['href']
        classify_list.append([name, href])
    return classify_list

def get_classify_page(url):  # read a category's total page count from the "1/N" pager text
    html_text = get_one_page(url)
    soup = BeautifulSoup(html_text, 'lxml')
    try:
        classify_page = soup.find('span', {'class': 'xspace-totlepages'}).text.split('/')[1]
    except (AttributeError, IndexError):
        classify_page = '1'  # categories without a pager have a single page
    return classify_page

def get_books_href(url):  # collect the book titles and detail-page links from one list page
    html_text = get_one_page(url)
    soup = BeautifulSoup(html_text, 'lxml')
    book_href_list = soup.find_all('div', {'class': 'column_js_yw'})
    book_list = []
    for i in book_href_list:
        name = i.h3.a.text
        href = i.h3.a['href']
        book_list.append([name, href])
    return book_list

def get_book_href_and_download(path, book):  # open a book's detail page, find the real file link and download it
    try:
        name = clean_txt(book[0][0:25])  # keep at most 25 characters of the title for the file name
        book_html_text = get_one_page(book[1])
        id = book[1].split('-')[-1]  # article id taken from the end of the detail-page URL
        book_soup = BeautifulSoup(book_html_text, 'lxml')
        book_column = book_soup.find('div', {'id': 'articlebody'})
        try:
            # most books link straight to a file under ddimg/uploadsoft
            book_real_url = re.findall(r'href="(http://.*?.51testing.com/ddimg/uploadsoft/.*?)"', str(book_column), re.S)[0]
            file_mode = book_real_url.split('.')[-1]
        except IndexError:
            # otherwise fall back to the batch.download link, which serves a rar archive
            book_real_url = re.findall(r'href="(http://www.51testing.com/batch.download.*?)"', str(book_column), re.S)[0]
            file_mode = 'rar'
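        # Note: the batch.download link appears to be served only when the article id shows up
        # in the *refresh_items cookies, which is presumably why the id is formatted into the Cookie below.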
        headers.update({'Cookie':'UM_distinctid=167ce57f46030-083f3c246388ce-6b1b1279-13c680-167ce57f4615fd; looyu_id=1f38aaeccbc843153797fdcc939cd253_20001818%3A1; xscdb_cookietime=2592000; xscdb_auth=5e60nDqimLbCb87VIo4qDYgEFdJ3ERExrKT6D1FOoWL5V%2B7JZFpa063r4QPjPaAwKOyXpT4ws2fR9OMXj6YFYhwFFOmLcVc;  supe_batch_html_refresh_items=0_3723610_3711522_4456225_3722305_221799_221593_220429_{}; xscdb_supe_refresh_items=0_3723610_3711522_4456225_3722305_221799_221593_220429_{}; looyu_20001818=v%3A1f38aaeccbc843153797fdcc939cd253%2Cref%3A%2Cr%3A%2Cmon%3Ahttp%3A//m2423.looyu.com/monitor%2Cp0%3Ahttp%253A//www.51testing.com/article_windows.htm'.format(id,id)})

        content = requests.get(book_real_url, headers=headers, timeout=15).content
        with open(path + '\\' + name + '.' + file_mode, 'wb') as f:
            f.write(content)
        print(name, file_mode, book_real_url)
    except Exception as e:
        t = time.strftime('%Y/%m/%d %H:%M:%S %a')  # formatted timestamp
        with open(r'D:\51软件测试\Exception.txt', 'a+', encoding='utf-8') as f:
            f.write('time:{}\n\nurl:{}\n\n{}\n\n'.format(t, book, e))

def main():
    start_url = 'http://www.51testing.com/html/6/category-catid-6.html'
    classify_href_list = get_classify_href(start_url)
    for i in classify_href_list:
        page = get_classify_page(i[1])
        i.append(page)  # each entry becomes [name, href, page_count]
    threads = []
    for i in classify_href_list:
        path = 'D:\\51软件测试\\%s' % i[0]  # one folder per category
        create_folder(path)
        for j in range(1, int(i[2]) + 1):
            print('page %s' % j)
            if j == 1:
                url = i[1]  # the first list page is the category URL itself
            else:
                url = i[1] + '-page-%s.html' % j  # later pages append -page-N.html
            book_list = get_books_href(url)
            for book in book_list:
                t = threading.Thread(target=get_book_href_and_download, args=(path, book))
                time.sleep(0.5)  # stagger the requests a little
                t.start()
                threads.append(t)
    for t in threads:  # wait for every download thread before reporting completion
        t.join()

if __name__ == '__main__':
    main()
    print('All downloads finished')
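Two things to check before running it: the BeautifulSoup calls use the lxml parser, so the lxml package must be installed alongside requests and bs4, and the exception log is written to D:\51软件测试\Exception.txt, so that directory should exist (or the path be adjusted) before the first run.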
