爬取沪江网考研词汇并按要求存为 txt

存储数据的要求:每遍历一个单词,将该单词的汉语翻译保存到以该单词命名的 txt 文档中,并统一存放到一个文件夹里,便于 GUI 设计使用。
要爬取的网页:https://www.hujiang.com/ciku/zuixinkaoyanyingyucihui_1/(共 275 页)

思路:外层循环在列表页提取各单词详情页的 href 并拼接成完整 url,内层循环进入详情页提取释义信息

import re

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Browser-like User-Agent so the site serves normal pages instead of
# rejecting the default `requests` client signature.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.110 Safari/537.36'
    ),
}

def get_html(url):
    """Fetch *url* and return the response body as text.

    Returns None on any network error or non-200 status so callers can
    simply skip the page instead of handling exceptions.
    """
    try:
        # timeout so one stalled connection cannot hang the whole crawl
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def get_entranceUrls(html):
    """Extract per-word detail-page paths from a word-list page.

    html: page source of a list page, or None when the fetch failed.
    Returns a list of href paths (e.g. '/ciku/w/xxx/'); empty when
    *html* is None/empty or nothing matches.
    """
    if not html:
        # get_html returns None on failure; avoid TypeError in re.findall
        return []
    return re.findall(
        '<li class="clearfix">.*?<a href="(.*?)" target="_blank">', html, re.S)

def get_info(html, file_name, save_dir='D:/寒假项目'):
    """Parse a word detail page and append its Chinese definitions to
    <save_dir>/<file_name>.txt.

    html: page source of a word detail page, or None when the fetch failed.
    file_name: the word itself; used as the txt file name.
    save_dir: output folder; default keeps the original hard-coded path
        for backward compatibility.
    """
    if not html:
        return
    import os
    soup = BeautifulSoup(html, 'lxml')
    # CSS class names containing spaces are selected by joining with '.'
    content = soup.select('div.sp-lexicon-word-comment.clearfix')
    if not content:
        return
    # make sure the output folder exists so the write cannot fail silently
    os.makedirs(save_dir, exist_ok=True)
    try:
        # open once for all matched elements instead of reopening per item
        with open('{0}/{1}.txt'.format(save_dir, file_name), 'a+', encoding='UTF-8') as fp:
            for real in content:
                fp.write(real.get_text().strip())
    except OSError:
        # best-effort: skip words whose file cannot be written
        pass

if __name__ == '__main__':
    # List pages 1..275 (page 275 is the last page of the vocabulary index).
    urls = ['https://www.hujiang.com/ciku/zuixinkaoyanyingyucihui_{0}/'.format(i)
            for i in range(1, 276)]
    for url in urls:
        html = get_html(url)
        if html is None:
            # fetch failed; skip this list page instead of crashing
            continue
        for part_name in get_entranceUrls(html):
            # part_name looks like '/ciku/w/<word>/' -> index 2 is the word
            detail_html = get_html('https://www.hujiang.com' + part_name)
            word = part_name.split('/')[2]
            get_info(detail_html, word)