Crawling Hujiang's kaoyan (postgraduate entrance exam) vocabulary and saving it to txt files as specified
Storage requirements: for every word visited, save its Chinese translation to a txt file named after the word, and keep all the files in one folder so they are easy to use later in the GUI.
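For reference, a minimal sketch of how the GUI side could read one of these files back. It assumes the output folder D:/寒假项目 used by the script below; look_up is just an illustrative name, not part of the crawler.

import os

def look_up(word, folder='D:/寒假项目'):
    # Return the saved Chinese translation for a word, or None if it was never crawled.
    path = os.path.join(folder, word + '.txt')
    if not os.path.exists(path):
        return None
    with open(path, encoding='UTF-8') as fp:
        return fp.read()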
The pages to crawl are the Hujiang word-list pages at https://www.hujiang.com/ciku/zuixinkaoyanyingyucihui_1/ through .../zuixinkaoyanyingyucihui_275/.
Approach: the outer loop finds each entry's href on the list pages and joins it to the site root to build the full URL; the inner loop opens each word's page and extracts the information.
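As a concrete example of the URL joining (the word below is made up, but the href shape /ciku/<word>/ is what the split('/')[2] in the script relies on), one list-page entry turns into a detail-page URL and a file name like this:

part_name = '/ciku/abandon/'                  # href scraped from a list page (hypothetical word)
href = 'https://www.hujiang.com' + part_name  # full URL of the word's detail page
word = part_name.split('/')[2]                # 'abandon', later used as the txt file name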
import re
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}
def get_html(url):
    # Download a page and return its HTML text, or None on any failure.
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException:
        return None
def get_entranceUrls(html):
    # Pull every word-entry href out of a list page.
    part_urls = re.findall('<li class="clearfix">.*?<a href="(.*?)" target="_blank">', html, re.S)
    return part_urls
    # for part_url in part_urls:
    #     print(part_url)  # tested successfully
def get_info(html, file_name):
    # selector = etree.HTML(html)  # XPath turned out to be awkward here!
    # content = selector.xpath(
    #     '//div[@class="sp-lexicon-word-comment clearfix"]/span/text()')
    # for i in content:
    #     print(i.strip())
    try:
        soup = BeautifulSoup(html, 'lxml')
        # When a class attribute contains spaces, chain the classes with dots in select()
        content = soup.select('div.sp-lexicon-word-comment.clearfix')
        for real in content:
            # print(real.get_text().strip())  # tested successfully; without strip() the output looks a bit nicer
            try:
                with open('D:/寒假项目/{0}.txt'.format(file_name), 'a+', encoding='UTF-8') as fp:
                    fp.write(real.get_text().strip())
            except OSError:
                pass
    except Exception:
        pass
if __name__ == '__main__':
    urls = ['https://www.hujiang.com/ciku/zuixinkaoyanyingyucihui_{0}/'.format(i) for i in range(1, 276)]  # the last page is 275
    for url in urls:
        html = get_html(url)
        if html is None:  # skip list pages that failed to download
            continue
        part_list = get_entranceUrls(html)
        for part_name in part_list:
            href = 'https://www.hujiang.com' + part_name
            Html = get_html(href)
            word = part_name.split('/')[2]
            # print(word)  # tested successfully
            get_info(Html, word)
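A couple of small additions make the script sturdier (a sketch only, not part of the original): make sure the output folder exists before the first write, and pause briefly after each request. The two lines below are meant to be dropped into the script above, the first before the loops and the second at the end of the inner loop body.

import os
import time

os.makedirs('D:/寒假项目', exist_ok=True)  # before the loops: create the folder get_info() writes to if it is missing
time.sleep(0.5)  # at the end of the inner loop body: short pause so the site is not hit too fast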