Python Dingdian Novel Scraper (Scraping 《三寸人间》)
Fetching the entire page
import requests
from bs4 import BeautifulSoup

url = "https://www.23us.so/files/article/html/0/43/3615671.html"
r = requests.get(url, timeout=30)
r.raise_for_status()  # raise an exception if the request failed
r.encoding = 'utf-8'  # the page is Chinese, so decode it as utf-8
soup = BeautifulSoup(r.text, "html.parser")  # parse the HTML text, not the Response object
a = str(soup.find_all('h1'))  # chapter title, converted to str
b = str(soup.find_all('dd', {'id': 'contents'}))  # chapter body, converted to str
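A side note: hard-coding utf-8 is fine for this site, but if you are unsure of a page's charset, requests can guess it from the response body. A minimal sketch of that variant, using the same url as above:

import requests

r = requests.get("https://www.23us.so/files/article/html/0/43/3615671.html", timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding  # let requests detect the charset instead of hard-coding it
print(r.encoding)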
Writing to a .txt file
dingdian = open("顶点.txt", "a", encoding="utf-8")
dingdian.write(a + '\n')
dingdian.write(b + '\n\n\n')
dingdian.close()  # note the parentheses: without them the file is never actually closed
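An alternative sketch using a with block, assuming the same a and b as above: the file is closed automatically when the block exits, even if a write raises an exception, so no explicit close() call is needed.

with open("顶点.txt", "a", encoding="utf-8") as dingdian:
    dingdian.write(a + '\n')       # chapter title
    dingdian.write(b + '\n\n\n')   # chapter body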
The output is roughly right, but the text is still littered with raw HTML tags such as <h1> and <br/>. We can clean these up with the replace() function:
a = a.replace('<h1>', '')   # remove the opening <h1> tag
a = a.replace('</h1>', '')  # remove the closing </h1> tag
b = b.replace('<br/>', '')  # remove the <br/> line breaks
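A sketch of an alternative that skips the string surgery entirely, assuming the same soup as above: BeautifulSoup's get_text() returns just the text inside a tag, with no HTML markup.

h1 = soup.find('h1')
title = h1.get_text(strip=True) if h1 else ''  # chapter title, tags already stripped
body_tag = soup.find('dd', {'id': 'contents'})
body = body_tag.get_text('\n', strip=True) if body_tag else ''  # text nodes joined with newlines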
Crawling the next chapter
Next = soup.find_all('a')
for i in Next:  # the page has many <a> tags, so loop until we find the one we need
    if i.string == '下一页':  # '下一页' is the "next page" link text
        break
url = 'https://www.23us.so' + i.get('href')  # href is a relative path, so prepend the site's base URL
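String concatenation works here because this site's hrefs are all relative to the root. A more robust sketch uses urljoin from the standard library, which resolves any relative href against the current page URL:

from urllib.parse import urljoin

next_url = urljoin(url, i.get('href'))  # i is the '下一页' <a> tag found in the loop above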
Once we have the new url, we simply repeat the steps above.
Organizing all the code into functions
import requests
from bs4 import BeautifulSoup

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except:
        return ""

def getNextHTML(soup):
    Next = soup.find_all('a')
    for i in Next:
        if i.string == '下一页':  # stop at the "next page" link
            break
    return 'https://www.23us.so' + i.get('href')

def getText(soup):
    a = str(soup.find_all('dd', {'id': 'contents'})).replace('[<dd id="contents">', '')
    a = a.replace('</dd>]', '')  # also strip the closing tag and list bracket
    return a.replace('<br/>', '')

def getHead(soup):
    a = str(soup.find_all('h1')).replace('[<h1>', '')
    return a.replace('</h1>]', '')

def putText(text, h1):
    dingdian = open("sancunrenjian.txt", "a", encoding="utf-8")
    dingdian.write(h1 + '\n')
    dingdian.write(text + '\n\n\n\n')
    dingdian.close()  # with the parentheses, so the file is actually closed

def main():
    url = "https://www.23us.so/files/article/html/0/43/3615671.html"
    r = getHTMLText(url)
    soup = BeautifulSoup(r, "html.parser")
    while True:
        putText(getText(soup), getHead(soup))
        url = getNextHTML(soup)
        # the last chapter's "next page" link points back to the index page
        if url == 'https://www.23us.so/files/article/html/0/43/index.html':
            break
        r = getHTMLText(url)
        soup = BeautifulSoup(r, "html.parser")

main()
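One last refinement worth considering: the loop above fires requests back-to-back for hundreds of chapters. A hedged sketch of a politer variant of getHTMLText (the delay parameter is my addition, not part of the original code):

import time
import requests

def getHTMLTextPolitely(url, delay=1.0):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        time.sleep(delay)  # pause between chapter fetches so we don't hammer the server
        return r.text
    except:
        return ""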