Python爬虫爬取古诗词内容,并存入MySQL
Python爬虫爬取古诗词内容,并存入MySQL
爬取结果展示:
代码如下:
from urllib import request
import re,os
import pymysql
import time
# Site root that get_url_list() prepends to the relative poem links it extracts.
base_url = "https://so.gushiwen.org"
# Index page passed to get_model_url() to discover the category pages.
shiwen_url='https://www.gushiwen.org/shiwen/'
def get_model_url(shiwen_url):
    """Collect the category links found on the index page.

    Args:
        shiwen_url: URL of the index page to download.

    Returns:
        A list of ``(path, title)`` tuples, one per anchor whose target is
        ``https://so.gushiwen.org/<path>.aspx``. ``path`` carries neither
        the host prefix nor the ``.aspx`` suffix; ``title`` is the anchor
        text.
    """
    html = get_html(shiwen_url)
    # Raw string so that "\." reaches the regex engine as an escaped dot
    # instead of being an invalid (and warned-about) string escape.
    pattern = r'<a href="https://so\.gushiwen\.org/(.*?)\.aspx">(.*?)</a>'
    return re.findall(pattern, html)
def conn_mysql():
    """Open a new connection to the MySQL database that stores the poems.

    Returns:
        An open ``pymysql`` connection. The caller owns it and must close it.
    """
    # NOTE(review): host and credentials are hard-coded in source; consider
    # loading them from configuration or the environment instead.
    return pymysql.connect(
        # Keyword arguments: PyMySQL 1.0+ no longer accepts these
        # positionally, so the positional form raises TypeError there.
        host='49.4.71.22',
        user='admin',
        password='admin963',
        database='test',
        # The scraped text is Chinese; request a Unicode-capable connection
        # charset explicitly rather than relying on the driver default.
        charset='utf8mb4',
    )
def createtable_poem():
    """Create the ``poem`` table if it does not exist yet.

    Opens its own connection through ``conn_mysql()`` and always closes it,
    even when the statement fails.
    """
    sql = ('create table if not exists poem(model_name varchar(50),poem_name varchar(50),'
           'author_name varchar(50),dynasty varchar(50),content text)')
    db = conn_mysql()
    try:
        # The cursor is released by the context manager.
        with db.cursor() as cursor:
            cursor.execute(sql)
        db.commit()
    finally:
        # Previously this connection was never closed.
        db.close()
def get_html(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Close the response deterministically instead of leaking the socket
    # until garbage collection.
    with request.urlopen(url) as response:
        return response.read().decode('utf-8')
def get_url_list(html):
    """Extract the poem links from a category page.

    Args:
        html: Markup of the category page.

    Returns:
        A list with one entry per matched link, each being the module-level
        ``base_url`` followed by the captured ``href`` value.
    """
    link_pattern = re.compile('<span><a href="(.*?)" target="_blank">.*?</a>.*?</span>')
    return [base_url + href for href in link_pattern.findall(html)]
def get_poem_content(url):
    """Download a poem page and pull out its title, source links and body.

    Args:
        url: Address of the poem page.

    Returns:
        A list of 4-tuples, one per match: the ``<h1>`` text, the text of
        the first and of the second link inside ``<p class="source">``, and
        the raw inner markup of the ``contson`` div. The list is empty when
        the page does not match the expected layout.
    """
    html = get_html(url)
    # Raw strings: "\s" and "\S" are invalid string escapes otherwise, and
    # "\n" is then interpreted by the regex engine, matching a newline
    # exactly as the literal newline character did before.
    pattern = (
        r'<h1 style="font-size:.*?;">(.*?)</h1>\n<p class="source"><a href=".*?">'
        r'(.*?)</a>.*?<a href=".*?">(.*?)</a> </p>\n<div class="contson" id="contson.*?">\n([\s\S]*?)\n</div>'
    )
    return re.findall(pattern, html)
if __name__ == '__main__':
    # Script entry point: walk every category page, scrape each poem it
    # links to and insert one row per poem into the ``poem`` table.

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # supported way to measure elapsed time.
    start = time.perf_counter()
    insert_sql = 'insert into poem(model_name,poem_name,author_name,dynasty,content) values (%s,%s,%s,%s,%s)'
    # Markup stripped from the poem body before it is stored.
    markup = re.compile('<br />|<p>|</p>')
    i = 0  # running count of processed poem links across all categories
    db = conn_mysql()
    try:
        createtable_poem()
        # One cursor for the whole run instead of a new, never-closed
        # cursor per insert.
        with db.cursor() as cursor:
            for page, model_name in get_model_url(shiwen_url):
                html = get_html(base_url + '/' + page + '.aspx')
                for s in get_url_list(html):
                    i += 1
                    matches = get_poem_content(s)
                    # Pages that do not match the expected layout are stored
                    # with the literal text 'NULL' in every scraped column.
                    pl = matches[0] if matches else ['NULL', 'NULL', 'NULL', 'NULL']
                    # Column order is poem_name, author_name, dynasty, so the
                    # second and third captures are swapped on purpose.
                    data = [model_name, pl[0], pl[2], pl[1], markup.sub('', pl[3])]
                    cursor.execute(insert_sql, data)
                    db.commit()
                    # NOTE(review): source indentation was lost; this progress
                    # line is assumed to be printed once per record - confirm.
                    print('Success!!! '+'当前模块 :'+model_name+'-----'+'已导入------'+str(i)+'条数据'+'-----')
    finally:
        # Release the connection even when a request or insert fails.
        db.close()
    print('Success!!!')
    print('End!!!')
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
解析主要使用 re 模块进行正则匹配,实测可用。