Python 爬虫爬取古诗词内容,并存入 MySQL

python爬虫爬取古诗词内容,并存入mysql

python爬虫爬取古诗词内容,并存入mysql
爬取结果展示:
python爬虫爬取古诗词内容,并存入mysql
代码如下:

from urllib import request
import re,os
import pymysql
import time

# Index page whose anchors point at the per-category listing pages.
shiwen_url = "https://www.gushiwen.org/shiwen/"
# Site root that the relative poem links scraped from a listing page are
# appended to.
base_url = 'https://so.gushiwen.org'

def get_model_url(shiwen_url):
    """Collect the category links found on the index page.

    Args:
        shiwen_url: URL of the index page to download.

    Returns:
        A list of ``(path, title)`` tuples, one per matching anchor.
        ``path`` is the part of the href between
        ``https://so.gushiwen.org/`` and ``.aspx``; ``title`` is the
        anchor text.
    """
    # Raw string: "\." inside a plain literal is an invalid escape sequence
    # (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
    pattern = r'<a href="https://so\.gushiwen\.org/(.*?)\.aspx">(.*?)</a>'
    return re.findall(pattern, get_html(shiwen_url))

def conn_mysql():
    """Open a new connection to the MySQL database that stores the poems.

    Returns:
        An open ``pymysql`` connection. The caller is responsible for
        closing it.
    """
    # NOTE(review): host and credentials are hard-coded in source; consider
    # loading them from configuration or the environment instead.
    host = '49.4.71.22'
    username = 'admin'
    password = 'admin963'
    dbname = 'test'
    # PyMySQL 1.0+ only accepts keyword arguments for connect(); the old
    # positional form raises TypeError there.
    return pymysql.connect(
        host=host,
        user=username,
        password=password,
        database=dbname,
        # The stored text is Chinese, so request a full-Unicode charset
        # explicitly rather than relying on the driver's default.
        charset='utf8mb4',
    )

def createtable_poem():
    """Create the ``poem`` table if it does not exist yet.

    Opens its own connection through ``conn_mysql()``, runs the DDL
    statement, commits, and always closes the connection afterwards.
    """
    sql = ('create table if not exists poem(model_name varchar(50),poem_name varchar(50),'
           'author_name varchar(50),dynasty varchar(50),content text)')
    db = conn_mysql()
    try:
        # The cursor is closed when the ``with`` block exits.
        with db.cursor() as cursor:
            cursor.execute(sql)
        db.commit()
    finally:
        # Previously this connection was never closed and leaked on every call.
        db.close()
def get_html(url):
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: Any URL accepted by ``urllib.request.urlopen``.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Close the response deterministically instead of leaving it to the
    # garbage collector.
    with request.urlopen(url) as response:
        return response.read().decode('utf-8')

def get_url_list(html):
    """Extract poem links from a listing page and prefix them with the site root.

    Args:
        html: Source of a listing page.

    Returns:
        A list with ``base_url`` prepended to every href captured from a
        ``<span><a href="..." target="_blank">`` anchor, in document order.
    """
    link_pattern = re.compile('<span><a href="(.*?)" target="_blank">.*?</a>.*?</span>')
    return [base_url + href for href in link_pattern.findall(html)]

def get_poem_content(url):
    """Download a poem page and extract its title, source links and body.

    Args:
        url: URL of a single poem page.

    Returns:
        A list of 4-tuples ``(title, first_link_text, second_link_text,
        body)``, one per match; empty when the page does not match the
        expected markup. The script's main loop stores the first link text
        as the dynasty and the second as the author. ``body`` is the raw
        inner HTML of the ``contson`` div.
    """
    html = get_html(url)
    # Raw strings: "\s" and "\S" in a plain literal are invalid escape
    # sequences (SyntaxWarning since Python 3.12). In a raw string the regex
    # engine reads "\n" as a newline, so the pattern matches exactly as before.
    pattern = (
        r'<h1 style="font-size:.*?;">(.*?)</h1>\n<p class="source"><a href=".*?">'
        r'(.*?)</a>.*?<a href=".*?">(.*?)</a> </p>\n<div class="contson" id="contson.*?">\n([\s\S]*?)\n</div>'
    )
    return re.findall(pattern, html)

if __name__ == '__main__':
    # Crawl every category page, fetch each poem it links to, and insert one
    # row per poem into the ``poem`` table.
    #
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    start = time.perf_counter()
    db = conn_mysql()
    try:
        createtable_poem()
        sql = 'insert into poem(model_name,poem_name,author_name,dynasty,content) values (%s,%s,%s,%s,%s)'
        i = 0  # running count of inserted rows across all categories
        for page_path, model_name in get_model_url(shiwen_url):
            url = 'https://so.gushiwen.org/' + page_path + '.aspx'
            html = get_html(url)
            for s in get_url_list(html):
                i += 1
                LL = get_poem_content(s)
                # A page that does not match the expected markup is still
                # recorded, using the literal string 'NULL' for every field.
                pl = LL[0] if LL else ['NULL', 'NULL', 'NULL', 'NULL']
                # pl is (title, first link text, second link text, body);
                # the second link text is stored as the author and the first
                # as the dynasty.
                data = [model_name, pl[0], pl[2], pl[1], re.sub('<br />|<p>|</p>', '', pl[3])]
                with db.cursor() as cursor:
                    cursor.execute(sql, data)
                # Commit per row so progress survives a mid-run failure.
                db.commit()
                print('Success!!!   '+'当前模块 :'+model_name+'-----'+'已导入------'+str(i)+'条数据'+'-----')
    finally:
        # Close the connection even when a download or insert raises.
        db.close()
    print('Success!!!')
    print('End!!!')
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))

解析主要使用 re 模块进行正则匹配,实测可用。