Scraping Douban Movie Reviews with Python


Environment: Windows 7, Python 2.7, PyCharm 2017.2. Code download: http://download.****.net/download/hunhun1122/10033077
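
The script leans on a handful of third-party packages: BeautifulSoup, jieba, pandas, numpy, matplotlib and wordcloud. Assuming a standard pip setup, something like the following installs them all (the usual PyPI package names):

pip install beautifulsoup4 jieba pandas numpy matplotlib wordcloud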

First, analyze the page to be scraped: on Douban's "now playing" page the film list sits inside a div with id "nowplaying", each film is an li with class "list-item", and each film's reviews live under https://movie.douban.com/subject/<id>/comments.



The code is as follows:

# encoding=utf8
import urllib2
request = urllib2.Request("https://movie.douban.com/nowplaying/hangzhou/")
response = urllib2.urlopen(request)
html_data = response.read().decode('utf-8')
#print html_data
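
# note: Douban sometimes rejects the default urllib2 user agent; if the
# request above comes back with an error page, a browser-style header can be
# supplied instead (illustrative value, not part of the original post):
# request = urllib2.Request("https://movie.douban.com/nowplaying/hangzhou/",
#                           headers={'User-Agent': 'Mozilla/5.0'})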

# extract the films now playing
from bs4 import BeautifulSoup as bs
# 'html.parser' is the HTML parser from the Python standard library
soup = bs(html_data, 'html.parser')
nowplaying_movie = soup.find_all('div', id='nowplaying')
nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
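# find_all always returns a list; this page has exactly one div#nowplaying,
# hence the [0] above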

#print(nowplaying_movie_list[0])

# collect each film's id and name
nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict = {}
    nowplaying_dict['id'] = item['data-subject']
    for tag_img_item in item.find_all('img'):
        nowplaying_dict['name'] = tag_img_item['alt']
    nowplaying_list.append(nowplaying_dict)  # one entry per film
#print(nowplaying_list)
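# nowplaying_list now holds one dict per film, e.g. {'id': <subject id>, 'name': <title>};
# the subject id is what the review URL below is built from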

# =================================== reviews ===================================
# URL of the review page; nowplaying_list[8] picks one film from the list
requrl = 'https://movie.douban.com/subject/' + nowplaying_list[8]['id'] + '/comments' + '?' + 'start=0' + '&limit=30'
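# start/limit page through the reviews 30 at a time; a sketch (not in the
# original script) for pulling several pages would bump start each round:
# for start in (0, 30, 60):
#     page_url = ('https://movie.douban.com/subject/' + nowplaying_list[8]['id']
#                 + '/comments?start=%d&limit=30' % start)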

request2 = urllib2.Request(requrl)
response2 = urllib2.urlopen(request2)
html_data = response2.read().decode('utf-8')
soup = bs(html_data, 'html.parser')
comment_div_list = soup.find_all('div', class_='comment')
# parse the comments
eachCommentList = []
for item in comment_div_list:
    if item.find_all('p')[0].string is not None:
        # store as utf-8 bytes
        eachCommentList.append(item.find_all('p')[0].string.encode("utf8"))
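
# .string is None whenever a <p> contains nested tags, so those comments are skipped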

#print(eachCommentList)


# ================================ data cleaning ================================
# 1. join all comments into one string
comments = ''
for comment in eachCommentList:
    comments = comments + str(comment).strip()
#print comments
# 2. keep only the Chinese characters (drops punctuation and everything else)
import re
pattern = re.compile(ur'[\u4e00-\u9fa5]+')
filterdata = re.findall(pattern, comments.decode("utf8"))
cleaned_comments = ''.join(filterdata)
#print  cleaned_comments
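# e.g. re.findall(pattern, u'好看!!太好看了...') returns [u'好看', u'太好看了']
# (illustrative input; punctuation and non-Chinese characters are dropped)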

# ================================ word segmentation ================================

# tokenize with jieba
import jieba
import pandas as pd
segment = jieba.lcut(cleaned_comments)
words_df = pd.DataFrame({'segment': segment})
# filter out stop words
stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'],
                        encoding='utf-8')  # quoting=3 means QUOTE_NONE
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
#print words_df.head()
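
# stopwords.txt is an external one-word-per-line file (any of the common
# Chinese stop-word lists will do) and must sit in the working directory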

# ===================== word frequency counts ==============================
import numpy
# count each token's occurrences, using numpy.size as the aggregator
words_stat = words_df.groupby(by=['segment'])['segment'].agg({"count": numpy.size})
words_stat = words_stat.reset_index().sort_values(by=["count"], ascending=False)

#print words_stat.head()
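# words_stat is now a two-column frame (segment, count) sorted by count in
# descending order; its top 1000 rows feed the word cloud below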

# ==================== word-cloud rendering ================
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import matplotlib.pyplot as plt
# %matplotlib inline  (only needed inside Jupyter)
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud  # word-cloud package

wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)  # Chinese-capable font, background color, maximum font size

# word_frequence is a dict, which can be passed straight to wordcloud.fit_words()
word_frequence = {x[0]:x[1] for x in words_stat.head(1000).values}
wordcloud = wordcloud.fit_words(word_frequence)
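# note (version-dependent): very old wordcloud releases expected fit_words()
# to receive a list of (word, weight) tuples instead of a dict; if this line
# raises a TypeError, try wordcloud.fit_words(list(word_frequence.items()))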
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
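
The cloud only appears in an interactive window. To also keep a copy on disk, wordcloud's to_file method can be tacked onto the end of the script (a small addition, not part of the original post):

wordcloud.to_file("douban_wordcloud.png")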