Scraping Douban Movie Reviews with Python
Environment: Windows 7, Python 2.7, PyCharm 2017.2. Code download: http://download.****.net/download/hunhun1122/10033077
First, analyze the content of the page to be scraped (the "now playing" list on movie.douban.com):
The code is as follows:
# encoding=utf8
# Fetch the "now playing" page for Hangzhou (Python 2.7 / urllib2)
import urllib2

request = urllib2.Request("https://movie.douban.com/nowplaying/hangzhou/")
response = urllib2.urlopen(request)
html_data = response.read().decode('utf-8')
#print html_data

# Extract the movies currently in theaters
from bs4 import BeautifulSoup as bs  # 'html.parser' is the HTML parser from the Python standard library

soup = bs(html_data, 'html.parser')
nowplaying_movie = soup.find_all('div', id='nowplaying')
nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
#print(nowplaying_movie_list[0])

# Collect each movie's id and name
nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict = {}
    nowplaying_dict['id'] = item['data-subject']
    for tag_img_item in item.find_all('img'):
        nowplaying_dict['name'] = tag_img_item['alt']
    nowplaying_list.append(nowplaying_dict)  # one entry per movie
#print(nowplaying_list)

#=================== Comments ===================
# URL of the first comment page for the 9th movie in the list
requrl = ('https://movie.douban.com/subject/' + nowplaying_list[8]['id'] +
          '/comments' + '?' + 'start=0' + '&limit=30')
resp = urllib2.Request(requrl)
response2 = urllib2.urlopen(resp)
html_data = response2.read().decode('utf-8')
soup = bs(html_data, 'html.parser')
comment_div_list = soup.find_all('div', class_='comment')

# Parse the comment text out of each comment div
eachCommentList = []
for item in comment_div_list:
    if item.find_all('p')[0].string is not None:
        # store as utf-8 bytes
        eachCommentList.append(item.find_all('p')[0].string.encode("utf8"))
#print(eachCommentList)

#=================== Data cleaning ===================
# 1. Join all comments into one string
comments = ''
for k in range(len(eachCommentList)):
    comments = comments + (str(eachCommentList[k])).strip()
#print comments

# 2. Keep only the Chinese characters, dropping punctuation and everything else
import re

pattern = re.compile(ur'[\u4e00-\u9fa5]+')
filterdata = re.findall(pattern, comments.decode("utf8"))
cleaned_comments = ''.join(filterdata)
#print cleaned_comments

#=================== Word segmentation ===================
# Segment the cleaned text with jieba
import jieba
import pandas as pd

segment = jieba.lcut(cleaned_comments)
words_df = pd.DataFrame({'segment': segment})

# Filter out stop words (quoting=3 disables quote handling entirely)
stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3,
                        sep="\t", names=['stopword'], encoding='utf-8')
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
#print words_df.head()

#=================== Word frequency ===================
import numpy

# "计数" means "count"; the dict form of agg() names the aggregate column
# (this form only works on the old pandas releases of the Python 2 era)
words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)
#print words_stat.head()

#=================== Word cloud ===================
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 workaround to make utf-8 the default encoding

import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

from wordcloud import WordCloud  # word cloud package

# Specify the font (required for Chinese glyphs), background color and max font size
wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                      max_font_size=80)

# word_frequence is a dict and can be passed straight to wordcloud.fit_words()
word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
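The request above fetches only the first page of comments (start=0). Later pages are reached by advancing the start offset in steps of the limit value. Below is a minimal sketch in the same Python 2 style, assuming the variables from the script above are still in scope; note that Douban caps how many comment pages anonymous clients may fetch, so the page range is kept small here.

# Sketch: fetch several comment pages by advancing the `start` offset.
# Assumes the urllib2/bs setup and nowplaying_list from the script above;
# Douban limits anonymous access to comments, so keep the range small.
all_comments = []
for start in range(0, 90, 30):  # pages at start=0, 30, 60
    url = ('https://movie.douban.com/subject/' + nowplaying_list[8]['id'] +
           '/comments?start=' + str(start) + '&limit=30')
    page = urllib2.urlopen(urllib2.Request(url)).read().decode('utf-8')
    page_soup = bs(page, 'html.parser')
    for div in page_soup.find_all('div', class_='comment'):
        p = div.find_all('p')[0]
        if p.string is not None:
            all_comments.append(p.string.encode('utf8'))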
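A caveat on the word-frequency step: the dict form of agg() used above was deprecated in pandas 0.20 and removed in later releases. On a current pandas the same count table can be built with value_counts(); here is a minimal sketch, with a small sample list standing in for the jieba output.

# Same word-frequency table on a modern pandas; value_counts() replaces the
# removed dict form of groupby().agg(). The sample list is a stand-in for
# the jieba segmentation result above.
import pandas as pd

segment = ['好看', '剧情', '好看']
words_df = pd.DataFrame({'segment': segment})

words_stat = (words_df['segment'].value_counts()
              .rename_axis('segment')
              .reset_index(name='count'))
print(words_stat)  # segment / count, sorted by count descending

In recent wordcloud releases, fit_words() is kept as an alias of generate_from_frequencies(), so the dict of counts built from this table can still be passed in unchanged.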
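Finally, since Python 2 and urllib2 are end-of-life, the fetch-and-parse step ports to Python 3 in a few lines. The sketch below assumes the third-party requests and beautifulsoup4 packages are installed and that Douban still serves the same #nowplaying markup; the User-Agent header is an assumption, added because Douban tends to reject the default one.

# Python 3 sketch of the fetch-and-parse step; assumes `requests` and
# `beautifulsoup4` are installed and the page markup is unchanged.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # assumed header; the default UA may be blocked
html = requests.get('https://movie.douban.com/nowplaying/hangzhou/',
                    headers=headers).text
soup = BeautifulSoup(html, 'html.parser')

nowplaying = []
for li in soup.find('div', id='nowplaying').find_all('li', class_='list-item'):
    img = li.find('img')
    nowplaying.append({'id': li['data-subject'],
                       'name': img['alt'] if img else ''})
print(nowplaying[:3])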