Python弹幕情感分析之《我就是演员》
爬取弹幕内容之后，我们对其进行简单的情感分析。
import pandas as pd
import jieba
from matplotlib import pyplot as plt
import matplotlib as mpl
# Use the SimHei font as the default sans-serif font and keep the minus sign
# rendering correctly on the axes.
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the custom dictionary, then register a few extra words so jieba keeps
# them as single tokens.
jieba.load_userdict('stopwords.txt')
for _custom_word in ('韩雪', '张钧甯', '不用猜', '不用想'):
    jieba.add_word(_custom_word)

# Show every column when pandas prints a DataFrame (e.g. in PyCharm).
pd.set_option('display.max_columns', None)

# The CSV is read without a header row; the three columns are named here.
data = pd.read_csv(
    'actor_danmu.csv',
    header=None,
    names=['id', '点赞数', 'Comment'],
)

# Pull the comment column out of the DataFrame as a plain Python list.
comment = list(data['Comment'])
def sent2word(sentence):
    """Segment *sentence* with jieba and drop the stop words.

    Args:
        sentence: The text to segment.

    Returns:
        A list of the segmented words in their original order, excluding
        every word listed in ``stopwords.txt`` and the extra stop word
        ``'不用'``.
    """
    # Read the stop-word file inside a context manager so the handle is
    # closed even if reading fails (the previous code leaked it).
    with open('stopwords.txt', encoding='utf-8-sig') as stopword_file:
        raw_lines = stopword_file.readlines()

    # readlines() keeps the trailing newline of every entry, so strip it.
    # A set makes the per-token membership test O(1) instead of a list scan.
    stop_words = set(strip_blank(raw_lines))
    stop_words.add('不用')  # one additional stop word not in the file

    return [
        word
        for word in jieba.cut(sentence, cut_all=False)
        if word not in stop_words
    ]
def strip_blank(list):
    """Return a new list with newlines trimmed from both ends of each item.

    The input comes from reading a file line by line, so each entry still
    carries its line break. Only ``'\\n'`` is removed; any other whitespace
    is left as it is.
    """
    return [entry.strip('\n') for entry in list]
def _load_word_set(filename):
    """Read one word list from the ``sentiment_word`` directory as a set."""
    import os  # local import so this block does not depend on file-level imports

    # os.path.join yields the same backslash-separated path as before on
    # Windows and a valid path elsewhere. The old literals also relied on
    # unrecognized escape sequences such as '\正'.
    path = os.path.join('sentiment_word', filename)
    with open(path, encoding='utf-8-sig') as word_file:
        # Each line read from the file ends with a newline; strip only that.
        return {line.strip('\n') for line in word_file}


def classify_word(words):
    """Count the positive, negative and negation words in *words*.

    A word is tested against the positive list first, then the negative
    list, then the negation list. When a negation word is found, the word
    right after it is inspected as well: a following negative word adds one
    to the positive count and a following positive word adds one to the
    negative count. That following word is still counted on its own turn.

    Args:
        words: An indexable sequence of segmented words.

    Returns:
        A dict with the keys ``'positive'``, ``'negtive'`` and ``'not'``
        holding the three counts.
    """
    positive_words = _load_word_set('正面情绪词.txt')
    negative_words = _load_word_set('负面情绪词.txt')
    negation_words = _load_word_set('否定词.txt')

    positive = negative = negation = 0
    last_index = len(words) - 1
    for index, word in enumerate(words):
        if word in positive_words:
            positive += 1
        elif word in negative_words:
            negative += 1
        elif word in negation_words:
            negation += 1
            # A negation at the very end has no following word to flip;
            # indexing past the end used to raise IndexError here.
            if index == last_index:
                continue
            following = words[index + 1]
            if following in negative_words:
                positive += 1
            if following in positive_words:
                negative += 1

    return {'positive': positive, 'negtive': negative, 'not': negation}
def main():
    """Score the comments chunk by chunk and plot the score over time.

    The module-level ``comment`` list is cut into consecutive fixed-size
    chunks. Each chunk is joined into one string, segmented with
    ``sent2word`` and counted with ``classify_word``; its score is the
    number of positive words minus the number of negative words. Scoring
    stops at the first chunk that yields no text. The scores are printed,
    plotted, saved to an image file and shown.
    """
    # Number of comments per chunk; each chunk is plotted as one 5-minute step.
    # NOTE(review): presumably 81 comments per 30 s x 2 x 5 minutes - confirm.
    chunk_size = 81 * 2 * 5
    max_chunks = 24  # upper bound on the number of chunks that get scored
    minutes_per_chunk = 5

    score = []  # simple sentiment score: positive count minus negative count
    for i in range(max_chunks):
        chunk = comment[i * chunk_size:(i + 1) * chunk_size]
        # pandas reads empty or "NA"-like cells as NaN, and ''.join() raises
        # TypeError on non-string items. Skip missing values and coerce the
        # rest to str so one odd cell cannot abort the whole run.
        sentence = ''.join(str(text) for text in chunk if pd.notna(text))
        if not sentence:
            break
        words = sent2word(sentence)
        number = classify_word(words)
        score1 = number['positive'] - number['negtive']
        print(score1)
        score.append(score1)

    # x axis: elapsed minutes at the end of each scored chunk (5, 10, ...).
    minutes = [step * minutes_per_chunk for step in range(1, len(score) + 1)]
    plt.plot(minutes, score)
    plt.xlabel('分钟')
    plt.ylabel('情感得分')
    plt.xticks(minutes)  # put one tick on every plotted point
    plt.title('我就是演员第十三期之韩雪夺冠弹幕情感分数')
    plt.savefig('我就是演员')  # save the figure to disk before showing it
    plt.show()
# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()
结果如下: