Python Project: Enterprise Approval Workflow Performance Analysis, Application 2
Building on the source code from the previous post, Python Project: Enterprise Approval Workflow Performance Analysis, Application 1, I found that the word-frequency statistics needed some correction.
1. When doing the actual counting, relying only on the standard nouns in the built-in dictionary is not enough to explain the problems; we want the text to be segmented automatically according to our own custom phrases.
The jieba library supports loading a user-defined dictionary:
jieba.load_userdict("newdit.txt")
File format:
派遣单 4 n
支付方式 4 n
预算编码 4 n
Here the letter indicates the word's part of speech (n for noun) and the number is the expected word frequency.
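To verify that the custom phrases actually take effect, a quick check like the one below can help (the sample sentence is made up for illustration and is not from the project data):

import jieba

print("/".join(jieba.cut("请补充预算编码后重新提交")))  # without the user dict, 预算编码 may be split into 预算/编码
jieba.load_userdict("newdit.txt")
print("/".join(jieba.cut("请补充预算编码后重新提交")))  # 预算编码 is now kept as a single token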
2. For example, 预算编码 (budget code) and WBS编码 (WBS code) refer to the same thing, and some approvers may simply write WBS. Python string comparison is case-sensitive, so wbs and WBS would be counted as different words. The algorithm should therefore aggregate such terms into one count. Likewise, 申请人要求回退 (the applicant requested the rollback) and 本人要求回退 (I requested the rollback) give the same reason and should be grouped into one category; an example of the mapping file is shown below.
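The synonym groups live in unionWords.txt. Judging from the split("*") parsing in the adjusted program below, each line lists the canonical term first, followed by its aliases, separated by "*". A hypothetical example covering the cases above:

预算编码*WBS编码*WBS*wbs
申请人要求回退*本人要求回退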
The program is adjusted as follows:
#bpmRejectAnalyzeV1.py
import jieba
import jieba.posseg as pseg
from os import path
from scipy.misc import imread  # removed in SciPy 1.2+; imageio.imread is a drop-in substitute
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def getTxt(txt):
    # read the rejection comments, one record per line
    with open(txt, 'r', encoding='utf-8') as f:
        reject_list = f.readlines()
    return reject_list

def segmentWords(txtlist):
    stop_words = set(line.strip() for line in open('stopwords.txt', encoding='utf-8'))
    newslist = []
    # load the user-defined dictionary so custom phrases are segmented as whole words
    jieba.load_userdict("newdit.txt")
    for subject in txtlist:
        if subject.isspace():
            continue
        word_list = pseg.cut(subject)
        for word, flag in word_list:
            # keep nouns and English tokens (e.g. WBS) that are not stop words
            if word not in stop_words and (flag == 'n' or flag == 'eng'):
                newslist.append(word)
    # merge the configured synonym groups: each line of unionWords.txt is "canonical*alias1*alias2..."
    for line in open('unionWords.txt', encoding='utf-8'):
        newline = line.encode('utf-8').decode('utf-8-sig')  # strip the BOM (\ufeff)
        unionlist = newline.strip().split("*")
        for j in range(1, len(unionlist)):
            #wordDict[unionlist[0]] += wordDict.pop(unionlist[j],0)
            for index, value in enumerate(newslist):
                if value == unionlist[j]:
                    newslist[index] = unionlist[0]
    return newslist

def countWords(newslist):
    wordDict = {}
    for item in newslist:
        wordDict[item] = wordDict.get(item, 0) + 1
    itemList = list(wordDict.items())
    itemList.sort(key=lambda x: x[1], reverse=True)
    # print the top 100 words (or all of them, if there are fewer)
    for i in range(min(100, len(itemList))):
        word, count = itemList[i]
        print("{}:{}".format(word, count))

def drawPlant(newslist):
    d = path.dirname(__file__)
    mask_image = imread(path.join(d, "mickey.png"))  # note: pass mask=mask_image to WordCloud to shape the cloud
    content = ' '.join(newslist)
    wordcloud = WordCloud(font_path='simhei.ttf', background_color="white", max_words=40).generate(content)
    # Display the generated image:
    plt.imshow(wordcloud)
    plt.axis("off")
    wordcloud.to_file('wordcloud.jpg')
    plt.show()

def main():
    txtlist = getTxt('P001.txt')
    print(len(txtlist))
    wordlist = segmentWords(txtlist)
    countWords(wordlist)
    drawPlant(wordlist)

main()
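The commented-out line inside the merge loop (wordDict[unionlist[0]] += wordDict.pop(unionlist[j],0)) hints at an alternative design: merge the synonym counts in the frequency dictionary after counting, instead of rewriting the token list. A minimal sketch of that variant (not part of the published script) could be:

def mergeCounts(wordDict, unionFile='unionWords.txt'):
    # fold each alias's count into its canonical term
    for line in open(unionFile, encoding='utf-8'):
        unionlist = line.encode('utf-8').decode('utf-8-sig').strip().split("*")
        for alias in unionlist[1:]:
            wordDict[unionlist[0]] = wordDict.get(unionlist[0], 0) + wordDict.pop(alias, 0)
    return wordDict

This avoids scanning the whole token list once per alias, at the cost of merging only after the counts already exist.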
The word cloud is much better than before; the problems can now be seen at a glance: