# RAKE 中文分词与关键词提取 (RAKE-based Chinese word segmentation and keyword extraction)
import jieba
import jieba.posseg as pseg
import operator
import json
from collections import Counter
# Data structure holding per-term RAKE statistics (frequency and degree)
class Word():
    """Per-term RAKE statistics: the term text, its frequency, and its degree."""

    def __init__(self, char, freq = 0, deg = 0):
        # `char` is the term itself; freq/deg start at zero unless given.
        self.char = char
        self.freq = freq
        self.deg = deg

    def returnScore(self):
        """RAKE word score: degree divided by frequency."""
        return self.deg / self.freq

    def updateOccur(self, phraseLength):
        """Record one occurrence inside a phrase of `phraseLength` words."""
        self.freq += 1
        self.deg += phraseLength

    def getChar(self):
        """Return the stored term text."""
        return self.char

    def updateFreq(self):
        """Increment the occurrence count by one."""
        self.freq += 1

    def getFreq(self):
        """Return the occurrence count."""
        return self.freq
# Check whether a token is free of ASCII letters and digits
def notNumStr(instr):
    """Return True when `instr` contains no ASCII letter (A-Z, a-z) and no digit."""
    # '\u0041'-'\u005a' is 'A'-'Z' and '\u0061'-'\u007a' is 'a'-'z';
    # isdigit() also catches non-ASCII digit characters, as before.
    contains_alnum = any(
        ('A' <= ch <= 'Z') or ('a' <= ch <= 'z') or ch.isdigit()
        for ch in instr
    )
    return not contains_alnum
# Read Target Case if Json
def readSingleTestCases(testFile):
    """Return the concatenated utterance text of a JSON transcript file.

    `testFile` holds a JSON list of items keyed either 'text' or 'statement'.
    Two malformed-input cases are tolerated:
      * files that use single quotes instead of valid JSON double quotes;
      * empty / unparseable transcripts, for which "" is returned.
    """
    with open(testFile) as json_data:
        try:
            testData = json.load(json_data)
        except ValueError:
            # Incorrect JSON, typically ' instead of ". json.load has already
            # consumed the stream, so rewind before re-reading — the original
            # code read from EOF here and always got an empty string.
            json_data.seek(0)
            data = json_data.read().replace("'", '"')
            try:
                testData = json.loads(data)
            except ValueError:
                # Empty or irreparably broken transcript file.
                return ""
    returnString = ""
    for item in testData:
        try:
            returnString += item['text']
        except KeyError:
            # Some transcripts label the utterance 'statement' instead of 'text'.
            returnString += item['statement']
    return returnString
def run(rawText):
    """Extract the top-10 RAKE keyword phrases from Chinese `rawText`.

    Pipeline: POS-segment the text with jieba, split it into candidate
    phrases at stopwords / delimiter words / filtered POS tags, score each
    word as degree/frequency, then score each phrase as the sum of its
    word scores. Returns up to 10 (phrase, score) pairs, best first.
    Reads the lexicon files "sp.txt" (stopwords) and "spw.txt" (delimiters)
    from the working directory.
    """
    # Load the stopword and phrase-delimiter lexicons. Context managers
    # close the handles promptly (the original leaked both file objects).
    with open(r"sp.txt", 'r', encoding='utf-8') as swFile:
        swLibList = [line.rstrip('\n') for line in swFile]
    with open(r"spw.txt", 'r', encoding='utf-8') as conjFile:
        conjLibList = [line.rstrip('\n') for line in conjFile]

    # jieba POS tags treated as phrase breakers — presumably numerals,
    # non-morphemes, particles, classifiers, verbs, directionals; confirm
    # against the jieba tag set if this list is tuned.
    poSPrty = ['m', 'x', 'uj', 'ul', 'mq', 'u', 'v', 'f']

    # Pass 1: build textList — meaningful words separated by '|' markers —
    # and register each distinct word. (The unused `checklist` accumulator
    # from the original was removed.)
    textList = []
    listofSingleWord = dict()
    lastWord = ''
    meaningfulCount = 0
    for eachWord, flag in pseg.cut(rawText):
        if (eachWord in conjLibList or not notNumStr(eachWord)
                or eachWord in swLibList or flag in poSPrty or eachWord == '\n'):
            # Delimiter: collapse runs so textList never holds consecutive '|'.
            if lastWord != '|':
                textList.append("|")
                lastWord = "|"
        elif eachWord not in swLibList and eachWord != '\n':
            textList.append(eachWord)
            meaningfulCount += 1
            if eachWord not in listofSingleWord:
                listofSingleWord[eachWord] = Word(eachWord)
            lastWord = ''

    # Pass 2: group consecutive meaningful words into candidate phrases.
    newList = []
    tempList = []
    for everyWord in textList:
        if everyWord != '|':
            tempList.append(everyWord)
        else:
            if tempList:  # skip empty runs (e.g. a leading delimiter)
                newList.append(tempList)
            tempList = []
    if tempList:
        # BUGFIX: the original dropped a phrase at the very end of the text
        # (no trailing delimiter); flush it so it is counted and scored too.
        newList.append(tempList)

    # Register every phrase under a 'w1|w2|...' key first, so the counting
    # pass below records exactly one freq increment per occurrence.
    for everyPhrase in newList:
        phraseKey = '|'.join(everyPhrase)
        if phraseKey not in listofSingleWord:
            listofSingleWord[phraseKey] = Word(phraseKey)

    # Pass 3: update word degree/frequency and phrase occurrence counts.
    for everyPhrase in newList:
        for everyWord in everyPhrase:
            listofSingleWord[everyWord].updateOccur(len(everyPhrase))
        listofSingleWord['|'.join(everyPhrase)].updateFreq()

    # Pass 4: score candidate phrases as the sum of their word scores.
    outputList = dict()
    for everyPhrase in newList:
        if len(everyPhrase) > 5:
            continue  # RAKE heuristic: overly long phrases are noise
        phraseKey = '|'.join(everyPhrase)
        freq = listofSingleWord[phraseKey].getFreq()
        # Drop phrases that are both relatively rare (< 1% of meaningful
        # words) and absolutely rare (fewer than 3 occurrences).
        if freq / meaningfulCount < 0.01 and freq < 3:
            continue
        score = sum(listofSingleWord[w].returnScore() for w in everyPhrase)
        outputList[''.join(everyPhrase)] = score

    sorted_list = sorted(outputList.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:10]
if __name__ == '__main__':
    # Demo: read the first 100 lines of a sample industry document and
    # print the extracted keyword phrases.
    with open(r'E:\xkkAI\dazhao\industry\行业_34_.txt', 'r') as fp:
        # readline() returns '' past EOF, so short files are handled too.
        text = ''.join(fp.readline() for _ in range(100))
        print(text)
        result = run(text)
        print(result)
RAKE(Rapid Automatic keyword extraction) 介绍
RAKE算法思想
RAKE算法用来做关键词(keyword)的提取,实际上提取的是关键的短语(phrase),并且倾向于较长的短语,在英文中,关键词通常包括多个单词,但很少包含标点符号和停用词,例如and,the,of等,以及其他不包含语义信息的单词。
RAKE算法首先使用标点符号(如半角的句号、问号、感叹号、逗号等)将一篇文档分成若干分句,然后对于每一个分句,使用停用词作为分隔符将分句分为若干短语,这些短语作为最终提取出的关键词的候选词。
我们注意到,每个短语可以再通过空格分为若干个单词,可以通过给每个单词赋予一个得分,通过累加得到每个短语的得分。一个关键点在于将这个短语中每个单词的共现关系考虑进去。最终定义的公式是:
- wordScore = wordDegree(w) / wordFrequency(w)
即单词w的得分是该单词的度(是一个网络中的概念,每与一个单词共现在一个短语中,度就加1,考虑该单词本身)除以该单词的词频(该单词在该文档中出现的总次数)。
然后对于每个候选的关键短语,将其中每个单词的得分累加,并进行排序,RAKE将得分最高的、约占候选短语总数前三分之一的短语认为是抽取出的关键词。
另外,值得说明的是,关于分数计算这部分,wordDegree(w)实际上等于word和每一个phrase里面的词共现的次数加上word的frequency。具体算法请看附件论文《Automatic Keyword Extraction from Individual Documents》。