Python中Counter用法实例
爬虫案例,对美国总统的一篇演讲稿分析
"""Analyze a US presidential speech with collections.Counter (web-scraping demo).

Key points:
1. Counter usage: frequency analysis, similar in spirit to tf-idf word
   counting; commonly used methods are ``subtract`` and ``update``.
2. ``'delimiter'.join(words)``: concatenates the items of ``words`` into a
   single string separated by ``delimiter`` (note: it joins, it does not split).
"""
from urllib.request import urlopen
import re
import string
from collections import Counter


def cleanSentence(sentence):
    """Split *sentence* on spaces and return the cleaned list of words.

    Punctuation and whitespace are stripped from both ends of each word;
    only words longer than one character (or the valid one-letter English
    words 'a' / 'i') are kept.
    """
    words = sentence.split(' ')
    # string.punctuation + string.whitespace covers every character to trim
    # from the edges of each word.
    words = [word.strip(string.punctuation + string.whitespace) for word in words]
    # Keep words longer than 1 char, plus the one-letter words 'a' and 'i'.
    words = [word for word in words
             if len(word) > 1 or word.lower() in ('a', 'i')]
    print('cleanSentence>', words)  # debug trace
    return words


def cleanInput(content):
    """Normalize raw text and return it as a list of cleaned sentences.

    Upper-cases the text, flattens newlines, drops non-ASCII characters
    (acceptable for English-only analysis), then splits on '. ' and cleans
    each resulting sentence with cleanSentence().
    """
    content = content.upper()
    # Replace newlines with spaces so sentences are not broken across lines.
    content = re.sub('\n', ' ', content)
    # Round-trip through ASCII with errors ignored: silently discards any
    # non-ASCII characters — only suitable for English text.
    content = bytes(content, 'utf-8')
    content = content.decode('ascii', 'ignore')
    sentences = content.split('. ')
    print('cleanInput>', sentences)  # debug trace
    return [cleanSentence(sentence) for sentence in sentences]


def getNgramFromSentence(content, n):
    """Return all n-grams (consecutive length-n slices) of the word list *content*."""
    output = [content[i:i + n] for i in range(len(content) - n + 1)]
    print('getNgramFromSentence>', output)  # debug trace
    return output


def getNgrams(content, n):
    """Return a Counter mapping each n-gram string to its frequency in *content*."""
    sentences = cleanInput(content)
    ngrams = Counter()
    for sentence in sentences:
        # BUG FIX: the n-gram size was hard-coded to 2 here, silently
        # ignoring the caller's n parameter; pass n through instead.
        ngrams.update(' '.join(ngram) for ngram in getNgramFromSentence(sentence, n))
    return ngrams


if __name__ == '__main__':
    # Fetch the inauguration speech and print its bigram frequencies.
    content = str(urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt').read(), 'utf-8')
    ngrams = getNgrams(content, 2)
    print(ngrams)
Python中Counter用法案例