序言
- 相关基础知识:gensim、基于TFIDF的文本相似度↓
- https://blog.csdn.net/Yellow_python/article/details/81021142
- 语料下载地址:
- https://download.csdn.net/download/yellow_python/11019947
极简代码
【TF-IDF】版
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
from numpy import argsort
# Load the corpus: the file is split on whitespace, one entry per document.
with open('中国行政区划.txt', encoding='utf-8') as f:
    texts = f.read().split()

# Character-level tokenisation: each entry becomes the list of its characters.
char_docs = [list(entry) for entry in texts]
dictionary = Dictionary(char_docs)
num_features = len(dictionary.token2id)

# Bag-of-words vectors, the TF-IDF model fitted on them, and a similarity
# index built over the TF-IDF-weighted corpus.
corpora = [dictionary.doc2bow(chars) for chars in char_docs]
tfidf = TfidfModel(corpora)
index = SparseMatrixSimilarity(tfidf[corpora], num_features)

# Interactive loop: for every query, print the five best-scoring entries.
while True:
    query = input('输入:').strip()
    query_bow = dictionary.doc2bow(list(query))
    scores = index[tfidf[query_bow]]
    # Negating the scores makes argsort yield indices by descending score.
    for pos in argsort(-scores)[:5]:
        print(texts[pos])
【TF-IDF+规则】版
from os.path import exists
import pickle
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
from numpy import argsort
# Load the corpus: the file is split on whitespace, one entry per document.
with open('中国行政区划.txt', encoding='utf-8') as f:
    texts = f.read().split()

# The fitted dictionary and TF-IDF model are cached on disk so that later
# runs can skip training.
PATH = 'model.pickle'
if exists(PATH):
    # NOTE(review): pickle.load can execute arbitrary code from the file, so
    # PATH must only ever hold a cache written by this script. The cache is
    # not invalidated when the text file changes; delete it to retrain.
    with open(PATH, 'rb') as f:
        dictionary, tfidf = pickle.load(f)
else:
    # Character-level tokenisation: each entry becomes a list of characters.
    corpora = [list(t) for t in texts]
    dictionary = Dictionary(corpora)
    tfidf = TfidfModel(dictionary.doc2bow(c) for c in corpora)
    with open(PATH, 'wb') as f:
        pickle.dump((dictionary, tfidf), f)
num_features = len(dictionary.token2id)

# Interactive loop. Rule: only entries that contain the keyword verbatim are
# candidates; the candidates are then ranked by TF-IDF similarity.
while True:
    kw = input('输入:').strip()
    if not kw:
        # '' is a substring of every entry, so an empty query would index the
        # whole corpus only to rank all-zero scores; ask again instead.
        continue
    texts_kw = [t for t in texts if kw in t]
    if not texts_kw:
        # No entry contains the keyword: there is nothing to rank, and this
        # avoids building a similarity index over an empty corpus.
        continue
    kw_vec = dictionary.doc2bow(list(kw))
    corpora_kw = [dictionary.doc2bow(list(t)) for t in texts_kw]
    # The index is rebuilt per query because the candidate set changes.
    index = SparseMatrixSimilarity(tfidf[corpora_kw], num_features)
    sim = index[tfidf[kw_vec]]
    # Negating the scores makes argsort yield indices by descending score.
    ids = argsort(-sim)[:5]
    for i in ids:
        print(texts_kw[i])
效果