gensim【极简】搜索引擎

序言

相关基础知识:gensim、基于 TF-IDF 的文本相似度↓
https://blog.****.net/Yellow_python/article/details/81021142
语料下载地址:
https://download.****.net/download/yellow_python/11019947

极简代码

【TF-IDF】版

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
from numpy import argsort

# Load the address corpus: the file is split on whitespace, one address per
# resulting entry.
with open('中国行政区划.txt', encoding='utf-8') as f:
    texts = f.read().split()

# Build the TF-IDF model. Each address is tokenised into single characters,
# so no word segmenter is required.
corpora = [list(text) for text in texts]
dictionary = Dictionary(corpora)
num_features = len(dictionary.token2id)
corpora = [dictionary.doc2bow(c) for c in corpora]
tfidf = TfidfModel(corpora)
index = SparseMatrixSimilarity(tfidf[corpora], num_features)

# Interactive search loop: read a query, print up to 5 most similar addresses.
while True:
    try:
        kw = input('输入:').strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the session cleanly instead of with a traceback.
        break
    kw_vec = dictionary.doc2bow(list(kw))
    if not kw_vec:
        # Blank query, or none of its characters occur in the corpus: every
        # similarity would be 0 and the "top 5" would be arbitrary entries.
        continue
    sim = index[tfidf[kw_vec]]
    ids = argsort(-sim)[:5]  # indices sorted by descending similarity, top 5
    for i in ids:
        # Skip entries that share nothing with the query (similarity 0).
        if sim[i] > 0:
            print(texts[i])

【TF-IDF+规则】版

from os.path import exists
import pickle
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
from numpy import argsort

# Load the address corpus: the file is split on whitespace, one address per
# resulting entry.
with open('中国行政区划.txt', encoding='utf-8') as f:
    texts = f.read().split()

# TF-IDF model: reuse the cached (dictionary, tfidf) pair when present,
# otherwise build it from the corpus and cache it.
# NOTE: the cache is never invalidated; delete the file if the corpus changes.
PATH = 'model.pickle'
if exists(PATH):
    # NOTE(security): unpickling can execute arbitrary code. Only load a
    # cache file that this script wrote itself.
    with open(PATH, 'rb') as f:
        dictionary, tfidf = pickle.load(f)
else:
    # Character-level tokens: each address becomes a list of its characters.
    corpora = [list(t) for t in texts]
    dictionary = Dictionary(corpora)
    tfidf = TfidfModel(dictionary.doc2bow(c) for c in corpora)
    with open(PATH, 'wb') as f:
        pickle.dump((dictionary, tfidf), f)

# Interactive search loop: substring filter first, then TF-IDF ranking of the
# surviving candidates.
num_features = len(dictionary.token2id)
while True:
    try:
        kw = input('输入:').strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the session cleanly instead of with a traceback.
        break
    if not kw:
        # '' is a substring of every text, so an empty query would re-index
        # the whole corpus and print five arbitrary addresses.
        continue
    texts_kw = [t for t in texts if kw in t]  # rule-based (substring) filter
    if not texts_kw:
        # Nothing contains the query: avoid building an index over an empty
        # corpus and simply prompt again.
        continue
    kw_vec = dictionary.doc2bow(list(kw))
    corpora_kw = [dictionary.doc2bow(list(t)) for t in texts_kw]
    index = SparseMatrixSimilarity(tfidf[corpora_kw], num_features)
    sim = index[tfidf[kw_vec]]  # TF-IDF similarity against the candidates
    ids = argsort(-sim)[:5]  # indices sorted by descending similarity, top 5
    for i in ids:
        print(texts_kw[i])

效果

gensim【极简】搜索引擎