Lucene Suggest
Keyword search suggestions with Lucene
Required JARs
lucene-suggest-4.7.0.jar
lucene-queryparser-4.7.0.jar
lucene-misc-4.7.0.jar
lucene-memory-4.7.0.jar
lucene-highlighter-4.7.0.jar
lucene-core-4.7.0.jar
lucene-analyzers-common-4.7.0.jar
Analyzer (Chinese tokenizer)
IKAnalyzer2012FF_u1.jar
Result screenshot (image not included)
Code:
package lucene;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.codecs.lucene46.Lucene46Codec;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
public class MyAnalyzingInfixSuggester extends AnalyzingInfixSuggester {
/** Logger **/
private final Logger logger = Logger.getLogger(MyAnalyzingInfixSuggester.class);
/** Field name used for the indexed text. */
public static final String TEXT_FIELD_NAME = "text";
/** Default minimum number of leading characters before
* PrefixQuery is used (4). */
public static final int DEFAULT_MIN_PREFIX_CHARS = 4;
private final File indexPath;
final int minPrefixChars;
final Version matchVersion;
private final Directory dir;
/** Index open mode (create or append) */
private final OpenMode mode;
/**
 * Overloaded constructor that initialises the relevant fields.
 * @param matchVersion Lucene version
 * @param indexPath index directory
 * @param analyzer analyzer
 * @param mode index open mode (create or append)
 * @throws IOException
 */
public MyAnalyzingInfixSuggester(Version matchVersion, File indexPath, Analyzer analyzer, OpenMode mode) throws IOException {
//call the superclass constructor
super(matchVersion, indexPath, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS);
this.mode = mode;
this.indexPath = indexPath;
this.minPrefixChars = DEFAULT_MIN_PREFIX_CHARS;
this.matchVersion = matchVersion;
dir = getDirectory(indexPath);
}
/*
 * Override getIndexWriterConfig so the index open mode
 * becomes configurable (create or append).
 * @see org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester#getIndexWriterConfig(org.apache.lucene.util.Version, org.apache.lucene.analysis.Analyzer)
 */
@Override
protected IndexWriterConfig getIndexWriterConfig(Version matchVersion, Analyzer indexAnalyzer) {
IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, indexAnalyzer);
iwc.setCodec(new Lucene46Codec());
if (indexAnalyzer instanceof AnalyzerWrapper) {
//the gram analyzer is only used for the temporary (.tmp) index, which is always rebuilt from scratch
iwc.setOpenMode(OpenMode.CREATE);
} else {
iwc.setOpenMode(mode);
}
return iwc;
}
/*
 * Override build() so the entries are no longer sorted while the
 * index is created (sorting happens at query time instead).
 * @see org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester#build(org.apache.lucene.search.suggest.InputIterator)
 */
@Override
public void build(InputIterator iter) throws IOException {
if (searcher != null) {
searcher.getIndexReader().close();
searcher = null;
}
Directory dirTmp = getDirectory(new File(indexPath.toString() + ".tmp"));
IndexWriter w = null;
IndexWriter w2 = null;
AtomicReader r = null;
boolean success = false;
try {
Analyzer gramAnalyzer = new AnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return indexAnalyzer;
}
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
if (fieldName.equals("textgrams") && minPrefixChars > 0) {
return new TokenStreamComponents(components.getTokenizer(), new EdgeNGramTokenFilter(matchVersion, components.getTokenStream(), 1, minPrefixChars));
} else {
return components;
}
}
};
w = new IndexWriter(dirTmp, getIndexWriterConfig(matchVersion, gramAnalyzer));
BytesRef text;
Document doc = new Document();
FieldType ft = getTextFieldType();
Field textField = new Field(TEXT_FIELD_NAME, "", ft);
doc.add(textField);
Field textGramField = new Field("textgrams", "", ft);
doc.add(textGramField);
Field textDVField = new BinaryDocValuesField(TEXT_FIELD_NAME, new BytesRef());
doc.add(textDVField);
Field wordDVField = new StringField("word", "", Field.Store.YES);
doc.add(wordDVField);
Field weightField = new NumericDocValuesField("weight", 0);
doc.add(weightField);
Field countField = new StringField("count", "0", Field.Store.YES);
doc.add(countField);
Field payloadField;
if (iter.hasPayloads()) {
payloadField = new BinaryDocValuesField("payloads", new BytesRef());
doc.add(payloadField);
} else {
payloadField = null;
}
long t0 = System.nanoTime();
while ((text = iter.next()) != null) {
String textString = text.utf8ToString();
textField.setStringValue(textString);
wordDVField.setStringValue(textString);
textGramField.setStringValue(textString);
textDVField.setBytesValue(text);
weightField.setLongValue(iter.weight());
if (iter.hasPayloads()) {
payloadField.setBytesValue(iter.payload());
}
w.addDocument(doc);
}
logger.debug("initial indexing time: " + ((System.nanoTime() - t0) / 1000000) + " msec");
r = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(w, false));
w.rollback();
w2 = new IndexWriter(dir, getIndexWriterConfig(matchVersion, indexAnalyzer));
w2.addIndexes(new IndexReader[] { r });
r.close();
searcher = new IndexSearcher(DirectoryReader.open(w2, false));
w2.close();
payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads");
weightsDV = MultiDocValues.getNumericValues(searcher.getIndexReader(), "weight");
textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME);
assert textDV != null;
success = true;
} finally {
if (success) {
IOUtils.close(w, w2, r, dirTmp);
} else {
IOUtils.closeWhileHandlingException(w, w2, r, dirTmp);
}
}
}
/*
 * Override lookup() to change how results are ordered: they are
 * sorted by the "weight" field (descending) instead of by relevance.
 * @see org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester#lookup(java.lang.CharSequence, int, boolean, boolean)
 */
@Override
public List<LookupResult> lookup(CharSequence key, int num, boolean allTermsRequired, boolean doHighlight) {
if (searcher == null) {
throw new IllegalStateException("suggester was not built");
}
final BooleanClause.Occur occur;
if (allTermsRequired) {
occur = BooleanClause.Occur.MUST;
} else {
occur = BooleanClause.Occur.SHOULD;
}
TokenStream ts = null;
try {
ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
ts.reset();
final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
String lastToken = null;
BooleanQuery query = new BooleanQuery();
int maxEndOffset = -1;
final Set<String> matchedTokens = new HashSet<String>();
while (ts.incrementToken()) {
if (lastToken != null) {
matchedTokens.add(lastToken);
query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
}
lastToken = termAtt.toString();
if (lastToken != null) {
maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
}
}
ts.end();
String prefixToken = null;
if (lastToken != null) {
Query lastQuery;
if (maxEndOffset == offsetAtt.endOffset()) {
// Use PrefixQuery (or the ngram equivalent) when
// there was no trailing discarded chars in the
// string (e.g. whitespace), so that if query does
// not end with a space we show prefix matches for
// that token:
lastQuery = getLastTokenQuery(lastToken);
prefixToken = lastToken;
} else {
// Use TermQuery for an exact match if there were
// trailing discarded chars (e.g. whitespace), so
// that if query ends with a space we only show
// exact matches for that term:
matchedTokens.add(lastToken);
lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
}
if (lastQuery != null) {
query.add(lastQuery, occur);
}
}
ts.close();
Query finalQuery = finishQuery(query, allTermsRequired);
//sort by weight, descending
Sort sort = new Sort(new SortField("weight", SortField.Type.LONG, true));
TopDocs hits = searcher.search(finalQuery, num, sort);
List<LookupResult> results = createResults(hits, num, key, doHighlight, matchedTokens, prefixToken);
return results;
} catch (IOException ioe) {
throw new RuntimeException(ioe);
} finally {
IOUtils.closeWhileHandlingException(ts);
}
}
/**
 * Overload without a search key: returns the top-N entries ordered by weight.
 * Every document is indexed with a constant count field of "0", so a
 * TermQuery on that field matches all suggestions.
 */
public List<LookupResult> lookup(int num, boolean allTermsRequired, boolean doHighlight) {
if (searcher == null) {
throw new IllegalStateException("suggester was not built");
}
final Set<String> matchedTokens = new HashSet<String>();
String prefixToken = null;
final BooleanClause.Occur occur = BooleanClause.Occur.SHOULD;
TokenStream ts = null;
try {
BooleanQuery query = new BooleanQuery();
Query termQuery = new TermQuery(new Term("count", "0"));
query.add( termQuery, occur);
Query finalQuery = finishQuery(query, allTermsRequired);
//sort by weight, descending
Sort sort = new Sort(new SortField("weight", SortField.Type.LONG, true));
TopDocs hits = searcher.search(finalQuery, num, sort);
List<LookupResult> results = createResults(hits, num, null, doHighlight, matchedTokens, prefixToken);
return results;
} catch (IOException ioe) {
throw new RuntimeException(ioe);
} finally {
IOUtils.closeWhileHandlingException(ts);
}
}
}
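For reference, driving the subclass directly looks roughly like this. This is a minimal sketch, not part of the original code; the index path and seed data are illustrative assumptions:
Analyzer analyzer = new IKAnalyzer(false);
File indexDir = new File("/tmp/suggest-index"); // assumed path
MyAnalyzingInfixSuggester suggester = new MyAnalyzingInfixSuggester(Version.LUCENE_47, indexDir, analyzer, OpenMode.CREATE_OR_APPEND);
List<VO> data = new ArrayList<VO>();
data.add(new VO("lucene suggest", 5));
data.add(new VO("lucene search", 3));
suggester.build(new ProductIterator(data.iterator()));
// Results come back ordered by the "weight" doc-values field, descending
for (LookupResult r : suggester.lookup("lucene", 10, false, false)) {
    System.out.println(r.key);
}
suggester.close();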
package lucene;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.UnsupportedEncodingException;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.util.BytesRef;
public class ProductIterator implements InputIterator {
//iterator over the source collection
private Iterator<VO> productIterator;
//the VO currently being consumed
private VO currentProduct;
public ProductIterator(Iterator<VO> productIterator) {
this.productIterator = productIterator;
}
public boolean hasContexts() {
return true;
}
/**
 * Whether payload data is provided for each entry.
 */
public boolean hasPayloads() {
return true;
}
// Returning null means the input entries are not pre-sorted; the
// overridden build() above does not require sorted input.
public Comparator<BytesRef> getComparator() {
return null;
}
public BytesRef next() {
if (productIterator.hasNext()) {
currentProduct = productIterator.next();
try {
//use the term of the current VO as the suggestion key
return new BytesRef(currentProduct.getTerm().getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
throw new RuntimeException("Couldn't convert to UTF-8",e);
}
} else {
return null;
}
}
/**
 * Serialize the whole VO into the payload.
 * [This is only a demo; storing the entire object in the payload is not
 * recommended in practice, since it inflates the index and wastes disk space.]
 * The payload can hold any data that must be retrieved later at query time.
 */
public BytesRef payload() {
try {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
ObjectOutputStream out = new ObjectOutputStream(bos);
out.writeObject(currentProduct);
out.close();
return new BytesRef(bos.toByteArray());
} catch (IOException e) {
throw new RuntimeException("Failed to serialize the payload", e);
}
}
/**
 * Contexts can hold arbitrary user data and are normally used for filtering
 * (e.g. a product's sales regions). For every element of the Set, the
 * underlying API creates a TermQuery; you only supply the Set, but you
 * should know what happens underneath.
 */
public Set<BytesRef> contexts() {
try {
Set<BytesRef> regions = new HashSet<BytesRef>();
//demo: the term itself is used as the only context entry
regions.add(new BytesRef(currentProduct.getTerm().getBytes("UTF-8")));
return regions;
} catch (UnsupportedEncodingException e) {
throw new RuntimeException("Couldn't convert to UTF-8", e);
}
}
/**
 * Returns the weight, which drives the ordering of the results.
 * Here the sales volume (times) is used as the weight: it becomes the
 * weight of each hot keyword in the returned list. How you derive this
 * value is up to you.
 */
public long weight() {
return currentProduct.getTimes();
}
}
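As the payload() comment warns, serializing the whole VO inflates the index. A leaner variant, shown here as a sketch and not part of the original code, stores only what is needed at query time, for example the frequency, and parses it back when reading results:
// Sketch: keep only the frequency in the payload instead of a serialized VO
public BytesRef payload() {
    try {
        return new BytesRef(String.valueOf(currentProduct.getTimes()).getBytes("UTF-8"));
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeException("Couldn't convert to UTF-8", e);
    }
}
// Reading side: int times = Integer.parseInt(result.payload.utf8ToString());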
package lucene;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
public class LuceneSuggest {
private static final Version VERSION = Version.LUCENE_47;
public void indexmake(List<VO> lucenelist, File indexDir, Analyzer analyzer, Version version, OpenMode mode) throws IOException {
//open the index with the requested mode (create or append)
MyAnalyzingInfixSuggester suggester = new MyAnalyzingInfixSuggester(version, indexDir, analyzer, mode);
try {
suggester.build(new ProductIterator(lucenelist.iterator()));
} finally {
//always release the index files
suggester.close();
}
}
public List<VO> lookup(String name, String region, int count, String orgno, String indexDir, Analyzer analyzer, Version version, OpenMode mode) throws IOException {
MyAnalyzingInfixSuggester suggester = new MyAnalyzingInfixSuggester(version, new File(indexDir + orgno), analyzer, mode);
List<VO> lulist = new ArrayList<VO>();
/*
 * Query arguments:
 * name - the search keyword
 * count - maximum number of results to return
 * allTermsRequired - whether terms combine as MUST (true) or SHOULD (false)
 * doHighlight - whether to highlight the keyword in the results
 * (contexts could additionally be used as a filter, e.g. by region,
 * before the keyword is matched; that path is not used here)
 */
List<LookupResult> results = suggester.lookup(name, count, false, false);
System.out.println("-- \"" + name + "\" (" + region + "):");
for (LookupResult result : results) {
System.out.println(result.key);
//deserialize the VO back out of the payload
BytesRef bytesRef = result.payload;
ObjectInputStream is = new ObjectInputStream(new ByteArrayInputStream(bytesRef.bytes));
try {
VO vo = (VO) is.readObject();
lulist.add(vo);
} catch (Exception e) {
e.printStackTrace();
} finally {
is.close();
}
}
suggester.close();
System.out.println("done");
//unless we already queried the global index ("00"), query it as well and merge the results
if (!"00".equals(orgno)) {
MyAnalyzingInfixSuggester suggester1 = new MyAnalyzingInfixSuggester(version, new File(indexDir + "00"), analyzer, mode);
List<LookupResult> results1 = suggester1.lookup(name, count, false, false);
System.out.println("-- \"" + name + "\" (" + region + "):");
for (LookupResult result : results1) {
System.out.println(result.key);
//deserialize the VO back out of the payload
BytesRef bytesRef = result.payload;
ObjectInputStream is = new ObjectInputStream(new ByteArrayInputStream(bytesRef.bytes));
try {
VO vo = (VO) is.readObject();
lulist.add(vo);
} catch (Exception e) {
e.printStackTrace();
} finally {
is.close();
}
}
suggester1.close();
}
//sort by times (term frequency), descending
Collections.sort(lulist, new Comparator<VO>() {
@Override
public int compare(VO o1, VO o2) {
return o2.getTimes() - o1.getTimes();
}
});
if (lulist.size() > count) {
//subList only returns a view, so its result must be kept
lulist = new ArrayList<VO>(lulist.subList(0, count));
}
return lulist;
}
/**
 * Returns all entries: the design indexes a constant count field of "0",
 * so a TermQuery on that field matches every suggestion. Used as the
 * default query.
 * @param count maximum number of results
 * @param orgno organisation number (appended to the index path)
 * @param indexDir base index directory
 * @param analyzer analyzer
 * @param version Lucene version
 * @param mode index open mode
 * @return all entries, sorted by weight
 * @throws IOException
 */
public List<VO> lookup1(int count, String orgno, String indexDir, Analyzer analyzer, Version version, OpenMode mode) throws IOException {
MyAnalyzingInfixSuggester suggester = new MyAnalyzingInfixSuggester(version, new File(indexDir + orgno), analyzer, mode);
List<VO> lulist = new ArrayList<VO>();
List<LookupResult> results = suggester.lookup(count, false, false);
for (LookupResult result : results) {
System.out.println(result.key);
//deserialize the VO back out of the payload
BytesRef bytesRef = result.payload;
ObjectInputStream is = new ObjectInputStream(new ByteArrayInputStream(bytesRef.bytes));
try {
VO vo = (VO) is.readObject();
lulist.add(vo);
} catch (Exception e) {
e.printStackTrace();
} finally {
is.close();
}
}
suggester.close();
System.out.println("done");
//sort by times (term frequency), descending
Collections.sort(lulist, new Comparator<VO>() {
@Override
public int compare(VO o1, VO o2) {
return o2.getTimes() - o1.getTimes();
}
});
if (lulist.size() > count) {
//subList only returns a view, so its result must be kept
lulist = new ArrayList<VO>(lulist.subList(0, count));
}
return lulist;
}
/**
 * Update a term: delete the old document for the term, then re-index the new VO.
 * @throws IOException
 */
public void edit(String word, File indexDir, Analyzer analyzer, Version version, VO vo) throws IOException {
Directory fsDir = FSDirectory.open(indexDir);
IndexWriter indexWriter = new IndexWriter(fsDir, new IndexWriterConfig(version, analyzer));
//delete the document for this term (the stored "word" field holds the raw term)
TermQuery termQuery = new TermQuery(new Term("word", word));
indexWriter.deleteDocuments(termQuery);
//physically purge the deleted documents
indexWriter.forceMergeDeletes();
//commit and close the IndexWriter
indexWriter.commit();
indexWriter.close();
List<VO> list = new ArrayList<VO>();
list.add(vo);
//index a fresh document for the updated term, appending to the existing index
this.indexmake(list, indexDir, analyzer, version, OpenMode.APPEND);
}
public void deleteSuggert(String text, File indexDir) throws IOException {
Analyzer analyzer = new IKAnalyzer(false);
Directory fsDir = FSDirectory.open(indexDir);
IndexWriter indexWriter = new IndexWriter(fsDir, new IndexWriterConfig(VERSION, analyzer));
//delete the document for this term
TermQuery termQuery = new TermQuery(new Term("word", text));
indexWriter.deleteDocuments(termQuery);
//physically purge the deleted documents
indexWriter.forceMergeDeletes();
//commit and close the IndexWriter
indexWriter.commit();
indexWriter.close();
}
}
package lucene;
import java.io.Serializable;
public class VO implements Serializable{
private static final long serialVersionUID = 1L;
String term;
int times;
/**
 * @param term the term text
 * @param times the term frequency (used as the weight)
 */
public VO(String term, int times) {
this.term = term;
this.times = times;
}
public VO() {
super();
}
/**
* @return the term
*/
public String getTerm() {
return term;
}
/**
* @param term the term to set
*/
public void setTerm(String term) {
this.term = term;
}
/**
* @return the times
*/
public int getTimes() {
return times;
}
/**
* @param times the times to set
*/
public void setTimes(int times) {
this.times = times;
}
/* (non-Javadoc)
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
return term + " " + times;
}
/* (non-Javadoc)
* @see java.lang.Object#hashCode()
*/
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((term == null) ? 0 : term.hashCode());
return result;
}
/*
 * Equality is based on the term only.
 * @see java.lang.Object#equals(java.lang.Object)
 */
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
VO other = (VO) obj;
if (term == null) {
if (other.term != null)
return false;
} else if (!term.equals(other.term))
return false;
return true;
}
}
1. MyAnalyzingInfixSuggester overrides AnalyzingInfixSuggester because the original implementation never appends to an existing index (it always opens with OpenMode.CREATE), so the subclass makes the open mode configurable.
2. Each VO in the input list becomes one Lucene Document when the index is built.
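Building the per-organisation index might then look like this sketch (lucenelist, orgno and IndexWPath are assumed to be supplied by the caller):
LuceneSuggest luceneSuggest = new LuceneSuggest();
// One index per organisation: the directory is IndexWPath + orgno, matching lookup()
luceneSuggest.indexmake(lucenelist, new File(IndexWPath + orgno), new IKAnalyzer(false), Version.LUCENE_47, OpenMode.CREATE_OR_APPEND);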
Main
word is the search key. The second argument (region) may be null; it is intended for filtering. 10 is the maximum number of results. orgno and IndexWPath are concatenated to form the index path. For the analyzer, IKAnalyzer(false) selects fine-grained segmentation and IKAnalyzer(true) selects smart segmentation.
List<VO> lookup = luceneSuggest.lookup(word, null, 10, orgno, IndexWPath, new IKAnalyzer(false), Version.LUCENE_47, OpenMode.CREATE_OR_APPEND);
Updating a term's frequency (the times value drives the sort order):
VO vo = new VO();
vo.setTerm(word);
vo.setTimes(Integer.valueOf(terms) + 1);
LuceneSuggest luceneSuggest = new LuceneSuggest();
luceneSuggest.edit(word, new File(IndexWPath), new IKAnalyzer(false), Version.LUCENE_47, vo);
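Put together, a minimal end-to-end run could look like the following sketch (the index path, keyword and seed data are illustrative assumptions, not values from the original post):
public static void main(String[] args) throws IOException {
    String IndexWPath = "/tmp/suggest/"; // assumed base path
    String orgno = "00";
    String word = "lucene";
    LuceneSuggest luceneSuggest = new LuceneSuggest();
    // 1. Build the index from some seed data
    List<VO> seed = new ArrayList<VO>();
    seed.add(new VO("lucene suggest", 5));
    seed.add(new VO("lucene search", 3));
    luceneSuggest.indexmake(seed, new File(IndexWPath + orgno), new IKAnalyzer(false), Version.LUCENE_47, OpenMode.CREATE_OR_APPEND);
    // 2. Query: top 10 by weight, fine-grained segmentation
    List<VO> hits = luceneSuggest.lookup(word, null, 10, orgno, IndexWPath, new IKAnalyzer(false), Version.LUCENE_47, OpenMode.CREATE_OR_APPEND);
    for (VO vo : hits) {
        System.out.println(vo);
    }
}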