lucene 学习实战系列(基础入门)
先前有研究过一段时间lucene,也有一个小的应用。最近的项目需要强大的检索功能,再深入研究下。嗯必须的。
第一篇就先弄个简单的demo吧。
lucene3.0.2
在《Lucene in action》中,Lucene 的构架和过程如下图
可以看出,Lucene 包含索引和搜索两个过程,涉及索引创建、索引、搜索三个要点。
package com.ht.lucene;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
//searching
import org.apache.lucene.queryParser.QueryParser; //单Field查询
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.util.Version;
/**
 * Minimal Lucene 3.0.2 demo: {@link #indexer} builds an in-memory index of
 * news documents, and {@link #main} runs a {@code QueryParser} search over it,
 * printing each hit with HTML-highlighted title and content fragments.
 */
public class SampleSearch{
public static void main(String arg[]) throws CorruptIndexException, LockObtainFailedException, IOException, ParseException, InvalidTokenOffsetsException{
    Directory directory = new RAMDirectory(); // index kept in memory
    //Directory directory2 = new SimpleFSDirectory("path"); // alternative: index on disk
    // BUG FIX: this declaration was originally fused onto the end of the line
    // comment above, so `analyzer` was never declared and nothing compiled.
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    indexer(directory, analyzer);
    IndexSearcher isearcher = new IndexSearcher(directory, true);
    // A Term is the basic unit of a query. Alternative query types (kept as
    // tutorial examples, not executed):
    // 1. TermQuery — single-term lookup:
    //    TermQuery query = new TermQuery(new Term("newsTitle","台"));
    // 2. BooleanQuery — combine clauses across terms/fields (see also RangeQuery):
    //    Query a = new TermQuery(new Term("newsTitle", "福"));
    //    Query b = new TermQuery(new Term("newsTitle", "东"));
    //    BooleanQuery query = new BooleanQuery();
    //    query.add(a, BooleanClause.Occur.MUST);
    //    query.add(b, BooleanClause.Occur.MUST);
    //    query.add(a, BooleanClause.Occur.SHOULD);
    // 4. MultiFieldQueryParser — parse one query string against several fields:
    //    String[] fields = {"newsTitle","newsContent"};
    //    MultiFieldQueryParser mparser = new MultiFieldQueryParser(Version.LUCENE_30, fields, analyzer);
    //    Query query = mparser.parse("福州");
    // 5. RangeQuery    — match documents within a value range
    // 6. PrefixQuery   — prefix match, like SQL's  like 'XX%'
    // 7. FuzzyQuery    — fuzzy (edit-distance) match
    // 8. WildcardQuery — wildcard match
    try {
        // 3. QueryParser — tokenizes the query string with the analyzer.
        QueryParser parser = new QueryParser(Version.LUCENE_30, "newsTitle", analyzer);
        //parser.setDefaultOperator(QueryParser.AND_OPERATOR); // terms default to OR
        parser.setPhraseSlop(10);
        Query query = parser.parse("福州台");
        System.out.println("--- query :"+query.toString());
        /**
         * Main IndexSearcher search overloads:
         *   isearcher.search(Query query, Collector results);
         *   isearcher.search(Query query, int n);
         *   isearcher.search(Query query, Filter filter, Collector results);
         */
        ScoreDoc[] docs = isearcher.search(query,10).scoreDocs;
        System.out.println("找到的目标新闻数目:" +docs.length);
        // Wrap matched terms in <B>…</B> and limit fragments to 100 chars.
        SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<B>","</B>");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter,new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(100));
        for (int i = 0; i < docs.length; i++){
            System.out.println("目标文档的id:" +docs[i].doc);
            System.out.println("目标文档评分:" + docs[i].score);
            Document resultDoc = isearcher.doc(docs[i].doc);
            System.out.println("newsId:" + resultDoc.get("newsId"));
            System.out.println("newsTitle:" + resultDoc.get("newsTitle"));
            System.out.println("newsContent:" + resultDoc.get("newsContent"));
            // Highlighting needs a fresh token stream over the stored text.
            TokenStream tokenStream1 = analyzer.tokenStream("newsTitle",new StringReader(resultDoc.get("newsTitle")));
            TokenStream tokenStream2 = analyzer.tokenStream("newsContent",new StringReader(resultDoc.get("newsContent")));
            String highlighterstr1 = highlighter.getBestFragment(tokenStream1, resultDoc.get("newsTitle"));
            String highlighterstr2 = highlighter.getBestFragment(tokenStream2, resultDoc.get("newsContent"));
            System.out.println("带高亮的新闻内容显示效果:"+highlighterstr1);
            System.out.println("带高亮的新闻内容显示效果:"+highlighterstr2);
            System.out.println("------------------------------------------");
        }
    } finally {
        isearcher.close(); // release the searcher even if search/highlight throws
    }
}

/**
 * Builds the demo index: one fire-news document plus five identical
 * fishery-patrol documents, then optimizes and closes the writer.
 *
 * @param directory target index directory (in-memory or on-disk)
 * @param analyzer  analyzer used to tokenize title/content fields
 */
public static void indexer(Directory directory,Analyzer analyzer) throws CorruptIndexException, LockObtainFailedException, IOException{
    IndexWriter iwriter = new IndexWriter(directory , analyzer, true,new IndexWriter.MaxFieldLength(25000));
    // Optional indexing tuning knobs:
    // iwriter.setMergeFactor(10);    // merge factor
    // iwriter.setMaxMergeDocs(2000); // max documents per segment
    // iwriter.setMaxBufferedDocs(1); // in-memory buffered documents
    try {
        String fishingNews = "昨日早晨6时,中国渔政310船抵达钓鱼岛海域,在与东海渔政201船会合后," +
            "护渔编队开始围绕钓鱼岛及周边岛屿进行巡航护渔。编队表示,接下来的工作重点将包括维护国家海洋权益," +
            "保护我国渔民合法生产权益和履行我国海洋生物资源养护等。";
        addNews(iwriter, "1", "福州台江数十户遭遇火灾",
            "东快网讯(记者 张树福 林风)东快网" +
            "在火灾现场获悉,今晚9点50左右福州台江发生大火的地点位于台江医院后面的中选社区,由于该社区主要为木屋结构," +
            "大火已经燃烧了一个小时,火势非常猛烈,但是在消防官兵的努力下,已经渐渐被控制的趋势。");
        // News items 2-6 deliberately share the same title/content (demo data).
        for (int id = 2; id <= 6; id++) {
            addNews(iwriter, String.valueOf(id), "我国渔政船编队抵达钓鱼岛海域遭遇日方喊话", fishingNews);
        }
        iwriter.optimize(); // merge segments for faster searching
    } finally {
        iwriter.close(); // always release the index lock
    }
}

/**
 * Adds one news document: id stored un-tokenized, title and content
 * stored and analyzed for full-text search.
 */
private static void addNews(IndexWriter writer, String id, String title, String content)
        throws CorruptIndexException, IOException {
    Document doc = new Document();
    doc.add(new Field("newsId", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("newsTitle", title, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("newsContent", content, Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
}
}
就2个方法,下面那个是建立索引,上面main方法测试用。
例中,索引直接建立在内存。
引入 jar 包后可直接运行。