/*
 * 高亮实现 (highlighting notes)
 * 模板:
 * 1、高亮的概述:从搜索结果中截取一部分摘要,并把符合条件的记录添加高亮显示;
 *    比如:在baidu中搜索“小码哥教育”时的效果;
 *    高亮需要使用 lucene-highlighter-4.10.4.jar
 * 2、高亮涉及的功能包括两部分:A、截取摘要,B、高亮显示
 *    Formatter formatter = new SimpleHTMLFormatter("<font color=\"red\">","</font>");
 *    Scorer scorer = new QueryScorer(query);
 *    Highlighter hl = new Highlighter(formatter,scorer);
 *    hl.setMaxDocCharsToAnalyze(20);
 *    String str = hl.getBestFragment(new StandardAnalyzer(), "content", doc.get("content"));
 */
package lucene;
import javafx.beans.binding.When;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
public class lucene {
    // NOTE(review): class name should be UpperCamelCase by convention, but it is
    // kept as-is so existing references (test runners, build config) keep working.

    String path = "D:/workforce/lucene/hello"; // on-disk index directory
    Version version = Version.LUCENE_4_10_4;   // Lucene compatibility version

    // Sample texts used to compare analyzers (Chinese vs English).
    String cn = "我对什么书表示兴趣,父亲就把那部书放在我书桌上,有时他得爬梯到书橱高处去拿; 假如我长期不读,那部书就不见了---这就等于谴责。";
    String en = "When I expressed interest in any book, my father put it on my desk. Sometimes he had to climb the ladder to get it from the top of the bookcase. If I didn't read it for a long time, it would disappear - that would be condemnation.";

    // Sample document bodies for indexing.
    String content1 = "走好选择的路,别选择好走的路,你才能拥有真正的自己。";
    String content2 = "惟有身处卑微的人,最有机缘看到世态人情的真相。一个人不想攀高就不怕下跌,也不用倾轧排挤,可以保其天真,成其自然,潜心一志完成自己能做的事。";
    String content3 = "我甘心当个“零”,人家不把我当个东西,我正好可以把看不起我的人看个透 ";

    /** Opens an IndexWriter over {@link #path} configured with the given analyzer. */
    private IndexWriter openWriter(Analyzer analyzer) throws IOException {
        Directory directory = FSDirectory.open(new File(path));
        IndexWriterConfig config = new IndexWriterConfig(version, analyzer);
        return new IndexWriter(directory, config);
    }

    /** Field type used for every field in these demos: stored, indexed and tokenized. */
    private FieldType storedIndexedType() {
        FieldType type = new FieldType();
        type.setStored(true);    // keep the raw value so search results can return it
        type.setIndexed(true);   // make the field searchable
        type.setTokenized(true); // run the analyzer over the value
        return type;
    }

    /** Builds a two-field (title/content) document and adds it to the index. */
    private void addDoc(IndexWriter writer, String title, String content, FieldType type)
            throws IOException {
        Document doc = new Document();
        doc.add(new Field("title", title, type));
        doc.add(new Field("content", content, type));
        writer.addDocument(doc);
    }

    /**
     * Prints every token the analyzer produces for {@code text} on {@code field}.
     * TokenStream workflow: reset() before the first incrementToken(), then
     * end() and close() — the original code leaked the stream without closing it.
     */
    private void printTokens(Analyzer analyzer, String field, String text) throws IOException {
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(stream);
            }
            stream.end();
        }
    }

    // Indexing: add three title/content documents to the index.
    @Test
    public void testLucene() throws Exception {
        // FIX: try-with-resources — the writer (and the index write lock) was
        // leaked if any statement threw before the final writer.close().
        try (IndexWriter writer = openWriter(new StandardAnalyzer())) {
            FieldType type = storedIndexedType();
            addDoc(writer, "doc1", content1, type);
            addDoc(writer, "doc2", content2, type);
            addDoc(writer, "doc3", content3, type);
        } // close() also commits the pending changes
    }

    // Update: replace the document whose title term is "doc2".
    @Test
    public void testUpdate() throws Exception {
        try (IndexWriter writer = openWriter(new StandardAnalyzer())) {
            FieldType type = storedIndexedType();
            Document updateDoc = new Document();
            updateDoc.add(new Field("title", "doc2", type));
            updateDoc.add(new Field("content", cn, type));
            // First argument selects the documents to delete, second is the replacement.
            writer.updateDocument(new Term("title", "doc2"), updateDoc);
        }
    }

    // Delete: remove every document whose content matches a query.
    @Test
    public void testDelete() throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        try (IndexWriter writer = openWriter(analyzer)) {
            // Alternative: delete by exact term —
            // writer.deleteDocuments(new Term("title", "doc3"));
            QueryParser parser = new QueryParser("content", analyzer);
            // StandardAnalyzer indexes Chinese one character at a time, so a
            // single-character query matches any content containing "人".
            Query query = parser.parse("人");
            writer.deleteDocuments(query);
        }
    }

    // Search, printing each hit and a highlighted fragment of its content.
    @Test
    public void testSearch() throws Exception {
        Directory directory = FSDirectory.open(new File(path));
        // FIX: close the reader when done (was never closed).
        try (IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Analyzer analyzer = new StandardAnalyzer();
            QueryParser parser = new QueryParser("content", analyzer); // default search field
            Query query = parser.parse("人");
            TopDocs tds = searcher.search(query, 10000); // at most the top 10000 hits
            System.out.println("总共命中次数: " + tds.totalHits);
            System.out.println("-----------------");
            // Highlighting: the formatter wraps matched terms, the scorer ranks fragments.
            Formatter formatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
            Scorer scorer = new QueryScorer(query);
            Highlighter hl = new Highlighter(formatter, scorer);
            // hl.setMaxDocCharsToAnalyze(20); // would cap how much of each doc is analyzed
            System.out.println("-----------------");
            for (ScoreDoc scoredoc : tds.scoreDocs) {
                System.out.println("文档分数:" + scoredoc.score);
                System.out.println("文档编号:" + scoredoc.doc);
                Document doc = searcher.doc(scoredoc.doc); // load the stored document by id
                System.out.println("title>>>>>>>>>>>>>" + doc.get("title"));
                System.out.println("content>>>>>>>>>>>" + doc.get("content"));
                System.out.println("-------------------------");
                // Best highlighted fragment of the content field (null when nothing matches).
                String str = hl.getBestFragment(new StandardAnalyzer(), "content", doc.get("content"));
                System.out.println(str);
            }
        }
    }

    // SimpleAnalyzer: splits on non-letters — works for English, leaves Chinese unsegmented.
    @Test
    public void testAnalyzer() throws IOException {
        printTokens(new SimpleAnalyzer(), "content", cn); // Chinese
        System.out.println("-------------------------------中英文分词对比--------------------------------------");
        printTokens(new SimpleAnalyzer(), "content", en); // English
    }

    // StandardAnalyzer: word boundaries for English, single characters for Chinese.
    @Test
    public void testStandAnalyzer() throws IOException {
        printTokens(new StandardAnalyzer(), "content", cn);
        System.out.println("-------------------------------中英文分词对比--------------------------------------");
        printTokens(new StandardAnalyzer(), "content", en);
    }

    // PerFieldAnalyzerWrapper: chooses the analyzer by field name; any field not in
    // the map falls back to the default analyzer (SimpleAnalyzer here).
    @Test
    public void testPerFieldAnalyzerWrapper() throws IOException {
        Map<String, Analyzer> fieldAnalyzer = new HashMap<>();
        fieldAnalyzer.put("en", new SimpleAnalyzer());
        fieldAnalyzer.put("cn", new StandardAnalyzer());
        PerFieldAnalyzerWrapper wrapper =
                new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), fieldAnalyzer);
        printTokens(wrapper, "cn", cn); // dispatched to StandardAnalyzer
        System.out.println("-------------------------------中英文分词对比--------------------------------------");
        printTokens(wrapper, "en", en); // dispatched to SimpleAnalyzer
    }

    // CJKAnalyzer: emits overlapping two-character (bigram) tokens for Chinese.
    @Test
    public void testCJKAnalyzer() throws IOException {
        printTokens(new CJKAnalyzer(), "content", cn);
        System.out.println("-------------------------------中英文分词对比--------------------------------------");
        printTokens(new CJKAnalyzer(), "content", en);
    }

    // SmartChineseAnalyzer: dictionary-based segmentation; misses newly-coined words.
    @Test
    public void testSmartCn() throws IOException {
        printTokens(new SmartChineseAnalyzer(), "content", cn);
        System.out.println("-------------------------------中英文分词对比--------------------------------------");
        printTokens(new SmartChineseAnalyzer(), "content", en);
    }

    // IKAnalyzer: third-party Chinese analyzer with its own extensible dictionary.
    @Test
    public void testIKAnalyzer() throws IOException {
        printTokens(new IKAnalyzer(), "content", cn);
        System.out.println("-------------------------------中英文分词对比--------------------------------------");
        printTokens(new IKAnalyzer(), "content", en);
    }
}