OpenNLP进行中文命名实体识别(下:载入模型识别实体)
http://blog.csdn.net/qdhy199148/article/details/51051321
上一节介绍了使用OpenNLP训练命名实体识别模型的方法,并将模型写到磁盘上形成二进制bin文件,这一节就是将模型从磁盘上载入,然后进行命名实体识别。依然是先上代码:
- import java.io.File;
- import java.util.HashMap;
- import java.util.IdentityHashMap;
- import java.util.Map;
- import java.util.Map.Entry;
- import opennlp.tools.cmdline.namefind.TokenNameFinderModelLoader;
- import opennlp.tools.namefind.NameFinderME;
- import opennlp.tools.tokenize.SimpleTokenizer;
- import opennlp.tools.tokenize.Tokenizer;
- import opennlp.tools.util.Span;
- public class NameEntityFindTester {
- // 默认参数
- private double probThreshold = 0.6;
- private String modelPath;
- private String testFileDirPath;
- public NameEntityFindTester() {
- super();
- // TODO Auto-generated constructor stub
- }
- public NameEntityFindTester(String modelPath, String testFileDirPath) {
- super();
- this.modelPath = modelPath;
- this.testFileDirPath = testFileDirPath;
- }
- public NameEntityFindTester(double probThreshold, String modelPath,
- String testFileDirPath) {
- super();
- this.probThreshold = probThreshold;
- this.modelPath = modelPath;
- this.testFileDirPath = testFileDirPath;
- }
- /**
- * 生成NameFinder
- *
- * @return
- */
- public NameFinderME prodNameFinder() {
- NameFinderME finder = new NameFinderME(
- new TokenNameFinderModelLoader().load(new File(modelPath)));
- return finder;
- }
- /**
- * 计算基本命名实体概率
- *
- * @param finder
- * 命名实体识别模型
- * @return
- * @throws Exception
- */
- public Map<String, String> cptBasicNameProb(NameFinderME finder)
- throws Exception {
- Map<String, String> basicNameProbResMap = new IdentityHashMap<String, String>();
- String testContent = NameEntityTextFactory.loadFileTextDir(this
- .getTestFileDirPath());
- // TODO 大文本情况下,消耗内存大,需要改写成分批处理模式(把一个大文件分成多个小文件再批量处理)
- Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
- // 待测词,测试结果,概率
- String[] tokens = tokenizer.tokenize(testContent);
- Span[] names = finder.find(tokens);
- double[] nameSpanProbs = finder.probs(names);
- System.out.println("tokens size: " + tokens.length);
- System.out.println("names size: " + names.length);
- System.out.println("name_span_probs size: " + nameSpanProbs.length);
- for (int i = 0; i < names.length; i++) {
- String testToken = "";
- for (int j = names[i].getStart(); j <= names[i].getEnd() - 1; j++) {
- testToken += tokens[j];
- }
- String testRes = names[i].getType() + ":"
- + Double.toString(nameSpanProbs[i]);
- // TODO delete print
- System.out.println("find name: \"" + testToken + "\" has res: "
- + testRes);
- basicNameProbResMap.put(testToken, testRes);
- }
- return basicNameProbResMap;
- }
- /**
- * 过滤除去概率值过低的识别结果
- *
- * @param basicNameProbResMap
- * @return
- */
- public Map<String, String> filterNameProbRes(
- Map<String, String> basicNameProbResMap) {
- Map<String, String> filttedNameProbResMap = new HashMap<String, String>();
- for (Entry<String, String> entry : basicNameProbResMap.entrySet()) {
- String token = entry.getKey();
- String res = basicNameProbResMap.get(token);
- if (Double.parseDouble(res.split(":")[1]) >= this
- .getProbThreshold()) {
- filttedNameProbResMap.put(token, res);
- }
- }
- return filttedNameProbResMap;
- }
- /**
- * 预测组件总调用方法
- *
- * @return
- */
- public Map<String, String> execNameFindTester() {
- try {
- NameFinderME finder = this.prodNameFinder();
- Map<String, String> basicNameProbResMap = this
- .cptBasicNameProb(finder);
- Map<String, String> nameProbResMap = this
- .filterNameProbRes(basicNameProbResMap);
- return nameProbResMap;
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- return null;
- }
- }
- }
- <Name, Type:Probability>
key是实体,后面是所属的实体类别以及概率值;3.filterNameProbRes()方法传入刚才得到的Map结果集,返回过滤后的新Map结果集。
随便用了一些很少的关于电子诈骗的语料进行测试,最终识别出来的结果是这个样子的,可以看到,识别出了1个person和10个action,基本上还都有那么点意思,还有person的训练词库本身词量就很少,导致这个结果。
同样,源代码开源在:https://github.com/Ailab403/ailab-mltk4j,test包里面对应有完整的调用demo,以及file文件夹里面的测试语料和已经训练好的模型。