elasticsearch 6.0 中文分词
软件版本:
elasticsearch-analysis-ik 6.0.0
https://github.com/medcl/elasticsearch-analysis-ik
elasticsearch 6.0.0
https://github.com/elastic/elasticsearch
elasticsearch-head 0.9
https://github.com/mobz/elasticsearch-head
按照 GitHub 项目主页
https://github.com/medcl/elasticsearch-analysis-ik
上面的说明进行操作(略有改动,因为原命令可能报错):
1. create an index
curl -XPUT http://localhost:9200/index
2. create a mapping
curl -H 'Content-Type:application/json' http://localhost:9200/index/fulltext/_mapping -d'
{
"properties": {
"content": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_max_word"
}
}
}'
3. index some docs
curl -H 'Content-Type:application/json' http://localhost:9200/index/fulltext/1 -d'
{"content":"美国留给伊拉克的是个烂摊子吗"}
'
curl -H 'Content-Type:application/json' http://localhost:9200/index/fulltext/2 -d'
{"content":"公安部:各地校车将享最高路权"}
'
curl -H 'Content-Type:application/json' http://localhost:9200/index/fulltext/3 -d'
{"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}
'
curl -H 'Content-Type:application/json' http://localhost:9200/index/fulltext/4 -d'
{"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}
'
4. query with highlighting
curl -H 'Content-Type:application/json' 'http://localhost:9200/index/fulltext/_search?pretty=true' -d'
{
"query" : { "match" : { "content" : "中国" }},
"highlight" : {
"pre_tags" : ["<tag1>", "<tag2>"],
"post_tags" : ["</tag1>", "</tag2>"],
"fields" : {
"content" : {}
}
}
}
'
5. 默认分词
默认分词器把每个汉字单独分开了:
[[email protected] ~]# curl -H 'Content-Type:application/json' 'http://localhost:9200/index/_analyze?pretty=true' -d '{"text":"中华人民共和国"}'
{
"tokens" : [
{
"token" : "中",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<IDEOGRAPHIC>",
"position" : 0
},
{
"token" : "华",
"start_offset" : 1,
"end_offset" : 2,
"type" : "<IDEOGRAPHIC>",
"position" : 1
},
{
"token" : "人",
"start_offset" : 2,
"end_offset" : 3,
"type" : "<IDEOGRAPHIC>",
"position" : 2
},
{
"token" : "民",
"start_offset" : 3,
"end_offset" : 4,
"type" : "<IDEOGRAPHIC>",
"position" : 3
},
{
"token" : "共",
"start_offset" : 4,
"end_offset" : 5,
"type" : "<IDEOGRAPHIC>",
"position" : 4
},
{
"token" : "和",
"start_offset" : 5,
"end_offset" : 6,
"type" : "<IDEOGRAPHIC>",
"position" : 5
},
{
"token" : "国",
"start_offset" : 6,
"end_offset" : 7,
"type" : "<IDEOGRAPHIC>",
"position" : 6
}
]
}
.
6. ik分词
1) max word
curl -H 'Content-Type:application/json' 'http://localhost:9200/index/_analyze?pretty=true' -d '
{
"text":"中华人民共和国",
"analyzer" : "ik_max_word"
}'
{
"tokens" : [
{
"token" : "中华人民共和国",
"start_offset" : 0,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 0
},
{
"token" : "中华人民",
"start_offset" : 0,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "中华",
"start_offset" : 0,
"end_offset" : 2,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "华人",
"start_offset" : 1,
"end_offset" : 3,
"type" : "CN_WORD",
"position" : 3
},
{
"token" : "人民共和国",
"start_offset" : 2,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 4
},
{
"token" : "人民",
"start_offset" : 2,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 5
},
{
"token" : "共和国",
"start_offset" : 4,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 6
},
{
"token" : "共和",
"start_offset" : 4,
"end_offset" : 6,
"type" : "CN_WORD",
"position" : 7
},
{
"token" : "国",
"start_offset" : 6,
"end_offset" : 7,
"type" : "CN_CHAR",
"position" : 8
}
]
}
2) ik_smart
curl -H 'Content-Type:application/json' 'http://localhost:9200/index/_analyze?pretty=true' -d '
{
"text":"中华人民共和国",
"analyzer" : "ik_smart"
}'
{
"tokens" : [
{
"token" : "中华人民共和国",
"start_offset" : 0,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 0
}
]
}
7. elasticsearch-head 操作
1) 默认分词
http://192.168.10.81:9200/
index/_analyze?pretty=true
index/_analyze?pretty=true
POST
{
"text": "中华人民共和国"
}
{
"text": "中华人民共和国"
}
2) ik分词
http://192.168.10.81:9200/
index/_analyze?pretty=true
index/_analyze?pretty=true
POST
{
"text": "中华人民共和国",
"analyzer": "ik_max_word"
}
{
"text": "中华人民共和国",
"analyzer": "ik_max_word"
}