我正在将我的elasticsearch从5.6升级到8.9,我有一个查询,其中我按权重字段和_score排序。分配给数据的分数存在差异,因此给出了不同的结果顺序。
谁能帮我找出问题所在并给出解决方案?
查询如下:
POST /auto-complete/_search?typed_keys=true
{
"size": 5,
"query": {
"bool": {
"should": [
{
"match_phrase_prefix": {
"suggestion": {
"query": "the"
}
}
},
{
"match": {
"suggestion.analyzed": {
"fuzziness": "AUTO",
"operator": "and",
"query": "the"
}
}
}
]
}
},
"sort": [
{
"weight": {
"order": "desc"
}
},
{
"_score": {
"order": "desc"
}
}
]
}
在Elasticsearch 5.6上,返回的结果是:
{
"took": 129,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 4858,
"max_score": null,
"hits": [
{
"_index": "auto-complete",
"_type": "default",
"_id": "The Thelma Hoop Earrings",
"_score": 6.3522644,
"_source": {
"suggestion": "The Thelma Hoop Earrings",
"weight": 1
},
"sort": [
1,
6.3522644
]
},
{
"_index": "auto-complete",
"_type": "default",
"_id": "The Theresa Ring",
"_score": 6.3522644,
"_source": {
"suggestion": "The Theresa Ring",
"weight": 1
},
"sort": [
1,
6.3522644
]
},
{
"_index": "auto-complete",
"_type": "default",
"_id": "The Theodora Ring",
"_score": 6.337865,
"_source": {
"suggestion": "The Theodora Ring",
"weight": 1
},
"sort": [
1,
6.337865
]
},
{
"_index": "auto-complete",
"_type": "default",
"_id": "The Thea Ring",
"_score": 6.337865,
"_source": {
"suggestion": "The Thea Ring",
"weight": 1
},
"sort": [
1,
6.337865
]
},
{
"_index": "auto-complete",
"_type": "default",
"_id": "The Theor Band For Him",
"_score": 5.7033815,
"_source": {
"suggestion": "The Theor Band For Him",
"weight": 1
},
"sort": [
1,
5.7033815
]
}
]
}
}
在Elasticsearch 8.9上,返回的结果是:
{
"took": 28,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4874,
"relation": "eq"
},
"max_score": null,
"hits": [
{
"_index": "auto-complete",
"_id": "The Theodora Ring",
"_score": 8.927014,
"_source": {
"suggestion": "The Theodora Ring",
"weight": 1
},
"sort": [
1,
8.927014
]
},
{
"_index": "auto-complete",
"_id": "The Theresa Ring",
"_score": 8.927014,
"_source": {
"suggestion": "The Theresa Ring",
"weight": 1
},
"sort": [
1,
8.927014
]
},
{
"_index": "auto-complete",
"_id": "The Thea Ring",
"_score": 8.927014,
"_source": {
"suggestion": "The Thea Ring",
"weight": 1
},
"sort": [
1,
8.927014
]
},
{
"_index": "auto-complete",
"_id": "The Thelma Hoop Earrings",
"_score": 7.9907713,
"_source": {
"suggestion": "The Thelma Hoop Earrings",
"weight": 1
},
"sort": [
1,
7.9907713
]
}
]
}
}
Elasticsearch 5.6的索引映射(mapping)和设置如下:
curl -X PUT "localhost:9201/auto-complete?pretty" -H 'Content-Type: application/json' -d'
{
"mappings" :
{
"default": {
"properties": {
"suggestion": {
"type": "text",
"fields": {
"analyzed": {
"type": "text",
"analyzer": "nGram_analyzer",
"search_analyzer": "whitespace"
}
}
},
"weight": {
"type": "integer"
}
}
}
},
"settings" :
{
"number_of_shards": 1,
"number_of_replicas": 1,
"index": {
"analysis": {
"analyzer": {
"nGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"nGram_filter"
]
}
},
"filter": {
"nGram_filter": {
"type": "nGram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit",
"punctuation",
"symbol"
]
}
}
}
}
}
}'
Elasticsearch 8.9的索引映射(mapping)和设置如下:
curl -X PUT "localhost:9201/auto-complete?pretty" -H 'Content-Type: application/json' -d'
{
"mappings" :
{
"properties": {
"suggestion": {
"type": "text",
"fields": {
"analyzed": {
"type": "text",
"analyzer": "nGram_analyzer",
"search_analyzer": "whitespace"
}
}
},
"weight": {
"type": "integer"
}
}
},
"settings" :
{
"number_of_shards": 1,
"number_of_replicas": 1,
"max_ngram_diff" : 18,
"index": {
"analysis": {
"analyzer": {
"nGram_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"nGram_filter"
]
}
},
"filter": {
"nGram_filter": {
"type": "ngram",
"min_gram": 2,
"max_gram": 20,
"token_chars": [
"letter",
"digit",
"punctuation",
"symbol"
]
}
}
}
}
}}
1条答案
按热度按时间68de4m5k1#
**原因如下:** Elasticsearch 5.x使用TF/IDF相似度模型,而Elasticsearch 8.x默认使用BM25模型。这两个模型计算相关度得分的方式不同,这可能会导致不同的结果。
以下引自Elastic{ON} 2016的一场演讲简介:
> 今天,Elasticsearch中的默认评分算法是TF/IDF。一旦Elasticsearch切换到Lucene 6,此默认值将更改为BM25。在此演讲中,Britta将告诉您有关BM25的所有内容——它是什么,它与TF/IDF和其他评分技术的区别,以及为什么它可能是更好的默认值。

链接:https://www.elastic.co/elasticon/conf/2016/sf/improved-text-scoring-with-bm25
您可以继续阅读以获取更多信息。
有什么区别?
BM25是TF/IDF模型的扩展,并进行了一些修改以提高其性能。它同样包含词频(TF)和逆文档频率(IDF)的概念,但还引入了两个额外的因素:
1. **词频饱和(Term Frequency Saturation):** 在TF/IDF中,词频分量会随着词项出现次数的增加而不断增长;而在BM25中,当词项出现“足够”多次之后,词频分量的增长会减慢。这被称为词频饱和。其思想是,超过某个点之后,词项的额外出现不会使文档更相关。
2. **字段长度归一化:** BM25还引入了一个因子来处理不同长度的字段(或文档)。在TF/IDF中,出现在短字段中的词项可以具有与长字段中相同的权重,这可能会扭曲相关性。BM25引入了一个参数来对此进行归一化,因此较短的字段不会获得过高的权重。
这些修改通常使BM25在对给定查询的文档相关性进行排名方面比TF/IDF表现得更好。
(原文此处为一张图片,抓取时未能保留。)
要使用您自己的数据进行验证,可以使用Explain API(或在搜索请求中加上 `"explain": true`),分别在ES 5.6和ES 8.x上查看同一文档得分的计算明细并进行对比。