如何在 Elasticsearch 中高亮显示匹配的子字符串

在 companyName 和 country 字段中，我得到了预期的结果：只有与用户搜索关键字匹配的子字符串被高亮显示。然而，在 emailId 字段中，整个字段值都被高亮，而不是只高亮匹配的子字符串。

查询

POST customers-index/_search
{
  "size": 20,
  "query": {
    "bool": {
      "must": [
        {
          "multi_match": {
            "query": "state",
            "fields": [
              "emailId.autocomplete",
              "companyName.autocomplete",
              "country.autocomplete"
            ]
          }
        }
      ]
    }
  },
  "highlight": {
    "fields": {
      "emailId.autocomplete": {},
      "companyName.autocomplete": {},
      "country.autocomplete": {}
    }
  },
  "_source": [
    "emailId",
    "companyName",
    "country"
  ]
}

[
  {
    "_index" : "customers-index",
    "_type" : "_doc",
    "_id" : "c44aefde-22b1-471d-9159-a092e5c604f6",
    "_score" : 14.853605,
    "_source" : {
      "country" : "Ethiopia",
      "companyName" : "La Galtoara, Inc.",
      "emailId" : [
        "galtoara@state.gov"
      ]
    },
    "highlight" : {
      "emailId.autocomplete" : [
        "<em>galtoara@state.gov</em>"
      ]
    }
  },
  {
    "_index" : "customers-index",
    "_type" : "_doc",
    "_id" : "f76ecf0a-3e7d-41f9-a96f-83c66698f2d1",
    "_score" : 3.6045084,
    "_source" : {
      "country" : "Philippines",
      "companyName" : "Belgone State Medical, Inc.",
      "emailId" : [
        "[email protected]"
      ]
    },
    "highlight" : {
      "companyName.autocomplete" : [
        "Belgone <em>State</em> Medical, Inc."
      ]
    }
  },
  {
    "_index" : "customers-index",
    "_type" : "_doc",
    "_id" : "b41b1c0c-e84d-4424-a862-38b10d380d23",
    "_score" : 2.1431046,
    "_source" : {
      "country" : "United States",
      "companyName" : "DFDFDF Brands Limited",
      "emailId" : [
        "[email protected]"
      ]
    },
    "highlight" : {
      "country.autocomplete" : [
        "United <em>State</em>s"
      ]
    }
  }
]

emailId 字段的设置和映射如下所示

映射

{
  "emailId": {
    "type": "text",
    "fields": {
      "autocomplete": {
        "type": "text",
        "analyzer": "autocomplete_email_analyzer",
        "search_analyzer": "search_analyzer"
      },
      "keyword": {
        "type": "keyword",
        "ignore_above": 256,
        "normalizer": "lowercase_normalizer"
      }
    },
    "analyzer": "index_analyzer",
    "search_analyzer": "search_analyzer"
  }
}

设置

{
  "analysis": {
    "filter": {
      "email_filter": {
        "type": "pattern_capture",
        "preserve_original": "true",
        "patterns": [
          """(?=([@|\.|\!|\#|\$|%|&|'|\*|\+|\-|\/|\=|\?|\^|\_|\`|\{|\||\}|\~](.+)))"""
        ]
      },
      "starts_with_filter": {
        "type": "edge_ngram",
        "min_gram": "1",
        "max_gram": "100"
      }
    },
    "analyzer": {
      "search_analyzer": {
        "filter": [
          "lowercase"
        ],
        "tokenizer": "keyword"
      },
      "index_analyzer": {
        "filter": [
          "lowercase"
        ],
        "tokenizer": "index_analyzer"
      },
      "autocomplete_email_analyzer": {
        "filter": [
          "email_filter",
          "unique",
          "starts_with_filter",
          "lowercase"
        ],
        "tokenizer": "autocomplete_email_tokenizer"
      }
    },
    "normalizer": {
      "lowercase_normalizer": {
        "filter": [
          "lowercase"
        ],
        "type": "custom",
        "char_filter": []
      }
    },
    "tokenizer": {
      "autocomplete_email_tokenizer": {
        "type": "uax_url_email"
      },
      "index_analyzer": {
        "token_chars": [
          "letter",
          "digit",
          "whitespace",
          "punctuation",
          "symbol"
        ],
        "min_gram": "2",
        "type": "ngram",
        "max_gram": "30"
      }
    }
  }
}

注意：我只贴出了与 emailId 字段相关的映射和设置，因为它才是出问题的字段。
正如从响应中可以看到的，companyName 和 country 字段只高亮了与搜索关键字匹配的子字符串，而 emailId 字段却被整体高亮。
如何才能只高亮 emailId 字段中匹配的子字符串？非常感谢任何帮助。先谢了。

这正是 pattern_capture 过滤器的官方文档中所警告的问题。
让我们深入探究问题的根源。搜索过程并不会保留“某个查询为什么匹配某条记录”的信息，因此高亮器（highlighter）需要对这一过程进行逆向推断，这相当复杂，而且并不十分精确。
分析器让这件事变得更加困难，因为它们可能彻底改变原始文本。把结果词元（token）与原始文本位置关联起来的信息，由分词器（tokenizer）以起始偏移量和结束偏移量的形式随每个词元一起保存。分词器写入这些信息之后，某些过滤器可以对其进行更新，但出于各种原因，并非所有过滤器都会这样做。例如，word_delimiter 过滤器会更新偏移量信息：

POST test/_analyze
{
  "text": [
    "test@example.org"
  ],
  "tokenizer": "uax_url_email",
  "filter": [ "word_delimiter" ]
}

{
  "tokens": [
    {
      "token": "test",
      "start_offset": 0,
      "end_offset": 4,
      "type": "<EMAIL>",
      "position": 0
    },
    {
      "token": "example",
      "start_offset": 5,
      "end_offset": 12,
      "type": "<EMAIL>",
      "position": 1
    },
    {
      "token": "org",
      "start_offset": 13,
      "end_offset": 16,
      "type": "<EMAIL>",
      "position": 2
    }
  ]
}

但是 ngram 和 pattern_capture 过滤器只是把原始词元的位置和偏移量原样复制到由它派生出的所有词元上。你遇到的正是这种情况：你的分词器生成了一个覆盖整个电子邮件地址的词元，后续的所有过滤器只是把这一信息复制到它们生成的每个词元中：

POST customers-index/_analyze
{
  "text": ["galtoara@state.gov"],
  "field": "emailId.autocomplete"
}

正如您所看到的，所有这些词元都对应于同一个原始词元：位置为 0、起始偏移量为 0、结束偏移量为 18，也就是整个字符串。因此，高亮器只要匹配到其中任何一个词元，就会把整个字符串高亮显示。

{
  "tokens": [
    {
      "token": "g",
      "start_offset": 0,
      "end_offset": 18,
      "type": "<EMAIL>",
      "position": 0
    },
.... 
    {
      "token": "state",
      "start_offset": 0,
      "end_offset": 18,
      "type": "<EMAIL>",
      "position": 0
    },
....
,
    {
      "token": "gov",
      "start_offset": 0,
      "end_offset": 18,
      "type": "<EMAIL>",
      "position": 0
    }
  ]
}

换句话说，只要修正分析器产生的偏移量，高亮显示也就随之修正了。我不清楚你为什么要添加这么多过滤器，所以无法提供一个适用于所有用例的完整解决方案，但希望下面的例子能给你一些思路。你可以先从替换那些不更新偏移量的过滤器入手。例如：

POST _analyze
{
  "text": [
    "galtoara@state.gov"
  ],
  "tokenizer": "uax_url_email",
  "filter": [
    {
      "type": "word_delimiter"
    },
    {
      "type": "lowercase"
    },
    {
      "type": "edge_ngram",
      "min_gram": "1",
      "max_gram": "100"
    }
  ]
}

这组过滤器会先把电子邮件地址拆分成 3 个偏移量正确的词元，再由 edge_ngram 为每个词元生成前缀；每个前缀都沿用其来源词元的偏移量，从而保留了前缀与其来源字符串之间的对应关系：

{
  "tokens": [
    {
      "token": "g",
      "start_offset": 0,
      "end_offset": 8,
      "type": "<EMAIL>",
      "position": 0
    },
    {
      "token": "ga",
      "start_offset": 0,
      "end_offset": 8,
      "type": "<EMAIL>",
      "position": 0
    },
    {
      "token": "gal",
      "start_offset": 0,
      "end_offset": 8,
      "type": "<EMAIL>",
      "position": 0
    },
....
   {
      "token": "state",
      "start_offset": 9,
      "end_offset": 14,
      "type": "<EMAIL>",
      "position": 1
    },
.....
    {
      "token": "gov",
      "start_offset": 15,
      "end_offset": 18,
      "type": "<EMAIL>",
      "position": 2
    }
  ]
}

您也可以把 ngram 用作分词器（tokenizer）而不是过滤器，来达到类似的效果。下面是采用上述 word_delimiter 方案的完整示例：

DELETE test

PUT test
{
  "settings": {
    "max_ngram_diff": 50,
    "analysis": {
      "filter": {
        "email_filter": {
          "type": "pattern_capture",
          "preserve_original": "true",
          "patterns": [
            """(?=([@|\.|\!|\#|\$|%|&|'|\*|\+|\-|\/|\=|\?|\^|\_|\`|\{|\||\}|\~](.+)))"""
            ]
        },
        "starts_with_filter": {
          "type": "edge_ngram",
          "min_gram": "1",
          "max_gram": "100"
        }
      },
      "analyzer": {
        "highlighting_email_analyzer": {
          "tokenizer": "uax_url_email",
          "filter": [
            "word_delimiter",
            "lowercase",
            "starts_with_filter"
            ]
        },
        "search_analyzer": {
          "filter": [
            "lowercase"
            ],
            "tokenizer": "keyword"
        },
        "index_analyzer": {
          "filter": [
            "lowercase"
            ],
            "tokenizer": "index_analyzer"
        },
        "autocomplete_email_analyzer": {
          "filter": [
            "email_filter",
            "unique",
            "starts_with_filter",
            "lowercase"
            ],
            "tokenizer": "autocomplete_email_tokenizer"
        }
      },
      "normalizer": {
        "lowercase_normalizer": {
          "filter": [
            "lowercase"
            ],
            "type": "custom",
            "char_filter": []
        }
      },
      "tokenizer": {
        "autocomplete_email_tokenizer": {
          "type": "uax_url_email"
        },
        "index_analyzer": {
          "token_chars": [
            "letter",
            "digit",
            "whitespace",
            "punctuation",
            "symbol"
            ],
            "min_gram": "2",
            "type": "ngram",
            "max_gram": "30"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "emailId": {
        "type": "text",
        "fields": {
          "autocomplete": {
            "type": "text",
            "analyzer": "autocomplete_email_analyzer",
            "search_analyzer": "search_analyzer"
          },
          "keyword": {
            "type": "keyword",
            "ignore_above": 256,
            "normalizer": "lowercase_normalizer"
          },
          "highlighting": {
            "type": "text",
            "analyzer": "highlighting_email_analyzer",
            "search_analyzer": "search_analyzer"
          }
        },
        "analyzer": "index_analyzer",
        "search_analyzer": "search_analyzer"
      }
    }
  }
}

POST test/_bulk
{"index":{}}
{"country":"Ethiopia","companyName":"La Galtoara, Inc.","emailId":["galtoara@state.gov"]}
{"index":{}}
{"country":"Philippines","companyName":"Belgone State Medical, Inc.","emailId":["[email protected]"]}
{"index":{}}
{"country":"United States","companyName":"DFDFDF Brands Limited","emailId":["[email protected]"]}

POST test/_search
{
  "size": 20,
  "query": {
    "bool": {
      "must": [
        {
          "multi_match": {
            "query": "state",
            "fields": [
              "emailId.highlighting",
              "companyName.autocomplete",
              "country.autocomplete"
            ]
          }
        }
      ]
    }
  },
  "highlight": {
    "fields": {
      "emailId.highlighting": {},
      "companyName.autocomplete": {},
      "country.autocomplete": {}
    }
  },
  "_source": [
    "emailId",
    "companyName",
    "country"
  ]
}

这会得到如下结果：

"hits": [
      {
        "_index": "test",
        "_id": "ckGiHIsBImXKNSrQCBEW",
        "_score": 1.014292,
        "_source": {
          "country": "Ethiopia",
          "companyName": "La Galtoara, Inc.",
          "emailId": [
            "galtoara@state.gov"
          ]
        },
        "highlight": {
          "emailId.highlighting": [
            "galtoara@<em>state</em>.gov"
          ]
        }
      }
    ]

如何在 Elasticsearch 中高亮显示匹配的子字符串

1条答案

相关问题

热门标签

最新问答