环境:Java 版本 "11.0.12"(2021-07-20 LTS),Solr 8.9.0
下面是Solr索引的字段声明:
<field name="Field1" type="string" multiValued="false" indexed="false" stored="true"/>
<field name="author" type="text_general" multiValued="false" indexed="true" stored="true"/>
<field name="Field2" type="string" multiValued="false" indexed="false" stored="true"/>
字段类型:
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" multiValued="true">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
Solr core 已使用以下命令创建:
./solr create -c fuzzyCore
用于索引数据的 .csv 文件位于:https://drive.google.com/file/d/1z684x2GKsSQWGAdyi6O4uKit4a96iiuh/view
我了解到:“Lucene 支持基于 Levenshtein 距离(即编辑距离)算法的模糊搜索。要进行模糊搜索,需要在单个词项的末尾加上波浪号‘~’。
~ 运算符用于执行模糊搜索。需要在每个词项后面添加 ~ 运算符,还可以指定编辑距离(该距离是可选的),如下所示。”
{FIELD_NAME:TERM_1~{Edit_Distance}}
由于“KeywordTokenizer”将整个输入保存为单个标记,并且我希望每个单词都是可搜索的,因此使用了“StandardTokenizer”。
请求如下所示:
curl "http://localhost:8983/solr/fuzzyCore/select" --data-urlencode "q=author:beaeb~' AND Field1:(w1 x)" --data-urlencode "rows=20"
{
"responseHeader":{
"status":0,
"QTime":14,
"params":{
"q":"author:beaeb~' AND Field1:(w1 x)",
"rows":"20"}},
"response":{"numFound":12,"start":0,"numFoundExact":true,"docs":[
{
"Field1":"x",
"author":"bbaeb",
"Field2":"o",
"id":"f8fbb58d-9e0d-47b2-aa3c-e3920e25a7d1",
"_version_":1746912583192936455},
{
"Field1":"x",
"author":"beabe",
"Field2":"p",
"id":"7d73e7ba-8455-4eb4-818f-1e19b1d35a22",
"_version_":1746912583244316680},
{
"Field1":"x",
"author":"baeeb",
"Field2":"n",
"id":"b4e86fc3-7ecc-407b-b638-88d167a66934",
"_version_":1746912583292551181},
{
"Field1":"x",
"author":"beaea",
"Field2":"o",
"id":"131ad4de-eaa2-47b8-b58b-e690316eed1c",
"_version_":1746912583314571267},
{
"Field1":"x",
"author":"bbaeb",
"Field2":"q",
"id":"d034e66c-a302-4b24-a186-5a2bafecab40",
"_version_":1746912583392165900},
{
"Field1":"x",
"author":"beacb",
"Field2":"n",
"id":"c0ab3e48-2b2d-438d-8cc2-1acfcf6efde8",
"_version_":1746912583490732036},
{
"Field1":"x",
"author":"aeabe",
"Field2":"m",
"id":"4472ec5d-eace-446f-b1d6-c8911be24368",
"_version_":1746912583266336776},
{
"Field1":"x",
"author":"baeab",
"Field2":"q",
"id":"b4c24da3-9199-4eba-a8a3-e30fc17d9167",
"_version_":1746912583274725377},
{
"Field1":"x",
"author":"aeaea",
"Field2":"n",
"id":"bb17bc26-e392-4fed-ae46-bbdd40af0ac0",
"_version_":1746912583294648329},
{
"Field1":"x",
"author":"aeceb",
"Field2":"p",
"id":"5e5cfe21-ff19-464f-8adf-8b5888c418e4",
"_version_":1746912583296745472},
{
"Field1":"x",
"author":"baeab",
"Field2":"p",
"id":"54a3c8e6-137d-47c3-9192-a5ed1904dc55",
"_version_":1746912583357562889},
{
"Field1":"x",
"author":"aeeeb",
"Field2":"m",
"id":"200694a0-6248-49fd-8182-dac79657e045",
"_version_":1746912583385874444}]
}}
尽管 Field1:w1 对应的数据中存在 author:"bebbeb",但上述请求并未返回 author:"bebbeb" 这条记录。这可以通过以下两个命令进行验证:
curl "http://localhost:8983/solr/fuzzyCore/select" --data-urlencode "q=author:beaeb~' AND Field1:w1"
{
"responseHeader":{
"status":0,
"QTime":4,
"params":{
"q":"author:beaeb~' AND Field1:w1"}},
"response":{"numFound":0,"start":0,"numFoundExact":true,"docs":[]
}}
而以下命令的输出为:
curl "http://localhost:8983/solr/fuzzyCore/select" --data-urlencode "q=Field1:w1"
{
"responseHeader":{
"status":0,
"QTime":1,
"params":{
"q":"Field1:w1"}},
"response":{"numFound":1,"start":0,"numFoundExact":true,"docs":[
{
"Field1":"w1",
"author":"bebbeb",
"Field2":"p",
"id":"4356dff2-ab93-4bab-a4dc-1797db38240c",
"_version_":1746912583504363523}]
}}
我已尽量贴出了解这个问题所需的全部信息。有什么想法吗?为什么查询 author:beaeb~ 的结果中没有返回 author:"bebbeb"?
1条答案
按热度按时间g52tjvyc1#
在调试 Lucene 之后,我们发现有一个名为 `maxExpansions` 的参数,默认值为 50,最大可以调到 1024。然而,查看 Solr 代码可以看到,`FuzzyQuery` 构造函数只在两处被调用,并且(出于性能原因)始终使用默认的 `maxExpansions` 值;**这意味着模糊搜索最多只会选取 50 个最相似的词项,而丢弃其余词项。** 这就是为什么当索引了大量文档且大多数词项都很相似时(就像您的情况一样),某些文档可能不会被返回。要公开此参数并让该特性的使用更加灵活(允许设置不同的值),需要有人向 Solr 开源项目贡献代码。