仅计算Java MongoDB驱动程序中的文档,而不计算FETCH阶段

b5lpy0ml  于 2023-03-06  发布在  Java
关注(0)|答案(1)|浏览(94)

我正在尝试为给定的过滤器计算集合中的文档数,使用countDocuments最多使用1s per 100'000 documents,这有点慢,而且没有考虑到如果多个用户每隔几秒到几分钟触发一次这个计数,对数据库的影响。
在底层countDocuments()转换为aggregate查询:

db.collection.aggregate([
  {
    "$match": {
      "_id": {"$exists": true}
    }
  },
  {
    "$group": {
      "_id": 1,
      "n": {"$sum": 1}
    }
  }
])

在其上调用explain()将产生:

{
  explainVersion: '1',
  stages: [
    {
      '$cursor': {
        queryPlanner: {
          namespace: 'collection',
          indexFilterSet: false,
          parsedQuery: { _id: { '$exists': true } },
          queryHash: 'BA029CD5',
          planCacheKey: '4D66BB31',
          maxIndexedOrSolutionsReached: false,
          maxIndexedAndSolutionsReached: false,
          maxScansToExplodeReached: false,
          winningPlan: {
            stage: 'FETCH',
            filter: { _id: { '$exists': true } },
            inputStage: {
              stage: 'IXSCAN',
              keyPattern: { _id: 1 },
              indexName: '_id_',
              isMultiKey: false,
              multiKeyPaths: { _id: [] },
              isUnique: true,
              isSparse: false,
              isPartial: false,
              indexVersion: 2,
              direction: 'forward',
              indexBounds: { _id: [ '[MinKey, MaxKey]' ] }
            }
          },
          rejectedPlans: []
        },
        executionStats: {
          executionSuccess: true,
          nReturned: 614833,
          executionTimeMillis: 6589,
          totalKeysExamined: 614833,
          totalDocsExamined: 614833,
          executionStages: {
            stage: 'FETCH',
            filter: { _id: { '$exists': true } },
            nReturned: 614833,
            executionTimeMillisEstimate: 4219,
            works: 614834,
            advanced: 614833,
            needTime: 0,
            needYield: 0,
            saveState: 668,
            restoreState: 668,
            isEOF: 1,
            docsExamined: 614833,
            alreadyHasObj: 0,
            inputStage: {
              stage: 'IXSCAN',
              nReturned: 614833,
              executionTimeMillisEstimate: 568,
              works: 614834,
              advanced: 614833,
              needTime: 0,
              needYield: 0,
              saveState: 668,
              restoreState: 668,
              isEOF: 1,
              keyPattern: { _id: 1 },
              indexName: '_id_',
              isMultiKey: false,
              multiKeyPaths: { _id: [] },
              isUnique: true,
              isSparse: false,
              isPartial: false,
              indexVersion: 2,
              direction: 'forward',
              indexBounds: { _id: [ '[MinKey, MaxKey]' ] },
              keysExamined: 614833,
              seeks: 1,
              dupsTested: 0,
              dupsDropped: 0
            }
          },
          allPlansExecution: []
        }
      },
      nReturned: Long("614833"),
      executionTimeMillisEstimate: Long("6510")
    },
    {
      '$group': { _id: { '$const': 1 }, n: { '$sum': { '$const': 1 } } },
      maxAccumulatorMemoryUsageBytes: { n: Long("80") },
      totalOutputDataSizeBytes: Long("237"),
      usedDisk: false,
      spills: Long("0"),
      nReturned: Long("1"),
      executionTimeMillisEstimate: Long("6584")
    }
  ],
  command: {
    aggregate: 'collection',
    pipeline: [
      { '$match': { _id: { '$exists': true } } },
      { '$group': { _id: 1, n: { '$sum': 1 } } }
    ],
    cursor: {},
    '$db': 'db'
  },
  ok: 1,
  '$clusterTime': {
    clusterTime: Timestamp({ t: 1677593996, i: 1 }),
    signature: {
      hash: Binary(Buffer.from("0000000000000000000000000000000000000000", "hex"), 0),
      keyId: Long("0")
    }
  },
  operationTime: Timestamp({ t: 1677593996, i: 1 })
}

请注意,制胜策略由两个阶段组成:第一个FETCH且仅在此IXSCAN之后
使用find()的查询不包括_id字段,而是投影到索引字段(称为origin)上,该查询将只针对IXSCAN

db.collection.find({}, {"_id":0, "origin": 1}).count()

将计算正确的结果,也可以使用filter.explain()将产生:

{
    explainVersion: '1',
    queryPlanner: {
        namespace: 'db.collection',
        indexFilterSet: false,
        parsedQuery: { origin: { '$eq': 'WF' } },
        queryHash: '2428EDD1',
        planCacheKey: '85C94249',
        maxIndexedOrSolutionsReached: false,
        maxIndexedAndSolutionsReached: false,
        maxScansToExplodeReached: false,
        winningPlan: {
        stage: 'PROJECTION_COVERED',
        transformBy: { _id: 0, origin: 1 },
        inputStage: {
            stage: 'IXSCAN',
            keyPattern: { origin: 1, 'metadata.iv': -1 },
            indexName: 'originAscending',
            isMultiKey: false,
            multiKeyPaths: { origin: [], 'metadata.iv': [] },
            isUnique: false,
            isSparse: false,
            isPartial: false,
            indexVersion: 2,
            direction: 'forward',
            indexBounds: {
            origin: [ '["WF", "WF"]' ],
            'metadata.iv': [ '[MaxKey, MinKey]' ]
            }
        }
        },
        rejectedPlans: [
        {
            stage: 'PROJECTION_COVERED',
            transformBy: { _id: 0, origin: 1 },
            inputStage: {
            stage: 'IXSCAN',
            keyPattern: { origin: -1, 'metadata.iv': -1 },
            indexName: 'originDescending',
            isMultiKey: false,
            multiKeyPaths: { origin: [], 'metadata.iv': [] },
            isUnique: false,
            isSparse: false,
            isPartial: false,
            indexVersion: 2,
            direction: 'forward',
            indexBounds: {
                origin: [ '["WF", "WF"]' ],
                'metadata.iv': [ '[MaxKey, MinKey]' ]
            }
            }
        }
        ]
    },
    command: {
        find: 'collection',
        filter: { origin: 'WF' },
        projection: { _id: 0, origin: 1 },
        '$db': 'db'
    },
    ok: 1,
    '$clusterTime': {
        clusterTime: Timestamp({ t: 1677593816, i: 1 }),
        signature: {
        hash: Binary(Buffer.from("0000000000000000000000000000000000000000", "hex"), 0),
        keyId: Long("0")
        }
    },
    operationTime: Timestamp({ t: 1677593816, i: 1 })
}

这将只使用一个IXSCAN阶段,并且执行速度要快得多。
我尝试在MongoDB Java驱动程序版本4.8.0中构建此查询,方法是编写以下代码(仍在进行中):

private int aggregate(InvoiceQuery query) {
    var filter = new MongoInvoiceFilterCriteria(query.getFilter()).asBson();

    var projection = Projections.fields(Projections.excludeId(), Projections.include("origin"));

    var findPublisher =
        this.collection.find(filter).projection(projection).explain();

    var result = Flowable.fromPublisher(findPublisher).blockingFirst();

    LOGGER.info("Aggregate " + result.toJson());

    return 0;
  }

这将产生以下结果:

{
    "explainVersion": "1",
    "queryPlanner": {
        "namespace": "db.collection",
        "indexFilterSet": false,
        "parsedQuery": {
            "_id": {
                "$exists": true
            }
        },
        "queryHash": "88DBAD21",
        "planCacheKey": "D9A2E277",
        "maxIndexedOrSolutionsReached": false,
        "maxIndexedAndSolutionsReached": false,
        "maxScansToExplodeReached": false,
        "winningPlan": {
            "stage": "PROJECTION_SIMPLE",
            "transformBy": {
                "_id": 0,
                "origin": 1
            },
            "inputStage": {
                "stage": "FETCH",
                "filter": {
                    "_id": {
                        "$exists": true
                    }
                },
                "inputStage": {
                    "stage": "IXSCAN",
                    "keyPattern": {
                        "_id": 1
                    },
                    "indexName": "_id_",
                    "isMultiKey": false,
                    "multiKeyPaths": {
                        "_id": []
                    },
                    "isUnique": true,
                    "isSparse": false,
                    "isPartial": false,
                    "indexVersion": 2,
                    "direction": "forward",
                    "indexBounds": {
                        "_id": [
                            "[MinKey, MaxKey]"
                        ]
                    }
                }
            }
        },
        "rejectedPlans": []
    },
    "executionStats": {
        "executionSuccess": true,
        "nReturned": 9533,
        "executionTimeMillis": 18,
        "totalKeysExamined": 9533,
        "totalDocsExamined": 9533,
        "executionStages": {
            "stage": "PROJECTION_SIMPLE",
            "nReturned": 9533,
            "executionTimeMillisEstimate": 4,
            "works": 9534,
            "advanced": 9533,
            "needTime": 0,
            "needYield": 0,
            "saveState": 9,
            "restoreState": 9,
            "isEOF": 1,
            "transformBy": {
                "_id": 0,
                "origin": 1
            },
            "inputStage": {
                "stage": "FETCH",
                "filter": {
                    "_id": {
                        "$exists": true
                    }
                },
                "nReturned": 9533,
                "executionTimeMillisEstimate": 4,
                "works": 9534,
                "advanced": 9533,
                "needTime": 0,
                "needYield": 0,
                "saveState": 9,
                "restoreState": 9,
                "isEOF": 1,
                "docsExamined": 9533,
                "alreadyHasObj": 0,
                "inputStage": {
                    "stage": "IXSCAN",
                    "nReturned": 9533,
                    "executionTimeMillisEstimate": 0,
                    "works": 9534,
                    "advanced": 9533,
                    "needTime": 0,
                    "needYield": 0,
                    "saveState": 9,
                    "restoreState": 9,
                    "isEOF": 1,
                    "keyPattern": {
                        "_id": 1
                    },
                    "indexName": "_id_",
                    "isMultiKey": false,
                    "multiKeyPaths": {
                        "_id": []
                    },
                    "isUnique": true,
                    "isSparse": false,
                    "isPartial": false,
                    "indexVersion": 2,
                    "direction": "forward",
                    "indexBounds": {
                        "_id": [
                            "[MinKey, MaxKey]"
                        ]
                    },
                    "keysExamined": 9533,
                    "seeks": 1,
                    "dupsTested": 0,
                    "dupsDropped": 0
                }
            }
        }
    },
    "command": {
        "find": "collection",
        "filter": {
            "_id": {
                "$exists": true
            }
        },
        "projection": {
            "_id": 0,
            "origin": 1
        },
        "$db": "db"
    },
    "ok": 1.0,
    "$clusterTime": {
        "clusterTime": {
            "$timestamp": {
                "t": 1677769219,
                "i": 1
            }
        },
        "signature": {
            "hash": {
                "$binary": {
                    "base64": "AAAAAAAAAAAAAAAAAAAAAAAAAAA=",
                    "subType": "00"
                }
            },
            "keyId": 0
        }
    },
    "operationTime": {
        "$timestamp": {
            "t": 1677769219,
            "i": 1
        }
    }
}

因此,出于某种原因,mongo执行了一个FETCH和一个IXSCAN阶段,尽管据我所知,查询应该执行相同的阶段。
我考虑过使用documentCount()estimatedDocumentCount()、不同的aggregates()find()查询,但是我没有直接在mongosh上找到任何一个查询像db.collection.find({}, {"_id":0, "origin": 1}).count()一样高效。
我怎样才能提高java中文档计数的性能?
编辑@诺埃尔提供的答案:

// build the aggregation pipeline
List<Bson> pipeline = Arrays.asList(
    Aggregates.match(Filters.gte("_id", new MinKey())),
    Aggregates.group("$1", Accumulators.sum("n", 1)));

// explain the execution stats of the aggregation pipeline
var findPublisher = 
    this.collection.aggregate(pipeline).explain();

它也有一个FETCH步骤:

{
    "explainVersion": "2",
    "queryPlanner": {
        "namespace": "db.collection",
        "indexFilterSet": false,
        "parsedQuery": {
            "_id": {
                "$gte": {
                    "$minKey": 1
                }
            }
        },
        "queryHash": "D1046F5E",
        "planCacheKey": "7E518BFB",
        "optimizedPipeline": true,
        "maxIndexedOrSolutionsReached": false,
        "maxIndexedAndSolutionsReached": false,
        "maxScansToExplodeReached": false,
        "winningPlan": {
            "queryPlan": {
                "stage": "GROUP",
                "planNodeId": 3,
                "inputStage": {
                    "stage": "FETCH",
                    "planNodeId": 2,
                    "inputStage": {
                        "stage": "IXSCAN",
                        "planNodeId": 1,
                        "keyPattern": {
                            "_id": 1
                        },
                        "indexName": "_id_",
                        "isMultiKey": false,
                        "multiKeyPaths": {
                            "_id": []
                        },
                        "isUnique": true,
                        "isSparse": false,
                        "isPartial": false,
                        "indexVersion": 2,
                        "direction": "forward",
                        "indexBounds": {
                            "_id": [
                                "[MinKey, MaxKey]"
                            ]
                        }
                    }
                }
            },
            "slotBasedPlan": {
            }
        },
        "rejectedPlans": []
    },
    "executionStats": {
    },
    "command": {
        "aggregate": "collection",
        "pipeline": [
            {
                "$match": {
                    "_id": {
                        "$gte": {
                            "$minKey": 1
                        }
                    }
                }
            },
            {
                "$group": {
                    "_id": "$1",
                    "n": {
                        "$sum": 1
                    }
                }
            }
        ],
        "cursor": {
            "batchSize": 1
        },
        "$db": "db"
    }
}
chhkpiq4

chhkpiq41#

我认为你应该把COUNT_SCAN作为获胜的计划。这个查询使用索引on_id来计数。

db.collection.explain("executionStats").aggregate([
{
    $match: {
        "_id": {"$gte": MinKey}
    }
},
{
    $group: {
        "_id": 1, "n": {"$sum": 1}
    }
}
]);

java :

List<Bson> pipeline = Arrays.asList(
        Aggregates.match(Filters.gte("_id", new MinKey())),
        Aggregates.group("1", Accumulators.sum("n", 1))
);
    • 注意:这可能不会在分片集合上触发COUNT_SCAN。**

统计数据:

"winningPlan" : {
    "stage" : "COUNT_SCAN",
    "keyPattern" : {
        "_id" : 1
    },
    "indexName" : "_id_",
    "isMultiKey" : false,
    "multiKeyPaths" : {
        "_id" : []
    },
    "isUnique" : true,
    "isSparse" : false,
    "isPartial" : false,
    "indexVersion" : 2,
    "indexBounds" : {
        "startKey" : {
            "_id" : { "$minKey" : 1 }
        },
        "startKeyInclusive" : true,
        "endKey" : {
            "_id" : { "$maxKey" : 1 }
        },
        "endKeyInclusive" : true
    }
}
                
"executionStats" : {
    "executionSuccess" : true,
    "nReturned" : 9,
    "executionTimeMillis" : 0,
    "totalKeysExamined" : 10,
    "totalDocsExamined" : 0,
    "executionStages" : {
        "stage" : "COUNT_SCAN",
        "nReturned" : 9,
        "executionTimeMillisEstimate" : 0,
        "works" : 10,
        "advanced" : 9,
        "needTime" : 0,
        "needYield" : 0,
        "saveState" : 1,
        "restoreState" : 1,
        "isEOF" : 1,
        "keysExamined" : 10,
        "keyPattern" : {
            "_id" : 1
        },
        "indexName" : "_id_",
        "isMultiKey" : false,
        "multiKeyPaths" : {
            "_id" : []
        },
        "isUnique" : true,
        "isSparse" : false,
        "isPartial" : false,
        "indexVersion" : 2,
        "indexBounds" : {
            "startKey" : {
                "_id" : { "$minKey" : 1 }
            },
            "startKeyInclusive" : true,
            "endKey" : {
                "_id" : { "$maxKey" : 1 }
            },
            "endKeyInclusive" : true
        }
    }

相关问题