将嵌套的JSON文件转换为pandas Dataframe

bq9c1y66  于 11个月前  发布在  其他
关注(0)|答案(1)|浏览(105)

我试图将嵌套的JSON文件与列表和字典转换为 Dataframe 。到目前为止,我已经写了这段代码:

for result in rawData['results']:
    openfda_data = result.get('openfda', {})
    # Collect 'openfda' data
    openfda_entries = {
        key: [value for value in values] if isinstance(values, list) else [values]
        for key, values in openfda_data.items()
    }

    # Flatten the 'submissions' data
    for submission in result.get('submissions', []):
        flattened_entry = {
            'application_number': result['application_number'],
            'sponsor_name': result['sponsor_name'],
            'submission_type': submission.get('submission_type', None),
            'submission_number': submission.get('submission_number', None),
            'submission_status': submission.get('submission_status', None),
            'submission_status_date': submission.get('submission_status_date', None),
            'submission_class_code': submission.get('submission_class_code', None),
            'submission_class_code_description': submission.get('submission_class_code_description', None)
        }

        # Add 'openfda' data
        for key, values in openfda_entries.items():
            flattened_entry[f'openfda_{key}'] = values  # Store all values in a list
        flattened_data.append(flattened_entry)
        # Flatten the 'products' data
    for product in result.get('products', []):
        flattened_product = {
            'application_number': result['application_number'],
            'sponsor_name': result['sponsor_name'],
            'product_number': product.get('product_number', None),
            'reference_drug': product.get('reference_drug', None),
            'brand_name': product.get('brand_name', None),
            'active_ingredient_name': product['active_ingredients'][0].get('name', None),
            'active_ingredient_strength': product['active_ingredients'][0].get('strength', None),
            'reference_standard': product.get('reference_standard', None),
            'dosage_form': product.get('dosage_form', None),
            'route': product.get('route', None),
            'marketing_status': product.get('marketing_status', None)
        }
        flattened_data.append(flattened_product)

字符串
这段代码能够flaten大部分的数据,以重新命名
问题在于openfda值。来自openfda条目的值在结果 Dataframe 中的列表中,如下所示:
[“首选关节炎8小时”、“醋氨酚- APAP 8小时”、“醋氨酚”、"关节炎疼痛缓解剂“、”首选关节炎疼痛缓解剂“、”醋氨酚- APAP关节炎“、”关节炎疼痛缓解剂“]
我还想把这些值变平,这样这个值也在列表之外。

"results": [
{
  "submissions": [
    {
      "submission_type": "ORIG",
      "submission_number": "1",
      "submission_status": "AP",
      "submission_status_date": "20021021"
    },
    {
      "submission_type": "SUPPL",
      "submission_number": "6",
      "submission_status": "AP",
      "submission_status_date": "20110610",
      "submission_class_code": "LABELING",
      "submission_class_code_description": "Labeling"
    }
  ],
  "application_number": "ANDA076177",
  "sponsor_name": "DR REDDYS LABS SA",
  "openfda": {
    "application_number": [
      "ANDA076177"
    ],
    "brand_name": [
      "CAMILA"
    ],
    "generic_name": [
      "NORETHINDRONE"
    ],
    "manufacturer_name": [
      "Mayne Pharma Inc."
    ],
    "product_ndc": [
      "51862-884"
    ],
    "product_type": [
      "HUMAN PRESCRIPTION DRUG"
    ],
    "route": [
      "ORAL"
    ],
    "substance_name": [
      "NORETHINDRONE"
    ],
    "rxcui": [
      "198042",
      "748961",
      "748962"
    ],
    "spl_id": [
      "022c6de4-4589-4d4c-8ac4-977c7e3ef568"
    ],
    "spl_set_id": [
      "a786be85-49ba-4369-b510-7dccc10f7f18"
    ],
    "package_ndc": [
      "51862-884-01",
      "51862-884-03"
    ],
    "nui": [
      "M0447349",
      "N0000175602"
    ],
    "pharm_class_cs": [
      "Progesterone Congeners [CS]"
    ],
    "pharm_class_epc": [
      "Progestin [EPC]"
    ],
    "unii": [
      "T18F433X4S"
    ]
  },
  "products": [
    {
      "product_number": "001",
      "reference_drug": "No",
      "brand_name": "CAMILA",
      "active_ingredients": [
        {
          "name": "NORETHINDRONE",
          "strength": "0.35MG"
        }
      ],
      "reference_standard": "No",
      "dosage_form": "TABLET",
      "route": "ORAL-28",
      "marketing_status": "Prescription",
      "te_code": "AB1"
    }
  ]
},
{
  "submissions": [
    {
      "submission_type": "SUPPL",
      "submission_number": "37",
      "submission_status": "AP",
      "submission_status_date": "20200707",
      "review_priority": "STANDARD",
      "submission_class_code": "LABELING",
      "submission_class_code_description": "Labeling"
    },
    {
      "submission_type": "SUPPL",
      "submission_number": "31",
      "submission_status": "AP",
      "submission_status_date": "20130910",
      "review_priority": "STANDARD",
      "submission_class_code": "LABELING",
      "submission_class_code_description": "Labeling"
    },
    {
      "submission_type": "SUPPL",
      "submission_number": "13",
      "submission_status": "AP",
      "submission_status_date": "20080129",
      "submission_class_code": "LABELING",
      "submission_class_code_description": "Labeling"
    },
    {
      "submission_type": "SUPPL",
      "submission_number": "5",
      "submission_status": "AP",
      "submission_status_date": "20040817",
      "submission_class_code": "LABELING",
      "submission_class_code_description": "Labeling"
    },
    {
      "submission_type": "SUPPL",
      "submission_number": "10",
      "submission_status": "AP",
      "submission_status_date": "20070110",
      "submission_class_code": "LABELING",
      "submission_class_code_description": "Labeling"
    }
  ],
  "application_number": "ANDA076194",
  "sponsor_name": "WATSON LABS",
  "openfda": {
    "application_number": [
      "ANDA076194"
    ],
    "brand_name": [
      "LISINOPRIL AND HYDROCHLOROTHIAZIDE"
    ],
    "generic_name": [
      "LISINOPRIL AND HYDROCHLOROTHIAZIDE"
    ],
    "manufacturer_name": [
      "Actavis Pharma, Inc."
    ],
    "product_ndc": [
      "0591-0860",
      "0591-0861",
      "0591-0862"
    ],
    "product_type": [
      "HUMAN PRESCRIPTION DRUG"
    ],
    "route": [
      "ORAL"
    ],
    "substance_name": [
      "HYDROCHLOROTHIAZIDE",
      "LISINOPRIL"
    ],
    "rxcui": [
      "197885",
      "197886",
      "197887"
    ],
    "spl_id": [
      "ff5e5610-775f-48a9-a410-e8d8b547b958"
    ],
    "spl_set_id": [
      "ab5e9ae4-e575-4a63-99f3-2bde36bdd508"
    ],
    "package_ndc": [
      "0591-0860-01",
      "0591-0860-05",
      "0591-0861-01",
      "0591-0861-05",
      "0591-0862-01",
      "0591-0862-05"
    ],
    "nui": [
      "N0000175359",
      "N0000175419",
      "M0471776"
    ],
    "pharm_class_pe": [
      "Increased Diuresis [PE]"
    ],
    "pharm_class_epc": [
      "Thiazide Diuretic [EPC]"
    ],
    "pharm_class_cs": [
      "Thiazides [CS]"
    ],
    "unii": [
      "E7199S1YWR",
      "0J48LPH2TH"
    ]
  },
  "products": [
    {
      "product_number": "003",
      "reference_drug": "No",
      "brand_name": "LISINOPRIL AND HYDROCHLOROTHIAZIDE",
      "active_ingredients": [
        {
          "name": "HYDROCHLOROTHIAZIDE",
          "strength": "12.5MG"
        },
        {
          "name": "LISINOPRIL",
          "strength": "10MG"
        }
      ],
      "reference_standard": "No",
      "dosage_form": "TABLET",
      "route": "ORAL",
      "marketing_status": "Prescription",
      "te_code": "AB"
    },
    {
      "product_number": "001",
      "reference_drug": "No",
      "brand_name": "LISINOPRIL AND HYDROCHLOROTHIAZIDE",
      "active_ingredients": [
        {
          "name": "HYDROCHLOROTHIAZIDE",
          "strength": "12.5MG"
        },
        {
          "name": "LISINOPRIL",
          "strength": "20MG"
        }
      ],
      "reference_standard": "No",
      "dosage_form": "TABLET",
      "route": "ORAL",
      "marketing_status": "Prescription",
      "te_code": "AB"
    },
    {
      "product_number": "002",
      "reference_drug": "No",
      "brand_name": "LISINOPRIL AND HYDROCHLOROTHIAZIDE",
      "active_ingredients": [
        {
          "name": "HYDROCHLOROTHIAZIDE",
          "strength": "25MG"
        },
        {
          "name": "LISINOPRIL",
          "strength": "20MG"
        }
      ],
      "reference_standard": "No",
      "dosage_form": "TABLET",
      "route": "ORAL",
      "marketing_status": "Prescription",
      "te_code": "AB"
    }
  ]
}
]


我有办法做到吗?

hmtdttj4

hmtdttj41#

我没有看到你提供的任何预期输出。但你可以使用这样的东西:

df = pd.json_normalize(json_file["results"])
for i in df.columns:
    df = df.explode(i)
df = df.join(pd.json_normalize(df.pop("submissions"))).reset_index(drop=True)
df = df.join(pd.json_normalize(df.pop("products"))).reset_index(drop=True)
df = df.explode("active_ingredients")
df = df.join(pd.json_normalize(df.pop("active_ingredients"))).reset_index(drop=True)
df = df.drop_duplicates()
len(df) # 3252

字符串

退出

|    | application_number   | sponsor_name      | openfda.application_number   | openfda.brand_name   | openfda.generic_name   | openfda.manufacturer_name   | openfda.product_ndc   | openfda.product_type    | openfda.route   | openfda.substance_name   |   openfda.rxcui | openfda.spl_id                       | openfda.spl_set_id                   | openfda.package_ndc   | openfda.nui   | openfda.pharm_class_cs      | openfda.pharm_class_epc   | openfda.unii   |   openfda.pharm_class_pe | submission_type   |   submission_number | submission_status   |   submission_status_date |   submission_class_code |   submission_class_code_description |   review_priority |   product_number | reference_drug   | brand_name   | reference_standard   | dosage_form   | route   | marketing_status   | te_code   | name          | strength   |
|---:|:---------------------|:------------------|:-----------------------------|:---------------------|:-----------------------|:----------------------------|:----------------------|:------------------------|:----------------|:-------------------------|----------------:|:-------------------------------------|:-------------------------------------|:----------------------|:--------------|:----------------------------|:--------------------------|:---------------|-------------------------:|:------------------|--------------------:|:--------------------|-------------------------:|------------------------:|------------------------------------:|------------------:|-----------------:|:-----------------|:-------------|:---------------------|:--------------|:--------|:-------------------|:----------|:--------------|:-----------|
|  0 | ANDA076177           | DR REDDYS LABS SA | ANDA076177                   | CAMILA               | NORETHINDRONE          | Mayne Pharma Inc.           | 51862-884             | HUMAN PRESCRIPTION DRUG | ORAL            | NORETHINDRONE            |          198042 | 022c6de4-4589-4d4c-8ac4-977c7e3ef568 | a786be85-49ba-4369-b510-7dccc10f7f18 | 51862-884-01          | M0447349      | Progesterone Congeners [CS] | Progestin [EPC]           | T18F433X4S     |                      nan | ORIG              |                   1 | AP                  |                 20021021 |                     nan |                                 nan |               nan |              001 | No               | CAMILA       | No                   | TABLET        | ORAL-28 | Prescription       | AB1       | NORETHINDRONE | 0.35MG     |
|  1 | ANDA076177           | DR REDDYS LABS SA | ANDA076177                   | CAMILA               | NORETHINDRONE          | Mayne Pharma Inc.           | 51862-884             | HUMAN PRESCRIPTION DRUG | ORAL            | NORETHINDRONE            |          198042 | 022c6de4-4589-4d4c-8ac4-977c7e3ef568 | a786be85-49ba-4369-b510-7dccc10f7f18 | 51862-884-01          | N0000175602   | Progesterone Congeners [CS] | Progestin [EPC]           | T18F433X4S     |                      nan | ORIG              |                   1 | AP                  |                 20021021 |                     nan |                                 nan |               nan |              001 | No               | CAMILA       | No                   | TABLET        | ORAL-28 | Prescription       | AB1       | NORETHINDRONE | 0.35MG     |
|  2 | ANDA076177           | DR REDDYS LABS SA | ANDA076177                   | CAMILA               | NORETHINDRONE          | Mayne Pharma Inc.           | 51862-884             | HUMAN PRESCRIPTION DRUG | ORAL            | NORETHINDRONE            |          198042 | 022c6de4-4589-4d4c-8ac4-977c7e3ef568 | a786be85-49ba-4369-b510-7dccc10f7f18 | 51862-884-03          | M0447349      | Progesterone Congeners [CS] | Progestin [EPC]           | T18F433X4S     |                      nan | ORIG              |                   1 | AP                  |                 20021021 |                     nan |                                 nan |               nan |              001 | No               | CAMILA       | No                   | TABLET        | ORAL-28 | Prescription       | AB1       | NORETHINDRONE | 0.35MG     |
|  3 | ANDA076177           | DR REDDYS LABS SA | ANDA076177                   | CAMILA               | NORETHINDRONE          | Mayne Pharma Inc.           | 51862-884             | HUMAN PRESCRIPTION DRUG | ORAL            | NORETHINDRONE            |          198042 | 022c6de4-4589-4d4c-8ac4-977c7e3ef568 | a786be85-49ba-4369-b510-7dccc10f7f18 | 51862-884-03          | N0000175602   | Progesterone Congeners [CS] | Progestin [EPC]           | T18F433X4S     |                      nan | ORIG              |                   1 | AP                  |                 20021021 |                     nan |                                 nan |               nan |              001 | No               | CAMILA       | No                   | TABLET        | ORAL-28 | Prescription       | AB1       | NORETHINDRONE | 0.35MG     |
|  4 | ANDA076177           | DR REDDYS LABS SA | ANDA076177                   | CAMILA               | NORETHINDRONE          | Mayne Pharma Inc.           | 51862-884             | HUMAN PRESCRIPTION DRUG | ORAL            | NORETHINDRONE            |          748961 | 022c6de4-4589-4d4c-8ac4-977c7e3ef568 | a786be85-49ba-4369-b510-7dccc10f7f18 | 51862-884-01          | M0447349      | Progesterone Congeners [CS] | Progestin [EPC]           | T18F433X4S     |                      nan | ORIG              |                   1 | AP                  |                 20021021 |                     nan |                                 nan |               nan |              001 | No               | CAMILA       | No                   | TABLET        | ORAL-28 | Prescription       | AB1       | NORETHINDRONE | 0.35MG     |


如果数据文件太大,我们可以尝试拆分框架:

import numpy as np
df = pd.json_normalize(json_file["results"])
splitted_dfs = np.array_split(df, 4)   
del df
final_df = pd.DataFrame()

for loop_df in splitted_dfs:
    for i in loop_df.columns:
        loop_df = loop_df.explode(i)
    loop_df = loop_df.join(pd.json_normalize(loop_df.pop("submissions"))).reset_index(drop=True)
    loop_df = loop_df.join(pd.json_normalize(loop_df.pop("products"))).reset_index(drop=True)
    loop_df = loop_df.explode("active_ingredients")
    loop_df = loop_df.join(pd.json_normalize(loop_df.pop("active_ingredients"))).reset_index(drop=True)
    final_df = pd.concat([final_df,loop_df])
    final_df = final_df.drop_duplicates()


或者我们可以把它添加到你写的代码中(我认为这是最快的方法):

df = pd.DataFrame(flattened_data)
final_df = pd.DataFrame()
splitted_dfs = np.array_split(df, 4)
del df

for loop_df in splitted_dfs:
    for i in loop_df.columns:
        loop_df = loop_df.explode(i)
    final_df = pd.concat([final_df,loop_df])
final_df = final_df.drop_duplicates()

相关问题