pandas 多索引DF聚集到嵌套Dict

dced5bon  于 2023-03-06  发布在  其他
关注(0)|答案(3)|浏览(143)

我试图通过所有索引的聚合和将多索引df转换为嵌套字典。
数据框的格式为

jan  feb
first second third          
bar   bar1   bar3     0    5
             bar4     4    4
      bar2   bar5     1    9
foo   foo1   foo3     7    2
             foo4     7    7
      foo2   foo5     0    3

我尝试单独聚合每个索引,并使用.to_dict('records')转换为dict,但无法实现每个索引聚合的嵌套字典,预期结果为

[
    {'first': 'bar',
     'jan': 5,
     'feb': 18,
     'sub_rows': [{'second': 'bar1',
                   'jan': 4,
                   'feb': 9,
                   'sub_rows': [{'third': 'bar3', 'jan': 0, 'feb': 5}, {'third': 'bar4', 'jan': 4, 'feb': 4}]
                   },
                  {'second': 'bar2',
                   'jan': 1,
                   'feb': 9,
                   'sub_rows': [{'third': 'bar5', 'jan': 1, 'feb': 9}]
                   }]
     },
    {'first': 'foo',
     'jan': 14,
     'feb': 12,
     'sub_rows': [{'second': 'foo1',
                   'jan': 14,
                   'feb': 9,
                   'sub_rows': [{'third': 'foo3', 'jan': 7, 'feb': 2}, {'third': 'foo4', 'jan': 7, 'feb': 7}]
                   },
                  {'second': 'foo2',
                   'jan': 0,
                   'feb': 3,
                   'sub_rows': [{'third': 'foo5', 'jan': 0, 'feb': 3}]
                   }]
     }
]

你能指导我实现这个格式吗?谢谢

vaj7vani

vaj7vani1#

import pandas as pd

def multiindex_df_to_list(df):
    out_list = []
    nlevels = df.index.nlevels
    level_name = df.index.levels[0].name if nlevels > 1 else df.index.name
    for i, level_value in enumerate(df.groupby(level=0)):
        out_list.append({})
        out_list[i][level_name] = level_value[0]
        for col, sum_val in level_value[1].sum().iteritems():
            out_list[i][col] = sum_val
        if nlevels > 1:
            out_list[i]['sub_rows'] = multiindex_df_to_list(level_value[1].droplevel(level=0))
    return out_list

def main():
    idx = pd.MultiIndex.from_arrays(
        [
            ['bar', 'bar', 'bar', 'foo', 'foo', 'foo'],
            ['bar1', 'bar1', 'bar2', 'foo1', 'foo1', 'foo2'],
            ['bar3', 'bar4', 'bar5', 'foo3', 'foo4', 'foo5']
        ],
        names=('first', 'second', 'third'))
    col = ['jan', 'feb']
    data = [[0, 5], [4, 4], [1, 9], [7, 2], [7, 7], [0, 3]]
    df = pd.DataFrame(data, idx, columns=col)

    out_list = multiindex_df_to_list(df)
    return out_list

if __name__ == "__main__":
    main()
alen0pnh

alen0pnh2#

使用列表作为参数的另一个递归答案:

def to_nested_dict(df, data=[]):
    lvl_name = df.index.get_level_values(0).name

    for name, subdf in df.groupby(level=0):
        d = {lvl_name: name} | subdf.sum().to_dict()
        data.append(d)
        if df.index.nlevels > 1:
            l = d.setdefault('sub_rows', [])
            to_nested_dict(subdf.loc[name], l)
    return data

data = to_nested_dict(df)

输出:

>>> data
[{'first': 'bar',
  'jan': 5,
  'feb': 18,
  'sub_rows': [{'second': 'bar1',
    'jan': 4,
    'feb': 9,
    'sub_rows': [{'third': 'bar3', 'jan': 0, 'feb': 5},
     {'third': 'bar4', 'jan': 4, 'feb': 4}]},
   {'second': 'bar2',
    'jan': 1,
    'feb': 9,
    'sub_rows': [{'third': 'bar5', 'jan': 1, 'feb': 9}]}]},
 {'first': 'foo',
  'jan': 14,
  'feb': 12,
  'sub_rows': [{'second': 'foo1',
    'jan': 14,
    'feb': 9,
    'sub_rows': [{'third': 'foo3', 'jan': 7, 'feb': 2},
     {'third': 'foo4', 'jan': 7, 'feb': 7}]},
   {'second': 'foo2',
    'jan': 0,
    'feb': 3,
    'sub_rows': [{'third': 'foo5', 'jan': 0, 'feb': 3}]}]}]
pw136qt2

pw136qt23#

sub_rows3=df.reset_index(level=2).apply(lambda ss:ss.to_dict(),axis=1).tolist()

df1=df.assign(sub_rows=sub_rows3)\
    .groupby(level=[0,1]).agg({'jan':sum,'feb':sum,"sub_rows":lambda ss:ss.tolist()})\
    .assign(second=lambda dd:dd.index.get_level_values(1))\
    .assign(sub_rows=lambda ss:ss.apply(lambda ss:ss.to_dict(),axis=1))\

df1.groupby(level=0).agg({'jan':sum,'feb':sum,"sub_rows":lambda ss:ss.tolist()})\
    .reset_index().to_dict('r')

输出:

[{'first': 'bar',
  'jan': 5,
  'feb': 18,
  'sub_rows': [{'jan': 4,
    'feb': 9,
    'sub_rows': [{'third': 'bar3', 'jan': 0, 'feb': 5},
     {'third': 'bar4', 'jan': 4, 'feb': 4}],
    'second': 'bar1'},
   {'jan': 1,
    'feb': 9,
    'sub_rows': [{'third': 'bar5', 'jan': 1, 'feb': 9}],
    'second': 'bar2'}]},
 {'first': 'foo',
  'jan': 14,
  'feb': 12,
  'sub_rows': [{'jan': 14,
    'feb': 9,
    'sub_rows': [{'third': 'foo3', 'jan': 7, 'feb': 2},
     {'third': 'foo4', 'jan': 7, 'feb': 7}],
    'second': 'foo1'},
   {'jan': 0,
    'feb': 3,
    'sub_rows': [{'third': 'foo5', 'jan': 0, 'feb': 3}],
    'second': 'foo2'}]}]

相关问题