我有一个大型的多索引 Dataframe ,我想根据另一组列中的值填充一列的值。看起来像是将 Dataframe 或字典传递给groupby。fillna()强制您匹配索引。我如何仅使用groupby值作为索引而忽略 Dataframe 索引来填充值?
import pandas as pd
import numpy as np
nan = np.nan
def print_df(df):
with pd.option_context('display.max_rows', None,
'display.max_columns', None,
):
print(df)
d1 = {'I1': {0: 1, 1: 1, 2: 2, 3: 2, 4: 2, 5: 2, 6: 2, 7: 2, 8: 2, 9: 2},
'I2': {0: 1, 1: 2, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 2, 9: 2},
'I3': {0: 1, 1: 1, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 1, 9: 2},
'A': {0: 1, 1: 1, 2: 2, 3: 2, 4: 2, 5: 2, 6: 1, 7: 1, 8: 1, 9: 1},
'B': {0: 3, 1: 2, 2: 1, 3: 2, 4: 3, 5: 3, 6: 2, 7: 2, 8: 1, 9: 3},
'C': {0: 2, 1: 1, 2: 2, 3: 1, 4: 2, 5: 1, 6: 1, 7: 1, 8: 2, 9: 1},
'D': {0: nan, 1: nan, 2: 7.0, 3: nan, 4: nan, 5: 8.0, 6: 8.0, 7: 4.0, 8: 1.0, 9: nan},
'E': {0: nan, 1: nan, 2: 1.0, 3: nan, 4: nan, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0, 9: nan}}
df1 = pd.DataFrame(d1)
df1.set_index(["I1","I2","I3"], inplace=True)
d2 = {'A': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 2, 7: 2, 8: 2, 9: 2, 10: 2, 11: 2},
'B': {0: 1, 1: 1, 2: 2, 3: 2, 4: 3, 5: 3, 6: 1, 7: 1, 8: 2, 9: 2, 10: 3, 11: 3},
'C': {0: 1, 1: 2, 2: 1, 3: 2, 4: 1, 5: 2, 6: 1, 7: 2, 8: 1, 9: 2, 10: 1, 11: 2},
'D': {0: 5, 1: 1, 2: 4, 3: 3, 4: 2, 5: 2, 6: 3, 7: 5, 8: 5, 9: 4, 10: 4, 11: 3},
'E': {0: 5, 1: 5, 2: 5, 3: 5, 4: 5, 5: 5, 6: 5, 7: 5, 8: 5, 9: 5, 10: 5, 11: 5}}
df2 =pd.DataFrame(d2)
df2.set_index(["A","B","C"], inplace=True)
print("dataframe with values to impute")
print_df(df1)
print("lookup values to fill dataframe")
print_df(df2)
# what I expected to work that is instead using the I1 I2 I3 index
df1.groupby(["A","B","C"]).fillna(df2.to_dict())
这是预期输出(**显示插补行):
data frame with values to impute
A B C D E
I1 I2 I3
1 1 1 1 3 2 2.0 5.0 **
2 1 1 2 1 4.0 5.0 **
2 1 1 2 1 2 7.0 1.0
上面显示的相关查找值仅供参考
D E
A B C
1 1 1 5 5
2 1 5
2 1 4 5**
2 3 5
3 1 2 5
2 2 5**
1条答案
按热度按时间uajslkp61#
我将首先使用
merge
对齐这两个DataFrame,因为它们不共享相同的索引:输出: