在Pandas DataFrame中将不匹配的行拆分为多行

thtygnil  于 2023-09-29  发布在  其他
关注(0)|答案(2)|浏览(101)

我想将每列中不匹配的值拆分到单独的行中,同时为新行保留“Gene.ID”列中的值。

import pandas as pd
data = {
    'Gene.ID': ['NZ_JAHWGH010000001.1_15', 'NZ_JAHWGH010000001.1_17', 'NZ_JAHWGH010000001.1_68', 'NZ_JAHWGH010000001.1_7', 'NZ_JAHWGH010000001.1_7', 'NZ_JAHWGH010000001.1_7', 'NZ_JAHWGH010000001.1_7', 'NZ_JAHWGH010000001.1_7','NZ_JAHWGH010000001.1_7'],
    'DIAMOND': ['SLH', 'GT2', 'GT2', 'CBM41', 'CBM48', 'GH11', 'GH13', 'GH13', ''],
    'HMMER': ['', 'GT2', 'GT2', 'CBM41', 'CBM41', 'GH13', 'GH13', '', 'GH13'],
    'dbCAN_sub': ['', 'GT2', 'GT2', 'CBM41', 'CBM41', 'CBM41', 'CBM48', '', 'GH13']
}

df = pd.DataFrame(data)

# Print the DataFrame
print(df)

结果应如下所示:

expected_data = {
    "Gene.ID": ["NZ_JAHWGH010000001.1_15", "NZ_JAHWGH010000001.1_17", "NZ_JAHWGH010000001.1_68", "NZ_JAHWGH010000001.1_7", "NZ_JAHWGH010000001.1_7", "NZ_JAHWGH010000001.1_7", "NZ_JAHWGH010000001.1_7", "NZ_JAHWGH010000001.1_7", "NZ_JAHWGH010000001.1_7", "NZ_JAHWGH010000001.1_7", "NZ_JAHWGH010000001.1_7", "NZ_JAHWGH010000001.1_7", "NZ_JAHWGH010000001.1_7"],
    "DIAMOND": ["SLH", "GT2", "GT2", "CBM41", "CBM48", "", "GH11", "", "", "GH13", "", "GH13",""],
    "HMMER": ["", "", "GT2", "CBM41", "", "CBM41", "", "GH13", "", "GH13", "", "", "GH13"],
    "dbCAN_sub": ["", "", "GT2", "CBM41", "", "CBM41", "", "", "CBM41", "", "CBM48", "", "GH13"]
}
expected_df = pd.DataFrame(expected_data)
print(expected_df)

我试过这个密码

import pandas as pd
print(df)
def g(df):
    for i in range(len(df)):
        if i == len(df) - 1:
            break
        if df.iloc[i, 0] == '':
            pass
        if df.iloc[i, 0] == df.iloc[i, 1]:
            pass
        if df.iloc[i, 0] != df.iloc[i, 1]:
            df.iloc[i, 1] = df.iloc[i+1, 1]
        if df.iloc[i, 1] == '':
            pass
        if df.iloc[i, 1] == df.iloc[i, 2]:
            pass
        if df.iloc[i, 1] != df.iloc[i, 2]:
            df.iloc[i, 2] = df.iloc[i+1, 2]
        
    return df

df = g(df.copy())
print(df)

但是,在拆分不匹配的值和为新行保留“Gene.ID”列方面,我面临着挑战。有人能帮我一个解决方案或建议一个更有效的方法来实现这一点吗?

fruv7luv

fruv7luv1#

您可以调整DataFrame的形状并重新聚合。
总而言之,stack要去掉空单元格,用groupby.ngroup分配一个新索引,然后pivot返回到2D:

out = (df.rename_axis(index='index', columns='col')
         .set_index('Gene.ID', append=True).replace('', np.nan)
         .stack().reset_index(name='value')
         .assign(index2=lambda d: d.groupby(['index', 'Gene.ID', 'value'], sort=False).ngroup())
         .pivot(index=['Gene.ID', 'index2'], columns='col', values='value')
         .fillna('').reset_index('Gene.ID').rename_axis(index=None, columns=None)
      )

输出量:

Gene.ID DIAMOND  HMMER dbCAN_sub
0   NZ_JAHWGH010000001.1_15     SLH                 
1   NZ_JAHWGH010000001.1_17     GT2    GT2       GT2
2   NZ_JAHWGH010000001.1_68     GT2    GT2       GT2
3    NZ_JAHWGH010000001.1_7   CBM41  CBM41     CBM41
4    NZ_JAHWGH010000001.1_7   CBM48                 
5    NZ_JAHWGH010000001.1_7          CBM41     CBM41
6    NZ_JAHWGH010000001.1_7    GH11                 
7    NZ_JAHWGH010000001.1_7           GH13          
8    NZ_JAHWGH010000001.1_7                    CBM41
9    NZ_JAHWGH010000001.1_7    GH13   GH13          
10   NZ_JAHWGH010000001.1_7                    CBM48
11   NZ_JAHWGH010000001.1_7    GH13                 
12   NZ_JAHWGH010000001.1_7           GH13      GH13
7cwmlq89

7cwmlq892#

是的,我用这个做的,对吗

import pandas as pd
import numpy as np

import pandas as pd
data = {
    'Gene.ID': ['NZ_JAHWGH010000001.1_15', 'NZ_JAHWGH010000001.1_17', 'NZ_JAHWGH010000001.1_68', 'NZ_JAHWGH010000001.1_7', 'NZ_JAHWGH010000001.1_7', 'NZ_JAHWGH010000001.1_7', 'NZ_JAHWGH010000001.1_7', 'NZ_JAHWGH010000001.1_7','NZ_JAHWGH010000001.1_7'],
    'DIAMOND': ['SLH', 'GT2', 'GT2', 'CBM41', 'CBM48', 'GH11', 'GH13', 'GH13', ''],
    'HMMER': ['', 'GT2', 'GT2', 'CBM41', 'CBM41', 'GH13', 'GH13', '', 'GH13'],
    'dbCAN_sub': ['', 'GT2', 'GT2', 'CBM41', 'CBM41', 'CBM41', 'CBM48', '', 'GH13']
}

df = pd.DataFrame(data)

# Print the DataFrame
print(df)

# Initialize a list to store the resulting rows
result_rows = []

# Iterate through each row in the DataFrame
for _, row in df.iterrows():
    gene_id = row['Gene.ID']
    unique_values = set(row[1:]) - {''}
    
    if len(unique_values) > 1:
        mismatched_values = [value for value in row[1:] if value != '']
        mismatched_set = set(mismatched_values)  # Convert to set to remove duplicates
        for mismatched_value in mismatched_set:
            result_row = row.copy()
            result_row[1:] = ['' if value != mismatched_value else value for value in row[1:]]
            result_rows.append(result_row)
    else:
        result_rows.append(row)

# Create a new DataFrame from the resulting rows
result_df = pd.DataFrame(result_rows)

# Reset the index of the new DataFrame
result_df.reset_index(drop=True, inplace=True)

# Print the output
print(result_df)

相关问题