删除30%的值为离群值pandas的行

kg7wmglp  于 2023-03-28  发布在  其他
关注(0)|答案(1)|浏览(125)

我已经写了一些非常低效的代码,如果能帮助我使用pandas方法而不是像我一样使用嵌套的“for”循环来提高效率,我将不胜感激。
我希望删除包含30%离群值的行我通过迭代每行然后遍历该行中的值来完成此操作然后计算该值的列的平均值和标准差以创建区间。i然后+=1计数器如果值不在区间中,则这是我使用的代码:

def outlier(df2):
    drop_rows = []
    for i,row in df2.iterrows():
        count = 0
        for col_name,value in row.items():
            col_mean = df2[col_name].mean()
            col_std = df2[col_name].std()
            interval = pd.Interval(col_mean - 2*col_std, col_mean + 2*col_std)
            if value not in interval:
                count += 1
        if count / len(row) >= 0.3:
            drop_rows.append(i)
    df2.drop(drop_rows, inplace=True)
    return df2

这将按预期删除行,我只是在寻找一种更快的方法来执行此操作
df2的示例如下:

{'ALogP98': [0.777, -0.183, 3.313, 3.24, 2.225], 'ES_Sum_dO': [0.0, 8.805, 0.0, 0.0, 0.0], 'ES_Sum_dssC': [0.0, 0.0, 0.0, 0.0, 0.0], 'ES_Sum_sCH3': [5.803, 1.444, 4.116, 10.781, 6.017], 'ES_Sum_ssCH2': [1.446, 0.0, 4.727, 3.83, 3.537], 'ES_Count_dO': [0, 1, 0, 0, 0], 'ES_Count_dssC': [0, 0, 0, 0, 0], 'ES_Count_sCH3': [3, 1, 2, 5, 3], 'ES_Count_ssCH2': [2, 0, 5, 4, 4], 'CoordDimension': [3, 2, 2, 2, 2], 'LogD': [0.777, -0.183, 3.313, 3.24, 2.225], 'Molecular_Solubility': [-0.883, 0.168, -4.081, -3.227, -2.739], 'HBA_Count': [2, 1, 2, 2, 2], 'NPlusO_Count': [2, 1, 2, 2, 2], 'Num_Atoms': [8, 3, 16, 14, 12], 'Num_Bonds': [7, 2, 16, 13, 11], 'Num_Hydrogens': [14, 4, 22, 26, 20], 'Num_ExplicitHydrogens': [14, 4, 22, 26, 20], 'Num_ExplicitAtoms': [22, 7, 38, 40, 32], 'Num_ExplicitBonds': [21, 6, 38, 39, 31], 'Num_RingBonds': [0, 0, 6, 0, 0], 'Num_RotatableBonds': [4, 0, 8, 8, 7], 'Num_Rings': [0, 0, 1, 0, 0], 'Num_RingAssemblies': [0, 0, 1, 0, 0], 'Num_Rings6': [0, 0, 1, 0, 0], 'Num_Chains': [16, 5, 24, 30, 22], 'Num_ChainAssemblies': [1, 1, 6, 1, 1], 'Num_Fragments': [1, 1, 1, 1, 1], 'Num_ComplexedFragments': [1, 1, 1, 1, 1], 'Num_SingleBonds': [7, 1, 13, 13, 10], 'Num_DoubleBonds': [0, 1, 3, 0, 1], 'Num_AliphaticSingleBonds': [7, 1, 10, 13, 10], 'Num_AliphaticDoubleBonds': [0, 1, 0, 0, 1], 'Num_AtomClasses': [5, 3, 14, 7, 12], 'Num_H_Acceptors': [2, 1, 2, 2, 2], 'Num_H_Acceptors_Lipinski': [2, 1, 2, 2, 2], 'Organic_Count': [8, 3, 16, 14, 12], 'Molecular_FractionalPolarSurfaceArea': [0.126, 0.286, 0.075, 0.075, 0.089], 'Molecular_FractionalPolarSASA': [0.092, 0.221, 0.061, 0.065, 0.071], 'BIC': [0.80146, 1.0, 0.74286, 0.57316, 0.79649], 'CIC': [0.75, -1e-05, 0.84436, 1.6864, 0.72957], 'IAC_Mean': [1.24067, 1.37878, 1.21081, 1.14115, 1.19819], 'IAC_Total': [27.2948, 9.65148, 46.0108, 45.6461, 38.3421], 'IC': [2.25, 1.58496, 3.15563, 2.12095, 2.85538], 'SIC': [0.75, 1.0, 0.7889, 0.55706, 0.79649], 'CHI_0': [6.40577, 2.7071, 11.64, 10.9747, 9.2342], 'CHI_1': [3.80806, 1.41421, 7.82569, 6.51974, 5.80806], 'CHI_2': [2.68252, 0.7071, 6.03589, 5.75467, 4.09673], 'CHI_3_C': [0.28867, 0.0, 0.49279, 1.10517, 0.28867], 'CHI_3_P': [1.56294, 0.0, 4.33451, 2.95679, 2.58195], 'CHI_V_0': [5.80806, 1.98559, 10.3161, 10.377, 8.37697], 'CHI_V_1': [3.04031, 0.81305, 6.09744, 5.752, 4.69014], 'CHI_V_2': [1.47831, 0.2357, 3.89836, 4.67425, 2.56209], 'CHI_V_3_C': [0.09622, 0.0, 0.21407, 0.91272, 0.09622], 'CHI_V_3_P': [0.80274, 0.0, 2.43657, 2.09552, 1.44082], 'JX': [2.69601, 2.04654, 1.98977, 3.08772, 2.97687], 'JY': [2.99253, 2.23346, 2.08558, 3.28157, 3.17827], 'Kappa_1': [8.0, 3.0, 14.0625, 14.0, 12.0], 'Kappa_1_AM': [7.92, 2.67, 13.206, 13.92, 11.66], 'Kappa_2': [5.14285, 2.0, 9.07407, 8.31999, 9.0909], 'Kappa_2_AM': [5.0645, 1.67, 8.31033, 8.24664, 8.7538], 'Kappa_3': [5.0, 0.0, 7.05817, 11.0, 9.0], 'Kappa_3_AM': [4.92, 0.0, 6.36993, 10.92, 8.65999], 'PHI': [5.01386, 1.4863, 6.85916, 8.19952, 8.50578], 'SC_0': [8.0, 3.0, 16.0, 14.0, 12.0], 'SC_1': [7.0, 2.0, 16.0, 13.0, 11.0], 'SC_2': [7.0, 1.0, 18.0, 15.0, 11.0], 'SC_3_C': [1.0, 0.0, 2.0, 3.0, 1.0], 'SC_3_P': [6.0, 0.0, 19.0, 12.0, 10.0]}
eqqqjvef

eqqqjvef1#

根据您的需求调整this答案,您可以利用矢量化操作:

import pandas as pd
import numpy as np
from scipy import stats

df = pd.DataFrame(
 {'ALogP98': [0.777, -0.183, 3.313, 3.24, 2.225], 'ES_Sum_dO': [0.0, 8.805, 0.0, 0.0, 0.0], 'ES_Sum_dssC': [0.0, 0.0, 0.0, 0.0, 0.0], 'ES_Sum_sCH3': [5.803, 1.444, 4.116, 10.781, 6.017], 'ES_Sum_ssCH2': [1.446, 0.0, 4.727, 3.83, 3.537], 'ES_Count_dO': [0, 1, 0, 0, 0], 'ES_Count_dssC': [0, 0, 0, 0, 0], 'ES_Count_sCH3': [3, 1, 2, 5, 3], 'ES_Count_ssCH2': [2, 0, 5, 4, 4], 'CoordDimension': [3, 2, 2, 2, 2], 'LogD': [0.777, -0.183, 3.313, 3.24, 2.225], 'Molecular_Solubility': [-0.883, 0.168, -4.081, -3.227, -2.739], 'HBA_Count': [2, 1, 2, 2, 2], 'NPlusO_Count': [2, 1, 2, 2, 2], 'Num_Atoms': [8, 3, 16, 14, 12], 'Num_Bonds': [7, 2, 16, 13, 11], 'Num_Hydrogens': [14, 4, 22, 26, 20], 'Num_ExplicitHydrogens': [14, 4, 22, 26, 20], 'Num_ExplicitAtoms': [22, 7, 38, 40, 32], 'Num_ExplicitBonds': [21, 6, 38, 39, 31], 'Num_RingBonds': [0, 0, 6, 0, 0], 'Num_RotatableBonds': [4, 0, 8, 8, 7], 'Num_Rings': [0, 0, 1, 0, 0], 'Num_RingAssemblies': [0, 0, 1, 0, 0], 'Num_Rings6': [0, 0, 1, 0, 0], 'Num_Chains': [16, 5, 24, 30, 22], 'Num_ChainAssemblies': [1, 1, 6, 1, 1], 'Num_Fragments': [1, 1, 1, 1, 1], 'Num_ComplexedFragments': [1, 1, 1, 1, 1], 'Num_SingleBonds': [7, 1, 13, 13, 10], 'Num_DoubleBonds': [0, 1, 3, 0, 1], 'Num_AliphaticSingleBonds': [7, 1, 10, 13, 10], 'Num_AliphaticDoubleBonds': [0, 1, 0, 0, 1], 'Num_AtomClasses': [5, 3, 14, 7, 12], 'Num_H_Acceptors': [2, 1, 2, 2, 2], 'Num_H_Acceptors_Lipinski': [2, 1, 2, 2, 2], 'Organic_Count': [8, 3, 16, 14, 12], 'Molecular_FractionalPolarSurfaceArea': [0.126, 0.286, 0.075, 0.075, 0.089], 'Molecular_FractionalPolarSASA': [0.092, 0.221, 0.061, 0.065, 0.071], 'BIC': [0.80146, 1.0, 0.74286, 0.57316, 0.79649], 'CIC': [0.75, -1e-05, 0.84436, 1.6864, 0.72957], 'IAC_Mean': [1.24067, 1.37878, 1.21081, 1.14115, 1.19819], 'IAC_Total': [27.2948, 9.65148, 46.0108, 45.6461, 38.3421], 'IC': [2.25, 1.58496, 3.15563, 2.12095, 2.85538], 'SIC': [0.75, 1.0, 0.7889, 0.55706, 0.79649], 'CHI_0': [6.40577, 2.7071, 11.64, 10.9747, 9.2342], 'CHI_1': [3.80806, 1.41421, 7.82569, 6.51974, 5.80806], 'CHI_2': [2.68252, 0.7071, 6.03589, 5.75467, 4.09673], 'CHI_3_C': [0.28867, 0.0, 0.49279, 1.10517, 0.28867], 'CHI_3_P': [1.56294, 0.0, 4.33451, 2.95679, 2.58195], 'CHI_V_0': [5.80806, 1.98559, 10.3161, 10.377, 8.37697], 'CHI_V_1': [3.04031, 0.81305, 6.09744, 5.752, 4.69014], 'CHI_V_2': [1.47831, 0.2357, 3.89836, 4.67425, 2.56209], 'CHI_V_3_C': [0.09622, 0.0, 0.21407, 0.91272, 0.09622], 'CHI_V_3_P': [0.80274, 0.0, 2.43657, 2.09552, 1.44082], 'JX': [2.69601, 2.04654, 1.98977, 3.08772, 2.97687], 'JY': [2.99253, 2.23346, 2.08558, 3.28157, 3.17827], 'Kappa_1': [8.0, 3.0, 14.0625, 14.0, 12.0], 'Kappa_1_AM': [7.92, 2.67, 13.206, 13.92, 11.66], 'Kappa_2': [5.14285, 2.0, 9.07407, 8.31999, 9.0909], 'Kappa_2_AM': [5.0645, 1.67, 8.31033, 8.24664, 8.7538], 'Kappa_3': [5.0, 0.0, 7.05817, 11.0, 9.0], 'Kappa_3_AM': [4.92, 0.0, 6.36993, 10.92, 8.65999], 'PHI': [5.01386, 1.4863, 6.85916, 8.19952, 8.50578], 'SC_0': [8.0, 3.0, 16.0, 14.0, 12.0], 'SC_1': [7.0, 2.0, 16.0, 13.0, 11.0], 'SC_2': [7.0, 1.0, 18.0, 15.0, 11.0], 'SC_3_C': [1.0, 0.0, 2.0, 3.0, 1.0], 'SC_3_P': [6.0, 0.0, 19.0, 12.0, 10.0]}
)

# Compute percentage of row values that are NOT outliers
pct_row_values_not_outliers = (np.abs(stats.zscore(df, axis=0)) < 2).sum(axis=1) / df.shape[1]

# Drop dataframe rows where percentage of outliers is 30% or more
df[pct_row_values_not_outliers > 0.7]

相关问题