R语言 消除每列的异常值

bbmckpt7  于 2023-04-03  发布在  其他
关注(0)|答案(1)|浏览(138)

对于df.nodup中的每一列,我希望消除小于第25百分位数或大于第75百分位数的值。

# Asker's attempt (fails as posted): for each column of df.nodup, keep only the
# rows whose value lies strictly between Q1 - 1.5*IQR and Q3 + 1.5*IQR.
for(i in 1:ncol(df.nodup)) {       # for-loop over columns
  # Q holds the 25th and 75th percentiles of column i (missing values dropped).
  Q <- quantile(df.nodup[ , i], probs=c(.25, .75), na.rm=T)
  # Unlike the quantile() call above, IQR() is called here without na.rm.
  iqr <- IQR(df.nodup[ , i])
  # Q is a plain two-element vector, so the two-index form Q[ , i] raises
  # "incorrect number of dimensions"; its elements are Q[1] and Q[2].
  # NOTE(review): df.sig is rebuilt from the full df.nodup on every pass, so
  # only the filter for the last column would survive the loop.
  df.sig <- subset(df.nodup, df.nodup[ , i] > (Q[ , i][1] - 1.5*iqr) & df.nodup[ , i] < (Q[ , i][2]+1.5*iqr))
}

追溯:

Error in Q[, i] : incorrect number of dimensions

数据:

> dput(df.nodup[1:20,1:5])
structure(list(C_1 = c(1.02, 1.02, 0.49, 0.78, 0.66, 0.73, 0.69, 
0.63, 0.71, 0.71, 0.83, 0.87, 0.79, 0.78, 0.63, 0.58, 0.78, 0.72, 
0.73, 0.68), C_2 = c(0.86, 0.71, 0.48, 0.71, 0.62, 0.61, 0.6, 
0.63, 0.61, 0.86, 0.71, 0.92, 0.72, 0.79, 0.5, 0.56, 0.63, 0.59, 
0.63, 0.61), C_3 = c(0.67, 0.87, 0.58, 0.7, 0.64, 0.66, 0.63, 
0.76, 0.63, 0.77, 0.79, 0.96, 0.75, 1.22, 0.61, 0.65, 0.6, 0.79, 
0.64, 0.67), C_4 = c(1.3, 1, 0.75, 1.06, 0.78, 0.88, 0.98, 1.03, 
0.96, 1, 1.01, 1.14, 1.12, 1.01, 0.77, 0.84, 0.97, 0.8, 0.86, 
1.04), D_5 = c(0.73, 0.92, 0.66, 0.71, 0.75, 0.68, 0.65, 0.64, 
0.65, 0.95, 0.8, 1, 0.74, 1.02, 0.55, 0.67, 0.73, 0.75, 0.66, 
0.63)), row.names = c("AAAS", "AAK1", "AAMDC", "AARS", "AASDHPPT", 
"ABCD3", "ABCE1", "ABCF1", "ABHD10", "ABI1", "ACAA1", "ACACA", 
"ACAD9", "ACADVL", "ACBD3", "ACIN1", "ACLY", "ACO1", "ACO2", 
"ACOT7"), class = "data.frame")
46qrfjad

46qrfjad1#

我们可以在lapply中使用replace

# Set to NA every value that falls outside its own column's interquartile
# range, i.e. below the 25th or above the 75th percentile. Assigning into
# `df[]` replaces the columns while keeping the data-frame shape and row names.
df[] <- lapply(df, \(x) {
  # na.rm = TRUE: quantile() stops with an error if the column contains
  # missing values and na.rm is FALSE.
  q <- quantile(x, c(.25, .75), na.rm = TRUE)
  replace(x, x < q[1] | x > q[2], NA_real_)
})

df
#           C_1  C_2  C_3  C_4  D_5
# AAAS       NA   NA 0.67   NA 0.73
# AAK1       NA 0.71   NA 1.00   NA
# AAMDC      NA   NA   NA   NA 0.66
# AARS     0.78 0.71 0.70   NA 0.71
# AASDHPPT   NA 0.62 0.64   NA 0.75
# ABCD3    0.73 0.61 0.66 0.88 0.68
# ABCE1    0.69   NA   NA 0.98   NA
# ABCF1      NA 0.63 0.76 1.03   NA
# ABHD10   0.71 0.61   NA 0.96   NA
# ABI1     0.71   NA 0.77 1.00   NA
# ACAA1      NA 0.71   NA 1.01   NA
# ACACA      NA   NA   NA   NA   NA
# ACAD9      NA   NA 0.75   NA 0.74
# ACADVL   0.78   NA   NA 1.01   NA
# ACBD3      NA   NA   NA   NA   NA
# ACIN1      NA   NA 0.65   NA 0.67
# ACLY     0.78 0.63   NA 0.97 0.73
# ACO1     0.72   NA   NA   NA 0.75
# ACO2     0.73 0.63 0.64 0.86 0.66
# ACOT7    0.68 0.61 0.67   NA   NA

然而,使用Tukey的方法可能更好:

# Tukey's rule, applied column by column: a value is set to NA when it lies
# more than 1.5 * IQR below the first quartile or above the third quartile.
# Missing values are ignored when the fences are computed (na.rm is TRUE by
# default).
df[] <- lapply(df, \(x, na.rm = TRUE) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm)
  whisker <- 1.5 * IQR(x, na.rm = na.rm)
  lower_fence <- quartiles[1] - whisker
  upper_fence <- quartiles[2] + whisker
  is_outlier <- x < lower_fence | x > upper_fence
  x[is_outlier] <- NA_real_
  x
})

df
#           C_1  C_2  C_3  C_4  D_5
# AAAS       NA 0.86 0.67   NA 0.73
# AAK1       NA 0.71 0.87 1.00   NA
# AAMDC      NA 0.48 0.58 0.75 0.66
# AARS     0.78 0.71 0.70 1.06 0.71
# AASDHPPT 0.66 0.62 0.64 0.78 0.75
# ABCD3    0.73 0.61 0.66 0.88 0.68
# ABCE1    0.69 0.60 0.63 0.98 0.65
# ABCF1    0.63 0.63 0.76 1.03 0.64
# ABHD10   0.71 0.61 0.63 0.96 0.65
# ABI1     0.71 0.86 0.77 1.00   NA
# ACAA1    0.83 0.71 0.79 1.01 0.80
# ACACA    0.87   NA 0.96 1.14   NA
# ACAD9    0.79 0.72 0.75 1.12 0.74
# ACADVL   0.78 0.79   NA 1.01   NA
# ACBD3    0.63 0.50 0.61 0.77 0.55
# ACIN1    0.58 0.56 0.65 0.84 0.67
# ACLY     0.78 0.63 0.60 0.97 0.73
# ACO1     0.72 0.59 0.79 0.80 0.75
# ACO2     0.73 0.63 0.64 0.86 0.66
# ACOT7    0.68 0.61 0.67 1.04 0.63

**注意:** 有许多方法可以删除离群值,但请始终确保您能够论证某个观测值为什么可能是离群值。

**数据:**
# Reproducible sample data: 20 rows (named by row.names) and 5 numeric
# columns (C_1 ... D_5), written in dput() format.
df <- structure(list(C_1 = c(1.02, 1.02, 0.49, 0.78, 0.66, 0.73, 0.69, 
0.63, 0.71, 0.71, 0.83, 0.87, 0.79, 0.78, 0.63, 0.58, 0.78, 0.72, 
0.73, 0.68), C_2 = c(0.86, 0.71, 0.48, 0.71, 0.62, 0.61, 0.6, 
0.63, 0.61, 0.86, 0.71, 0.92, 0.72, 0.79, 0.5, 0.56, 0.63, 0.59, 
0.63, 0.61), C_3 = c(0.67, 0.87, 0.58, 0.7, 0.64, 0.66, 0.63, 
0.76, 0.63, 0.77, 0.79, 0.96, 0.75, 1.22, 0.61, 0.65, 0.6, 0.79, 
0.64, 0.67), C_4 = c(1.3, 1, 0.75, 1.06, 0.78, 0.88, 0.98, 1.03, 
0.96, 1, 1.01, 1.14, 1.12, 1.01, 0.77, 0.84, 0.97, 0.8, 0.86, 
1.04), D_5 = c(0.73, 0.92, 0.66, 0.71, 0.75, 0.68, 0.65, 0.64, 
0.65, 0.95, 0.8, 1, 0.74, 1.02, 0.55, 0.67, 0.73, 0.75, 0.66, 
0.63)), row.names = c("AAAS", "AAK1", "AAMDC", "AARS", "AASDHPPT", 
"ABCD3", "ABCE1", "ABCF1", "ABHD10", "ABI1", "ACAA1", "ACACA", 
"ACAD9", "ACADVL", "ACBD3", "ACIN1", "ACLY", "ACO1", "ACO2", 
"ACOT7"), class = "data.frame")

相关问题