R语言 从相关对图中移除离群值

a0x5cqrl  于 2023-06-03  发布在  其他
关注(0)|答案(1)|浏览(222)

我有多个相关性,并使用pairs图绘制它们。其中一些包含“离群值”,在我的情况下,我想删除数据点,并重新绘制没有它们的数据。这里有一个例子来说明我的意思:

数据如下:

structure(list(CS_ERP = c(0.592217399922478, -0.203897301263363, 
0.338222381057887, 0.426894945652707, -0.30737569188464, -0.795959183126649, 
1.08640821731442, 0.436824331063638, 0.177559186752863, 0.722561317859706, 
0.53865872750591, 0.733560773038051, -0.132344192183052, 0.453103863990389, 
-0.111034068702735, 0.650410660832976, 0.342818054696555, 1.52017381893815, 
0.444799662033882, 0.0671036424041152, -0.215260420488558, -0.0270506388385066, 
0.851214146724932, 0.00273742867519521, 0.127085247367676, 0.523553816764235, 
0.535009107850034, -0.644372558676884, -0.432404000319374, -0.0677329504784329, 
0.873902779830881, 1.03822938157827, -0.514942928309014, 1.56161080831987, 
-1.88131135531881, 0.235716700658808, 0.799704960755054, 1.73208222356446, 
0.247784710165136, -0.0143066950525258, -0.473170631517114, 0.302272078688994, 
-0.35980392515202, 0.204326536400011, -0.490551073482312, 1.59651048122808, 
0.943766780480849, -0.184987582447942, 0.17987890568414, 0.150666299610195, 
0.282742988261302, 0.389264036959123, -0.080297860039224, 1.93563704248842, 
-0.365884460671325, 0.959229734481468, -1.47119089191046, -0.444094080641722, 
-0.636361360505438, 0.631361093009673, 2.62946532292634, 0.798510108404532, 
0.718327415177233, 1.50947695158847, 0.990343190301586, 1.44815515537459, 
1.44766809083971, 0.782650510573793, 1.88087173450267, 0.375954708150476, 
1.17043032278192, 0.72685383252287, 1.07555437410217, 0.985200032226283, 
-0.477185205402539, -1.04253562875013, -0.0190684334078904, 0.768789276763837, 
-0.38064050743836, 0.405895629096464, 0.267382730292199, 1.05413737126274, 
1.50350778531938, 0.311460646558454), CS_TF = c(-0.132065184642606, 
0.0367110238359321, 35.3210468915716, 0.153679818105899, 0.0332652037878963, 
-1.86496209543666, -1.07441876044758, -0.180493420311007, 0.00866968936464811, 
0.0552816343285741, -0.0234809689770028, -0.158685173657395, 
0.0700545324982749, 0.56175897135826, 0.0143549285329569, -0.128676438809003, 
0.0339896066570138, -0.0791169538548759, 0.0946317926108172, 
-0.172322368094037, 3.50860496405027, 0.379458772083521, -0.499920059769576, 
0.00685456174939309, 0.0287140297510314, 0.128580438921991, -0.000884474552408394, 
-0.0484650246614444, 0.279520102626941, -0.0233536174060131, 
0.166626389353964, -0.194576767577814, 0.0392959826687796, 0.0329681365611348, 
-0.25676650339929, -0.103471550040564, 0.466462291090004, 0.157227389587046, 
-0.0403901678275274, 0.0682499702171806, 0.130220474391507, -0.110553995224786, 
0.108862414312962, -0.209016224415964, 0.0422840704708455, -0.724812540310788, 
0.0732317891725166, -1.39006752637282, 0.840210152734346, -0.35371878446761, 
0.152698664646824, -0.0859570353609108, -0.0604200987838616, 
-0.240933204990605, 0.0161540243978887, 0.0374657611151743, 0.0874550801411632, 
-0.392507595478393, -1.02392865589898, 0.0807127255980967, -0.307310315199072, 
-0.0355848726007914, -0.347079319313953, -0.158586967635016, 
-0.158793766539776, 0.335271838499242, 0.362318748691139, 0.70789771606091, 
-0.330594118377962, 0.0752984671467694, -0.195902998158877, 0.0638325662398797, 
0.0403354942131794, -0.0683323188130881, 0.287268128823113, 0.056132792250317, 
0.0690000898535413, 0.185228611319422, -0.1999283340051, 0.209462600992504, 
0.0759982221221363, 0.566169094370387, 0.126884703286675, -0.0346748592683052
)), class = "data.frame", row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 
33L, 34L, 35L, 36L, 37L, 38L, 39L, 49L, 50L, 51L, 52L, 53L, 54L, 
55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 
68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 
81L, 82L, 83L, 84L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L
))

我查找了其他问题,并尝试了以下内容:

#fit the linear model and extract the residuals:
resid_CS <- resid(mod <- lm(Corr_data_CS_noOut$CS_ERP ~ Corr_data_CS_noOut$CS_TF, 
                            data = Corr_data_CS_noOut))

# quantile() gives the required quantiles of the residuals.
#If retaining 90% of the data, then we want the upper and lower 0.05 quantiles:
resid.qt <- quantile(resid_CS, probs = c(0.05,0.95))

#select the observations with residuals in the middle 90%
want <- which(resid_CS >= resid.qt[1] & resid_CS <= resid.qt[2]) 

#visualise this, with the red points being those we will retain:
plot(Corr_data_CS_noOut, type = "n")
points(Corr_data_CS_noOut[-want,], col = "black", pch = 21, bg = "black", cex = 0.8)
points(Corr_data_CS_noOut[want,], col = "red", pch = 21, bg = "red", cex = 0.8)
abline(mod, col = "blue", lwd = 2)

以及:

#Try using the absolute residuals:
ares <- abs(resid_CS)
absres.qt <- quantile(ares, prob = c(.9))
abswant <- which(ares <= absres.qt)
## plot 
plot(Corr_data_CS_noOut, type = "n")
points(Corr_data_CS_noOut[-abswant,], col = "black", pch = 21, bg = "black", cex = 0.8)
points(Corr_data_CS_noOut[abswant,], col = "red", pch = 21, bg = "red", cex = 0.8)
abline(mod, col = "blue", lwd = 2)

但是这两个都删除了y轴“两端”的点,留下了异常值:

我如何删除这个离群值,有没有办法将其应用到整个数据中(在这种情况下我有三个变量),然后用下面的方法使用pairs绘制它?

下面是我的pairs代码:

pairs(x = Corr_data_noppt[1:3], 
      diag.panel = labels,
      text.panel = my.text.panel,
      lower.panel = panel,
      font.labels = 1, 
      cex.labels = 1.2,
      label.pos = 0.5,
      upper.panel = NULL)
xfb7svmp

xfb7svmp1#

一个可能的选项是识别并移除max(Corr_data_CS_noOut$CS_TF)值,即

Corr_data_CS_noOut <- structure(list(CS_ERP = c(0.592217399922478, -0.203897301263363, 
                                0.338222381057887, 0.426894945652707, -0.30737569188464, -0.795959183126649, 
                                1.08640821731442, 0.436824331063638, 0.177559186752863, 0.722561317859706, 
                                0.53865872750591, 0.733560773038051, -0.132344192183052, 0.453103863990389, 
                                -0.111034068702735, 0.650410660832976, 0.342818054696555, 1.52017381893815, 
                                0.444799662033882, 0.0671036424041152, -0.215260420488558, -0.0270506388385066, 
                                0.851214146724932, 0.00273742867519521, 0.127085247367676, 0.523553816764235, 
                                0.535009107850034, -0.644372558676884, -0.432404000319374, -0.0677329504784329, 
                                0.873902779830881, 1.03822938157827, -0.514942928309014, 1.56161080831987, 
                                -1.88131135531881, 0.235716700658808, 0.799704960755054, 1.73208222356446, 
                                0.247784710165136, -0.0143066950525258, -0.473170631517114, 0.302272078688994, 
                                -0.35980392515202, 0.204326536400011, -0.490551073482312, 1.59651048122808, 
                                0.943766780480849, -0.184987582447942, 0.17987890568414, 0.150666299610195, 
                                0.282742988261302, 0.389264036959123, -0.080297860039224, 1.93563704248842, 
                                -0.365884460671325, 0.959229734481468, -1.47119089191046, -0.444094080641722, 
                                -0.636361360505438, 0.631361093009673, 2.62946532292634, 0.798510108404532, 
                                0.718327415177233, 1.50947695158847, 0.990343190301586, 1.44815515537459, 
                                1.44766809083971, 0.782650510573793, 1.88087173450267, 0.375954708150476, 
                                1.17043032278192, 0.72685383252287, 1.07555437410217, 0.985200032226283, 
                                -0.477185205402539, -1.04253562875013, -0.0190684334078904, 0.768789276763837, 
                                -0.38064050743836, 0.405895629096464, 0.267382730292199, 1.05413737126274, 
                                1.50350778531938, 0.311460646558454), CS_TF = c(-0.132065184642606, 
                                                                                0.0367110238359321, 35.3210468915716, 0.153679818105899, 0.0332652037878963, 
                                                                                -1.86496209543666, -1.07441876044758, -0.180493420311007, 0.00866968936464811, 
                                                                                0.0552816343285741, -0.0234809689770028, -0.158685173657395, 
                                                                                0.0700545324982749, 0.56175897135826, 0.0143549285329569, -0.128676438809003, 
                                                                                0.0339896066570138, -0.0791169538548759, 0.0946317926108172, 
                                                                                -0.172322368094037, 3.50860496405027, 0.379458772083521, -0.499920059769576, 
                                                                                0.00685456174939309, 0.0287140297510314, 0.128580438921991, -0.000884474552408394, 
                                                                                -0.0484650246614444, 0.279520102626941, -0.0233536174060131, 
                                                                                0.166626389353964, -0.194576767577814, 0.0392959826687796, 0.0329681365611348, 
                                                                                -0.25676650339929, -0.103471550040564, 0.466462291090004, 0.157227389587046, 
                                                                                -0.0403901678275274, 0.0682499702171806, 0.130220474391507, -0.110553995224786, 
                                                                                0.108862414312962, -0.209016224415964, 0.0422840704708455, -0.724812540310788, 
                                                                                0.0732317891725166, -1.39006752637282, 0.840210152734346, -0.35371878446761, 
                                                                                0.152698664646824, -0.0859570353609108, -0.0604200987838616, 
                                                                                -0.240933204990605, 0.0161540243978887, 0.0374657611151743, 0.0874550801411632, 
                                                                                -0.392507595478393, -1.02392865589898, 0.0807127255980967, -0.307310315199072, 
                                                                                -0.0355848726007914, -0.347079319313953, -0.158586967635016, 
                                                                                -0.158793766539776, 0.335271838499242, 0.362318748691139, 0.70789771606091, 
                                                                                -0.330594118377962, 0.0752984671467694, -0.195902998158877, 0.0638325662398797, 
                                                                                0.0403354942131794, -0.0683323188130881, 0.287268128823113, 0.056132792250317, 
                                                                                0.0690000898535413, 0.185228611319422, -0.1999283340051, 0.209462600992504, 
                                                                                0.0759982221221363, 0.566169094370387, 0.126884703286675, -0.0346748592683052
                                )), class = "data.frame", row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 
                                                                        7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 
                                                                        20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 
                                                                        33L, 34L, 35L, 36L, 37L, 38L, 39L, 49L, 50L, 51L, 52L, 53L, 54L, 
                                                                        55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 
                                                                        68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 
                                                                        81L, 82L, 83L, 84L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L
                                ))

#fit the linear model and extract the residuals:
resid_CS <- resid(mod <- lm(Corr_data_CS_noOut$CS_ERP ~ Corr_data_CS_noOut$CS_TF, 
                            data = Corr_data_CS_noOut))

# Removing the largest Corr_data_CS_noOut$CS_TF value
want <- !(Corr_data_CS_noOut$CS_TF == max(Corr_data_CS_noOut$CS_TF))

plot(Corr_data_CS_noOut, type = "n")
points(Corr_data_CS_noOut[-want,], col = "black", pch = 21, bg = "black", cex = 0.8)
points(Corr_data_CS_noOut[want,], col = "red", pch = 21, bg = "red", cex = 0.8)
abline(mod, col = "blue", lwd = 2)

创建于2023-05-31带有reprex v2.0.2
或者,如果要删除前5%的值:

#fit the linear model and extract the residuals:
resid_CS <- resid(mod <- lm(Corr_data_CS_noOut$CS_ERP ~ Corr_data_CS_noOut$CS_TF, 
                            data = Corr_data_CS_noOut))

# Removing the largest Corr_data_CS_noOut$CS_TF value
want <- Corr_data_CS_noOut$CS_TF < quantile(Corr_data_CS_noOut$CS_TF, 0.95)

plot(Corr_data_CS_noOut, type = "n")
points(Corr_data_CS_noOut[-want,], col = "black", pch = 21, bg = "black", cex = 0.8)
points(Corr_data_CS_noOut[want,], col = "red", pch = 21, bg = "red", cex = 0.8)
abline(mod, col = "blue", lwd = 2)

然后当你重新绘制你想要保留的值时,你会得到这个:

plot(Corr_data_CS_noOut[want,], type = "n")
points(Corr_data_CS_noOut[want,], col = "red", pch = 21, bg = "red", cex = 0.8)
abline(mod, col = "blue", lwd = 2)

创建于2023-05-31带有reprex v2.0.2
我不确定这在准确表示数据方面是否可取,但这能解决您的问题吗?

相关问题