我有多个相关性,并使用pairs
图绘制它们。其中一些包含“离群值”,在我的情况下,我想删除数据点,并重新绘制没有它们的数据。这里有一个例子来说明我的意思:
数据如下:
structure(list(CS_ERP = c(0.592217399922478, -0.203897301263363,
0.338222381057887, 0.426894945652707, -0.30737569188464, -0.795959183126649,
1.08640821731442, 0.436824331063638, 0.177559186752863, 0.722561317859706,
0.53865872750591, 0.733560773038051, -0.132344192183052, 0.453103863990389,
-0.111034068702735, 0.650410660832976, 0.342818054696555, 1.52017381893815,
0.444799662033882, 0.0671036424041152, -0.215260420488558, -0.0270506388385066,
0.851214146724932, 0.00273742867519521, 0.127085247367676, 0.523553816764235,
0.535009107850034, -0.644372558676884, -0.432404000319374, -0.0677329504784329,
0.873902779830881, 1.03822938157827, -0.514942928309014, 1.56161080831987,
-1.88131135531881, 0.235716700658808, 0.799704960755054, 1.73208222356446,
0.247784710165136, -0.0143066950525258, -0.473170631517114, 0.302272078688994,
-0.35980392515202, 0.204326536400011, -0.490551073482312, 1.59651048122808,
0.943766780480849, -0.184987582447942, 0.17987890568414, 0.150666299610195,
0.282742988261302, 0.389264036959123, -0.080297860039224, 1.93563704248842,
-0.365884460671325, 0.959229734481468, -1.47119089191046, -0.444094080641722,
-0.636361360505438, 0.631361093009673, 2.62946532292634, 0.798510108404532,
0.718327415177233, 1.50947695158847, 0.990343190301586, 1.44815515537459,
1.44766809083971, 0.782650510573793, 1.88087173450267, 0.375954708150476,
1.17043032278192, 0.72685383252287, 1.07555437410217, 0.985200032226283,
-0.477185205402539, -1.04253562875013, -0.0190684334078904, 0.768789276763837,
-0.38064050743836, 0.405895629096464, 0.267382730292199, 1.05413737126274,
1.50350778531938, 0.311460646558454), CS_TF = c(-0.132065184642606,
0.0367110238359321, 35.3210468915716, 0.153679818105899, 0.0332652037878963,
-1.86496209543666, -1.07441876044758, -0.180493420311007, 0.00866968936464811,
0.0552816343285741, -0.0234809689770028, -0.158685173657395,
0.0700545324982749, 0.56175897135826, 0.0143549285329569, -0.128676438809003,
0.0339896066570138, -0.0791169538548759, 0.0946317926108172,
-0.172322368094037, 3.50860496405027, 0.379458772083521, -0.499920059769576,
0.00685456174939309, 0.0287140297510314, 0.128580438921991, -0.000884474552408394,
-0.0484650246614444, 0.279520102626941, -0.0233536174060131,
0.166626389353964, -0.194576767577814, 0.0392959826687796, 0.0329681365611348,
-0.25676650339929, -0.103471550040564, 0.466462291090004, 0.157227389587046,
-0.0403901678275274, 0.0682499702171806, 0.130220474391507, -0.110553995224786,
0.108862414312962, -0.209016224415964, 0.0422840704708455, -0.724812540310788,
0.0732317891725166, -1.39006752637282, 0.840210152734346, -0.35371878446761,
0.152698664646824, -0.0859570353609108, -0.0604200987838616,
-0.240933204990605, 0.0161540243978887, 0.0374657611151743, 0.0874550801411632,
-0.392507595478393, -1.02392865589898, 0.0807127255980967, -0.307310315199072,
-0.0355848726007914, -0.347079319313953, -0.158586967635016,
-0.158793766539776, 0.335271838499242, 0.362318748691139, 0.70789771606091,
-0.330594118377962, 0.0752984671467694, -0.195902998158877, 0.0638325662398797,
0.0403354942131794, -0.0683323188130881, 0.287268128823113, 0.056132792250317,
0.0690000898535413, 0.185228611319422, -0.1999283340051, 0.209462600992504,
0.0759982221221363, 0.566169094370387, 0.126884703286675, -0.0346748592683052
)), class = "data.frame", row.names = c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L,
33L, 34L, 35L, 36L, 37L, 38L, 39L, 49L, 50L, 51L, 52L, 53L, 54L,
55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L,
68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L,
81L, 82L, 83L, 84L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L
))
我查找了其他问题,并尝试了以下内容:
#fit the linear model and extract the residuals:
resid_CS <- resid(mod <- lm(Corr_data_CS_noOut$CS_ERP ~ Corr_data_CS_noOut$CS_TF,
data = Corr_data_CS_noOut))
# quantile() gives the required quantiles of the residuals.
#If retaining 90% of the data, then we want the upper and lower 0.05 quantiles:
resid.qt <- quantile(resid_CS, probs = c(0.05,0.95))
#select the observations with residuals in the middle 90%
want <- which(resid_CS >= resid.qt[1] & resid_CS <= resid.qt[2])
#visualise this, with the red points being those we will retain:
plot(Corr_data_CS_noOut, type = "n")
points(Corr_data_CS_noOut[-want,], col = "black", pch = 21, bg = "black", cex = 0.8)
points(Corr_data_CS_noOut[want,], col = "red", pch = 21, bg = "red", cex = 0.8)
abline(mod, col = "blue", lwd = 2)
以及:
#Try using the absolute residuals:
ares <- abs(resid_CS)
absres.qt <- quantile(ares, prob = c(.9))
abswant <- which(ares <= absres.qt)
## plot
plot(Corr_data_CS_noOut, type = "n")
points(Corr_data_CS_noOut[-abswant,], col = "black", pch = 21, bg = "black", cex = 0.8)
points(Corr_data_CS_noOut[abswant,], col = "red", pch = 21, bg = "red", cex = 0.8)
abline(mod, col = "blue", lwd = 2)
但是这两个都删除了y轴“两端”的点,留下了异常值:
我如何删除这个离群值,有没有办法将其应用到整个数据中(在这种情况下我有三个变量),然后用下面的方法使用pairs
绘制它?
下面是我的pairs
代码:
pairs(x = Corr_data_noppt[1:3],
diag.panel = labels,
text.panel = my.text.panel,
lower.panel = panel,
font.labels = 1,
cex.labels = 1.2,
label.pos = 0.5,
upper.panel = NULL)
1条答案
按热度按时间xfb7svmp1#
一个可能的选项是识别并移除
max(Corr_data_CS_noOut$CS_TF)
值,即创建于2023-05-31带有reprex v2.0.2
或者,如果要删除前5%的值:
然后当你重新绘制你想要保留的值时,你会得到这个:
创建于2023-05-31带有reprex v2.0.2
我不确定这在准确表示数据方面是否可取,但这能解决您的问题吗?