bounty还有3天到期。回答此问题可获得+100声望奖励。Tung希望引起更多的注意这个问题。
我正在寻找有效的方法来删除我的数据中的离群值。我已经尝试了几个解决方案,我发现在这里StackOverflow和其他地方,但没有一个为我工作(1993年6月和1994年8月的3个高值21637,19590和21659应该被检测到,并在本文末尾发布的样本数据中删除)。任何建议将不胜感激!
以下是我到目前为止测试的内容:
数据概述
- How to remove outliers from a dataset
3个异常值仍然存在,并且在时间序列结束时删除了许多合法的高值。
y <- dat$Value
y_filter <- y[!y %in% boxplot.stats(y)$out]
plot(y_filter)
- Calculating outliers in R
与第一种方法类似的问题
FindOutliers <- function(data) {
lowerq = quantile(data)[2]
upperq = quantile(data)[4]
iqr = upperq - lowerq #Or use IQR(data)
# we identify extreme outliers
extreme.threshold.upper = (iqr * 1.5) + upperq
extreme.threshold.lower = lowerq - (iqr * 1.5)
result <- which(data > extreme.threshold.upper | data < extreme.threshold.lower)
return(result)
}
# use the function to identify outliers
temp <- FindOutliers(y)
# remove the outliers
y1 <- y[-temp]
# show the data with the outliers removed
y1
#> [1] 511 524 310 721 514 318 511 511 318 510 21637 0
#> [13] 319 511 305 317 0 0 0 0 0 0 0 0
#> [25] 19590 0 0 0 0 0 0 0 0 21659 0 0
#> [37] 0 0 19 7 0 5 9 21 50 187 291 321
#> [49] 445 462 462 695 694 693 1276 2560 5500 11663 24307 25205
#> [61] 21667 18610 16739 14294 13517 12296 11247 11209 10954 11228 11387 11190
#> [73] 11193 11562 12279 11994 12073 11965 11386 10512 10677 10115 10118 9530
#> [85] 9016 9086 8167 8171 7561 7268 7359 7753 7168 7206 6926 6646
#> [97] 6674 6100 6177 5975 6033 5767 5497 5862 5594 5319
plot(y1)
- Checking outliers with performance
与其他两种方法相同的问题
library(performance)
outliers_list <- check_outliers(y) # Find outliers
outliers_list # Show the row index of the outliers
#> 9 outliers detected: cases 60, 61, 62, 63, 64, 65, 66, 67, 68.
#> - Based on the following method and threshold: zscore_robust (3.291).
#> - For variable: y.
#>
#> -----------------------------------------------------------------------------
#> Outliers per variable (zscore_robust):
#>
#> $y
#> Row Distance_Zscore_robust
#> 60 60 3.688817
#> 61 61 4.384524
#> 62 62 4.491276
#> 63 63 4.362517
#> 64 64 4.438994
#> 65 65 4.525319
#> 66 66 4.576871
#> 67 67 4.393886
#> 68 68 3.804809
str(outliers_list)
#> 'check_outliers' logi [1:116] FALSE FALSE FALSE FALSE FALSE FALSE ...
#> - attr(*, "data")='data.frame': 116 obs. of 4 variables:
#> ..$ Row : int [1:116] 1 2 3 4 5 6 7 8 9 10 ...
#> ..$ Distance_Zscore_robust: num [1:116] 0.645 0.643 0.669 0.619 0.644 ...
#> ..$ Outlier_Zscore_robust : num [1:116] 0 0 0 0 0 0 0 0 0 0 ...
#> ..$ Outlier : num [1:116] 0 0 0 0 0 0 0 0 0 0 ...
#> - attr(*, "threshold")=List of 1
#> ..$ zscore_robust: num 3.29
#> - attr(*, "method")= chr "zscore_robust"
#> - attr(*, "text_size")= num 3
#> - attr(*, "variables")= chr "y"
#> - attr(*, "raw_data")='data.frame': 116 obs. of 1 variable:
#> ..$ y: num [1:116] 511 524 310 721 514 318 511 511 318 510 ...
#> - attr(*, "outlier_var")=List of 1
#> ..$ zscore_robust:List of 1
#> .. ..$ y:'data.frame': 9 obs. of 2 variables:
#> .. .. ..$ Row : int [1:9] 60 61 62 63 64 65 66 67 68
#> .. .. ..$ Distance_Zscore_robust: num [1:9] 3.69 4.38 4.49 4.36 4.44 ...
#> - attr(*, "outlier_count")=List of 2
#> ..$ zscore_robust:'data.frame': 0 obs. of 2 variables:
#> .. ..$ Row : int(0)
#> .. ..$ n_Zscore_robust: int(0)
#> ..$ all :'data.frame': 0 obs. of 2 variables:
#> .. ..$ Row : num(0)
#> .. ..$ n_Zscore_robust: num(0)
y[!outliers_list]
#> [1] 511 524 310 721 514 318 511 511 318 510 21637 0
#> [13] 319 511 305 317 0 0 0 0 0 0 0 0
#> [25] 19590 0 0 0 0 0 0 0 0 21659 0 0
#> [37] 0 0 19 7 0 5 9 21 50 187 291 321
#> [49] 445 462 462 695 694 693 1276 2560 5500 11663 24307 30864
#> [61] 25205 21667 18610 16739 14294 13517 12296 11247 11209 10954 11228 11387
#> [73] 11190 11193 11562 12279 11994 12073 11965 11386 10512 10677 10115 10118
#> [85] 9530 9016 9086 8167 8171 7561 7268 7359 7753 7168 7206 6926
#> [97] 6646 6674 6100 6177 5975 6033 5767 5497 5862 5594 5319
par(mfrow = c(2,1), oma = c(2,2,0,0) + 0.1, mar = c(0,0,2,1) + 0.2)
plot(y)
plot(y[!outliers_list])
测试数据
library(tibble)
dat <- tibble::tribble(
~DateTime, ~Value,
"1993-06-06 11:00:00", NA,
"1993-06-06 12:00:00", 524,
"1993-06-06 13:00:00", 310,
"1993-06-06 14:00:00", 721,
"1993-06-06 15:00:00", 514,
"1993-06-06 16:00:00", 318,
"1993-06-06 17:00:00", 511,
"1993-06-06 18:00:00", 511,
"1993-06-06 19:00:00", 318,
"1993-06-06 20:00:00", 510,
"1993-06-06 21:00:00", 21637,
"1993-06-06 22:00:00", NA,
"1993-06-06 23:00:00", 319,
"1993-06-07 24:00:00", 511,
"1993-06-07 01:00:00", 305,
"1993-06-07 02:00:00", 317,
"1994-08-25 06:00:00", 0,
"1994-08-25 07:00:00", 0,
"1994-08-25 08:00:00", 0,
"1994-08-25 09:00:00", NA,
"1994-08-25 10:00:00", 0,
"1994-08-25 11:00:00", 0,
"1994-08-25 12:00:00", 0,
"1994-08-25 13:00:00", 0,
"1994-08-25 14:00:00", 19590,
"1994-08-26 06:00:00", 0,
"1994-08-26 07:00:00", 0,
"1994-08-26 08:00:00", 0,
"1994-08-26 09:00:00", NA,
"1994-08-26 10:00:00", NA,
"1994-08-26 11:00:00", 0,
"1994-08-26 12:00:00", 0,
"1994-08-26 13:00:00", 0,
"1994-08-26 14:00:00", 21659,
"1994-08-26 15:00:00", 0,
"1994-08-26 16:00:00", 0,
"1994-08-26 17:00:00", 0,
"1994-08-26 20:00:00", 0,
"1994-08-26 21:00:00", 19,
"1994-08-26 22:00:00", 7,
"1995-03-09 18:00:00", NA,
"1995-03-09 19:00:00", 5,
"1995-03-09 20:00:00", 9,
"1995-03-09 21:00:00", 21,
"1995-03-09 22:00:00", 50,
"1995-03-09 23:00:00", 187,
"1995-03-10 24:00:00", 291,
"1995-03-10 01:00:00", 321,
"1995-03-10 02:00:00", 445,
"1995-03-10 03:00:00", 462,
"1995-03-10 04:00:00", 462,
"1995-03-10 05:00:00", 695,
"1995-03-10 06:00:00", 694,
"1995-03-10 07:00:00", 693,
"1995-03-10 08:00:00", 1276,
"1995-03-10 09:00:00", 2560,
"1995-03-10 10:00:00", 5500,
"1995-03-10 11:00:00", 11663,
"1995-03-10 12:00:00", 24307,
"1995-03-10 15:00:00", 36154,
"1995-03-10 17:00:00", 41876,
"1995-03-10 18:00:00", 42754,
"1995-03-10 19:00:00", 41695,
"1995-03-10 20:00:00", 42324,
"1995-03-10 21:00:00", 43034,
"1995-03-10 22:00:00", 43458,
"1995-03-10 23:00:00", 41953,
"1995-03-11 24:00:00", 37108,
"1995-03-11 01:00:00", 30864,
"1995-03-11 02:00:00", 25205,
"1995-03-11 03:00:00", NA,
"1995-03-11 04:00:00", 18610,
"1995-03-11 05:00:00", 16739,
"1995-03-11 06:00:00", 14294,
"1995-03-11 07:00:00", 13517,
"1995-03-11 08:00:00", 12296,
"1995-03-11 09:00:00", 11247,
"1995-03-11 10:00:00", 11209,
"1995-03-11 11:00:00", 10954,
"1995-03-11 12:00:00", 11228,
"1995-03-11 13:00:00", 11387,
"1995-03-11 14:00:00", 11190,
"1995-03-11 15:00:00", 11193,
"1995-03-11 16:00:00", 11562,
"1995-03-11 17:00:00", 12279,
"1995-03-11 18:00:00", 11994,
"1995-03-11 19:00:00", 12073,
"1995-03-11 20:00:00", 11965,
"1995-03-11 21:00:00", 11386,
"1995-03-11 22:00:00", NA,
"1995-03-11 23:00:00", 10677,
"1995-03-12 24:00:00", 10115,
"1995-03-12 01:00:00", 10118,
"1995-03-12 02:00:00", 9530,
"1995-03-12 03:00:00", 9016,
"1995-03-12 04:00:00", 9086,
"1995-03-12 05:00:00", 8167,
"1995-03-12 06:00:00", 8171,
"1995-03-12 07:00:00", 7561,
"1995-03-12 08:00:00", 7268,
"1995-03-12 09:00:00", 7359,
"1995-03-12 10:00:00", NA,
"1995-03-12 11:00:00", 7168,
"1995-03-12 12:00:00", 7206,
"1995-03-12 13:00:00", 6926,
"1995-03-12 14:00:00", 6646,
"1995-03-12 15:00:00", 6674,
"1995-03-12 16:00:00", 6100,
"1995-03-12 17:00:00", 6177,
"1995-03-12 18:00:00", 5975,
"1995-03-12 19:00:00", 6033,
"1995-03-12 20:00:00", 5767,
"1995-03-12 21:00:00", 5497,
"1995-03-12 22:00:00", 5862,
"1995-03-12 23:00:00", 5594,
"1995-03-13 24:00:00", NA
)
创建于2023-10-05使用reprex v2.0.2
4条答案
按热度按时间szqfcxe21#
在你的时间序列中,检测局部离群值而不是全局离群值可能是一个更好的主意。要删除这些离群值,您可以使用时间窗口,例如(n = 10):
but5z9lq2#
您可以使用{forecast}包中的函数来识别具有和不具有季节性的时间序列异常值。它使用弗里德曼的超级平滑。它还可以插入缺失值。阅读更多Hyndman(2021)“Detecting time series outliers”https://robjhyndman.com/hyndsight/tsoutliers/。
q9yhzks03#
您可以使用Hampel filter,这是一种常见的信号处理滤波器来去除离群值。
此过滤器在
pracma
包see中实现。使用所提供的示例:
注意,结果取决于观察窗口长度,在本例中,
k=6
提供了预期结果。性能对比:
lvmkulzt4#
我认为@Waldi(我个人最喜欢的)、@Shadow JA和@DrJerryTAO的现有解决方案都足以检测出异常值。
理念:基于MA的方法
实际上存在许多方法来检测异常值,并且移动平均(MA)方法也是一种可能的解决方案,其中来自先前
n
连续样本的局部均值和标准差将用于检查当前样本。1.下面的代码示例应用了“简单MA”,比如说,等权重。这对于具有缓慢和小变化的时间序列来说很好,但可能对突然和大变化React过度,从而标记为“离群值”。
代码示例
下面是一个基本的R实现(应该有成熟的软件包可以更漂亮、更有效地实现它)
输出
显示了如下内容