R语言 去除时间序列中异常值的有效方法

c8ib6hqw  于 2023-10-13  发布在  其他
关注(0)|答案(4)|浏览(93)

bounty还有3天到期。回答此问题可获得+100声望奖励。Tung希望引起更多的注意这个问题。

我正在寻找有效的方法来删除我的数据中的离群值。我已经尝试了几个解决方案,我发现在这里StackOverflow和其他地方,但没有一个为我工作(1993年6月和1994年8月的3个高值21637,19590和21659应该被检测到,并在本文末尾发布的样本数据中删除)。任何建议将不胜感激!
以下是我到目前为止测试的内容:

数据概述

  1. How to remove outliers from a dataset
    3个异常值仍然存在,并且在时间序列结束时删除了许多合法的高值。
y <- dat$Value
y_filter <- y[!y %in% boxplot.stats(y)$out]
plot(y_filter)

  1. Calculating outliers in R
    与第一种方法类似的问题
FindOutliers <- function(data) {
  lowerq = quantile(data)[2]
  upperq = quantile(data)[4]
  iqr = upperq - lowerq #Or use IQR(data)
  # we identify extreme outliers
  extreme.threshold.upper = (iqr * 1.5) + upperq
  extreme.threshold.lower = lowerq - (iqr * 1.5)
  result <- which(data > extreme.threshold.upper | data < extreme.threshold.lower)
  
  return(result)
}

# use the function to identify outliers
temp <- FindOutliers(y)
# remove the outliers
y1 <- y[-temp]
# show the data with the outliers removed
y1
#>   [1]   511   524   310   721   514   318   511   511   318   510 21637     0
#>  [13]   319   511   305   317     0     0     0     0     0     0     0     0
#>  [25] 19590     0     0     0     0     0     0     0     0 21659     0     0
#>  [37]     0     0    19     7     0     5     9    21    50   187   291   321
#>  [49]   445   462   462   695   694   693  1276  2560  5500 11663 24307 25205
#>  [61] 21667 18610 16739 14294 13517 12296 11247 11209 10954 11228 11387 11190
#>  [73] 11193 11562 12279 11994 12073 11965 11386 10512 10677 10115 10118  9530
#>  [85]  9016  9086  8167  8171  7561  7268  7359  7753  7168  7206  6926  6646
#>  [97]  6674  6100  6177  5975  6033  5767  5497  5862  5594  5319
plot(y1)

  1. Checking outliers with performance
    与其他两种方法相同的问题
library(performance)
outliers_list <- check_outliers(y) # Find outliers
outliers_list # Show the row index of the outliers
#> 9 outliers detected: cases 60, 61, 62, 63, 64, 65, 66, 67, 68.
#> - Based on the following method and threshold: zscore_robust (3.291).
#> - For variable: y.
#> 
#> -----------------------------------------------------------------------------
#> Outliers per variable (zscore_robust): 
#> 
#> $y
#>    Row Distance_Zscore_robust
#> 60  60               3.688817
#> 61  61               4.384524
#> 62  62               4.491276
#> 63  63               4.362517
#> 64  64               4.438994
#> 65  65               4.525319
#> 66  66               4.576871
#> 67  67               4.393886
#> 68  68               3.804809
str(outliers_list)
#>  'check_outliers' logi [1:116] FALSE FALSE FALSE FALSE FALSE FALSE ...
#>  - attr(*, "data")='data.frame': 116 obs. of  4 variables:
#>   ..$ Row                   : int [1:116] 1 2 3 4 5 6 7 8 9 10 ...
#>   ..$ Distance_Zscore_robust: num [1:116] 0.645 0.643 0.669 0.619 0.644 ...
#>   ..$ Outlier_Zscore_robust : num [1:116] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ Outlier               : num [1:116] 0 0 0 0 0 0 0 0 0 0 ...
#>  - attr(*, "threshold")=List of 1
#>   ..$ zscore_robust: num 3.29
#>  - attr(*, "method")= chr "zscore_robust"
#>  - attr(*, "text_size")= num 3
#>  - attr(*, "variables")= chr "y"
#>  - attr(*, "raw_data")='data.frame': 116 obs. of  1 variable:
#>   ..$ y: num [1:116] 511 524 310 721 514 318 511 511 318 510 ...
#>  - attr(*, "outlier_var")=List of 1
#>   ..$ zscore_robust:List of 1
#>   .. ..$ y:'data.frame': 9 obs. of  2 variables:
#>   .. .. ..$ Row                   : int [1:9] 60 61 62 63 64 65 66 67 68
#>   .. .. ..$ Distance_Zscore_robust: num [1:9] 3.69 4.38 4.49 4.36 4.44 ...
#>  - attr(*, "outlier_count")=List of 2
#>   ..$ zscore_robust:'data.frame':    0 obs. of  2 variables:
#>   .. ..$ Row            : int(0) 
#>   .. ..$ n_Zscore_robust: int(0) 
#>   ..$ all          :'data.frame':    0 obs. of  2 variables:
#>   .. ..$ Row            : num(0) 
#>   .. ..$ n_Zscore_robust: num(0)

y[!outliers_list]
#>   [1]   511   524   310   721   514   318   511   511   318   510 21637     0
#>  [13]   319   511   305   317     0     0     0     0     0     0     0     0
#>  [25] 19590     0     0     0     0     0     0     0     0 21659     0     0
#>  [37]     0     0    19     7     0     5     9    21    50   187   291   321
#>  [49]   445   462   462   695   694   693  1276  2560  5500 11663 24307 30864
#>  [61] 25205 21667 18610 16739 14294 13517 12296 11247 11209 10954 11228 11387
#>  [73] 11190 11193 11562 12279 11994 12073 11965 11386 10512 10677 10115 10118
#>  [85]  9530  9016  9086  8167  8171  7561  7268  7359  7753  7168  7206  6926
#>  [97]  6646  6674  6100  6177  5975  6033  5767  5497  5862  5594  5319

par(mfrow = c(2,1), oma = c(2,2,0,0) + 0.1, mar = c(0,0,2,1) + 0.2)
plot(y)
plot(y[!outliers_list])

测试数据

library(tibble)

dat <- tibble::tribble(
  ~DateTime, ~Value,
  "1993-06-06 11:00:00",     NA,
  "1993-06-06 12:00:00",    524,
  "1993-06-06 13:00:00",    310,
  "1993-06-06 14:00:00",    721,
  "1993-06-06 15:00:00",    514,
  "1993-06-06 16:00:00",    318,
  "1993-06-06 17:00:00",    511,
  "1993-06-06 18:00:00",    511,
  "1993-06-06 19:00:00",    318,
  "1993-06-06 20:00:00",    510,
  "1993-06-06 21:00:00",  21637,
  "1993-06-06 22:00:00",     NA,
  "1993-06-06 23:00:00",    319,
  "1993-06-07 24:00:00",    511,
  "1993-06-07 01:00:00",    305,
  "1993-06-07 02:00:00",    317,
  "1994-08-25 06:00:00",      0,
  "1994-08-25 07:00:00",      0,
  "1994-08-25 08:00:00",      0,
  "1994-08-25 09:00:00",     NA,
  "1994-08-25 10:00:00",      0,
  "1994-08-25 11:00:00",      0,
  "1994-08-25 12:00:00",      0,
  "1994-08-25 13:00:00",      0,
  "1994-08-25 14:00:00",  19590,
  "1994-08-26 06:00:00",      0,
  "1994-08-26 07:00:00",      0,
  "1994-08-26 08:00:00",      0,
  "1994-08-26 09:00:00",     NA,
  "1994-08-26 10:00:00",     NA,
  "1994-08-26 11:00:00",      0,
  "1994-08-26 12:00:00",      0,
  "1994-08-26 13:00:00",      0,
  "1994-08-26 14:00:00",  21659,
  "1994-08-26 15:00:00",      0,
  "1994-08-26 16:00:00",      0,
  "1994-08-26 17:00:00",      0,
  "1994-08-26 20:00:00",      0,
  "1994-08-26 21:00:00",     19,
  "1994-08-26 22:00:00",      7,
  "1995-03-09 18:00:00",     NA,
  "1995-03-09 19:00:00",      5,
  "1995-03-09 20:00:00",      9,
  "1995-03-09 21:00:00",     21,
  "1995-03-09 22:00:00",     50,
  "1995-03-09 23:00:00",    187,
  "1995-03-10 24:00:00",    291,
  "1995-03-10 01:00:00",    321,
  "1995-03-10 02:00:00",    445,
  "1995-03-10 03:00:00",    462,
  "1995-03-10 04:00:00",    462,
  "1995-03-10 05:00:00",    695,
  "1995-03-10 06:00:00",    694,
  "1995-03-10 07:00:00",    693,
  "1995-03-10 08:00:00",   1276,
  "1995-03-10 09:00:00",   2560,
  "1995-03-10 10:00:00",   5500,
  "1995-03-10 11:00:00",  11663,
  "1995-03-10 12:00:00",  24307,
  "1995-03-10 15:00:00",  36154,
  "1995-03-10 17:00:00",  41876,
  "1995-03-10 18:00:00",  42754,
  "1995-03-10 19:00:00",  41695,
  "1995-03-10 20:00:00",  42324,
  "1995-03-10 21:00:00",  43034,
  "1995-03-10 22:00:00",  43458,
  "1995-03-10 23:00:00",  41953,
  "1995-03-11 24:00:00",  37108,
  "1995-03-11 01:00:00",  30864,
  "1995-03-11 02:00:00",  25205,
  "1995-03-11 03:00:00",     NA,
  "1995-03-11 04:00:00",  18610,
  "1995-03-11 05:00:00",  16739,
  "1995-03-11 06:00:00",  14294,
  "1995-03-11 07:00:00",  13517,
  "1995-03-11 08:00:00",  12296,
  "1995-03-11 09:00:00",  11247,
  "1995-03-11 10:00:00",  11209,
  "1995-03-11 11:00:00",  10954,
  "1995-03-11 12:00:00",  11228,
  "1995-03-11 13:00:00",  11387,
  "1995-03-11 14:00:00",  11190,
  "1995-03-11 15:00:00",  11193,
  "1995-03-11 16:00:00",  11562,
  "1995-03-11 17:00:00",  12279,
  "1995-03-11 18:00:00",  11994,
  "1995-03-11 19:00:00",  12073,
  "1995-03-11 20:00:00",  11965,
  "1995-03-11 21:00:00",  11386,
  "1995-03-11 22:00:00",     NA,
  "1995-03-11 23:00:00",  10677,
  "1995-03-12 24:00:00",  10115,
  "1995-03-12 01:00:00",  10118,
  "1995-03-12 02:00:00",   9530,
  "1995-03-12 03:00:00",   9016,
  "1995-03-12 04:00:00",   9086,
  "1995-03-12 05:00:00",   8167,
  "1995-03-12 06:00:00",   8171,
  "1995-03-12 07:00:00",   7561,
  "1995-03-12 08:00:00",   7268,
  "1995-03-12 09:00:00",   7359,
  "1995-03-12 10:00:00",     NA,
  "1995-03-12 11:00:00",   7168,
  "1995-03-12 12:00:00",   7206,
  "1995-03-12 13:00:00",   6926,
  "1995-03-12 14:00:00",   6646,
  "1995-03-12 15:00:00",   6674,
  "1995-03-12 16:00:00",   6100,
  "1995-03-12 17:00:00",   6177,
  "1995-03-12 18:00:00",   5975,
  "1995-03-12 19:00:00",   6033,
  "1995-03-12 20:00:00",   5767,
  "1995-03-12 21:00:00",   5497,
  "1995-03-12 22:00:00",   5862,
  "1995-03-12 23:00:00",   5594,
  "1995-03-13 24:00:00",     NA
)

创建于2023-10-05使用reprex v2.0.2

szqfcxe2

szqfcxe21#

在你的时间序列中,检测局部离群值而不是全局离群值可能是一个更好的主意。要删除这些离群值,您可以使用时间窗口,例如(n = 10):

library(data.table)

setDT(dat)

dat[, upper := frollapply(Value, n = 10, FUN = quantile, p = 0.75, na.rm = TRUE, align = "center")]
dat[, lower := frollapply(Value, n = 10, FUN = quantile, p = 0.25, na.rm = TRUE, align = "center")]
dat[, iqr := frollapply(Value, n = 10, FUN = IQR, na.rm = TRUE, align = "center")]
dat[Value < lower - iqr * 3 | Value > upper + iqr * 3]

              DateTime Value upper  lower    iqr
1: 1993-06-06 21:00:00 21637   511 317.25 193.75
2: 1994-08-25 14:00:00 19590     0   0.00   0.00
3: 1994-08-26 14:00:00 21659     0   0.00   0.00

dat[Value < lower - iqr * 3 | Value > upper + iqr * 3, Value := NA_integer_]

dat[, mean := frollmean(Value, n = 10, na.rm = TRUE, align = "center")]
dat[, Value := fcoalesce(Value, mean)]
dat[Value < lower - iqr * 3 | Value > upper + iqr * 3]

plot(dat$Value)

but5z9lq

but5z9lq2#

您可以使用{forecast}包中的函数来识别具有和不具有季节性的时间序列异常值。它使用弗里德曼的超级平滑。它还可以插入缺失值。阅读更多Hyndman(2021)“Detecting time series outliers”https://robjhyndman.com/hyndsight/tsoutliers/

dat %>%
  mutate(
    outlier = row_number() %in% tsoutliers(
      Value, lambda = "auto")$index, 
    replacement = tsclean(Value, lambda = "auto")
  )
q9yhzks0

q9yhzks03#

您可以使用Hampel filter,这是一种常见的信号处理滤波器来去除离群值。
此过滤器在pracmasee中实现。
使用所提供的示例:

library(pracma)

# non NA values index
ValInd <- which(!is.na(dat$Value))

# Find outliers index using Hampel filter
OutlierInd <- ValInd[pracma::hampel(dat$Value[ValInd],k=6)$ind]

dat$Value[OutlierInd]
#> [1] 21637 19590 21659

# Remove outliers
dat$Value[OutlierInd] <- NA

plot(dat$Value)

注意,结果取决于观察窗口长度,在本例中,k=6提供了预期结果。

性能对比:

Hampel <- function(){
# Keep non NA values
ValInd <- which(!is.na(dat$Value))

# Find outliers index using hampel filter
OutlierInd <- ValInd[pracma::hampel(dat$Value[ValInd],k=6)$ind]
dat$Value[OutlierInd]

# Remove outliers
dat$Value[OutlierInd] <- NA}

dt <- function() {
setDT(dat)

dat[, upper := frollapply(Value, n = 10, FUN = quantile, p = 0.75, na.rm = TRUE, align = "center")]
dat[, lower := frollapply(Value, n = 10, FUN = quantile, p = 0.25, na.rm = TRUE, align = "center")]
dat[, iqr := frollapply(Value, n = 10, FUN = IQR, na.rm = TRUE, align = "center")]
dat[Value < lower - iqr * 3 | Value > upper + iqr * 3]

dat[Value < lower - iqr * 3 | Value > upper + iqr * 3, Value := NA_integer_]

dat[, mean := frollmean(Value, n = 10, na.rm = TRUE, align = "center")]
dat[, Value := fcoalesce(Value, mean)]
dat[Value < lower - iqr * 3 | Value > upper + iqr * 3]}

microbenchmark::microbenchmark(Hampel(),dt())

#Unit: milliseconds
#     expr     min      lq      mean   median       uq      max neval
# Hampel()  3.1412  3.6044  4.178006  3.77995  4.22660   9.9679   100
#     dt() 23.3904 24.3968 28.848421 25.99170 30.91555 125.6603   100
lvmkulzt

lvmkulzt4#

我认为@Waldi(我个人最喜欢的)、@Shadow JA和@DrJerryTAO的现有解决方案都足以检测出异常值。

理念:基于MA的方法

实际上存在许多方法来检测异常值,并且移动平均(MA)方法也是一种可能的解决方案,其中来自先前n连续样本的局部均值和标准差将用于检查当前样本。

  • 跑步前的几件事 *:
  1. MA有多种选择,例如,简单MA、累积MA、加权MA、指数MA等,这可能导致不同的离群值检测标准。
    1.下面的代码示例应用了“简单MA”,比如说,等权重。这对于具有缓慢和小变化的时间序列来说很好,但可能对突然和大变化React过度,从而标记为“离群值”。

代码示例

下面是一个基本的R实现(应该有成熟的软件包可以更漂亮、更有效地实现它)

DetectOutliers <- function(v, n = 3) {
    helper <- function(x) {
        wts <- rep(1, n) / n
        m <- stats::filter(x, wts, sides = 1)
        m2 <- stats::filter(x^2, wts, sides = 1)
        sd <- sqrt(m2 - m^2)
        idx <- abs(x - c(NA, head(m, -1))) / c(NA, head(sd, -1)) >= 3
        replace(idx, is.na(idx), FALSE)
    }
    x <- na.omit(v)
    k <- !(helper(x) | rev(helper(rev(x))))
    idx <- replace(rep(FALSE, length(v)), which(!is.na(v))[!k], TRUE)
    ifelse(is.na(v), NA, idx)
}

输出

dat %>%
    dplyr::filter(!DetectOutliers(Value)) %>%
    select(Value) %>%
    pluck(1) %>%
    plot()

显示了如下内容

相关问题