在R中,如何为每5分钟大于第95百分位数的数据划分子集

qjp7pelc  于 2023-04-27  发布在  其他
关注(0)|答案(1)|浏览(77)

我需要对我的数据框取子集:按每5分钟分组,保留各列中大于第95百分位数的值。
我的 Dataframe 是t:

dput(t)

structure(list(Date = structure(c(1468814400, 1468814700, 1468815000, 
1468815300, 1468815600, 1468815900, 1468816200, 1468816800, 1468817400, 
1468817700, 1468818000, 1468818300, 1468818600, 1468818900, 1468819200, 
1468819500, 1468819800, 1468820100, 1468820400, 1468820700, 1468821000, 
1468821300, 1468821600, 1468821900, 1468822200, 1468822500, 1468822800, 
1468823100, 1468823400, 1468823700), class = c("POSIXct", "POSIXt"
), tzone = ""), CPU = c(6.09, 4.96, 8.61, 1.07, 5.13, 9.7, 1.97, 
4.39, 3.25, 13.5, 1.86, 3.79, 4, 2.68, 8.71, 1.99, 14, 2.96, 
2.75, 15.38, 7.97, 4.41, 5.08, 16.26, 12.19, 7.05, 6.97, 17.78, 
17.57, 7.23), Trans_A = c(35, 32, 18, 23, 13, 51, 12, 15, 22, 
228, 219, 71, 277, 434, 414, 154, 273, 284, 331, 170, 320, 287, 
277, 157, 313, 316, 629, 448, 594, 478), Trans_B = c(53, 11, 
56, 10, 11, 15, 7, 91, 8, 10, 197, 98, 101, 354, 209, 449, 429, 
788, 391, 312, 131, 212, 229, 189, 529, 389, 438, 662, 855, 559
), Heap_A = c(4.58, 7.81, 7.81, 3.3, 3.95, 9.75, 3.01, 10.07, 
10.4, 10.64, 3.2, 9.85, 10.56, 7.51, 4.3, 7.31, 10.18, 3.54, 
10.64, 9.16, 7.49, 6.61, 10.72, 6.48, 10.48, 9.97, 11.22, 10.8, 
10.73, 11.94), Heap_B = c(53, 11, 56, 10, 11, 15, 7, 91, 8, 10, 
197, 98, 101, 354, 209, 449, 429, 788, 391, 312, 131, 212, 229, 
189, 529, 389, 438, 662, 855, 559)), .Names = c("Date", "CPU", 
"Trans_A", "Trans_B", "Heap_A", "Heap_B"), row.names = c(NA, 
-30L), class = "data.frame")

我可以像这样得到最大值,但我需要每5分钟大于第95百分位的值。我该怎么做?

library(dplyr)
# Bin each timestamp into its 5-minute interval, then take the per-interval
# maximum of every metric column.
ff <- t %>%
  mutate(
    Date = cut(
      as.POSIXct(Date, format = '%Y-%m-%d %H:%M:%S'),
      '5 min'
    )
  ) %>%
  group_by(Date) %>%
  dplyr::summarise(
    mCpu = max(CPU),
    mTrans_a = max(Trans_A),
    mTrans_b = max(Trans_B),
    mHeap_a = max(Heap_A),
    mHeap_b = max(Heap_B)
  )
lyr7nygr

lyr7nygr1#

你不能用summarise来取子集,但可以用filter。分组之后,quantile会针对Date的每个取值(即每个5分钟区间)分别计算。
例如,下面的代码会返回CPU值高于第95百分位数的所有行:

library(dplyr)
# Bin timestamps into 5-minute intervals; within each interval keep only the
# rows whose CPU is strictly above that interval's 95th percentile.
# The result is still grouped by Date (no ungroup() is applied).
ff <- t %>%
  mutate(
    Date = cut(as.POSIXct(Date, format = '%Y-%m-%d %H:%M:%S'), '5 min')
  ) %>%
  group_by(Date) %>%
  filter(CPU > quantile(CPU, probs = 0.95))

如果只想保留所有列的值都大于第95百分位数的行,请使用:

# Keep a row only when every metric is strictly above its 95th percentile;
# comma-separated conditions in filter() are combined with AND. When used
# after group_by(Date), each quantile is computed per group.
# Column names are case-sensitive and must match the data frame
# (Trans_A / Trans_B, not Trans_a / Trans_b).
filter(
  CPU > quantile(CPU, 0.95),
  Trans_A > quantile(Trans_A, 0.95),
  Trans_B > quantile(Trans_B, 0.95),
  Heap_A > quantile(Heap_A, 0.95),
  Heap_B > quantile(Heap_B, 0.95)
)

或者用更简洁的写法:

# if_all() needs a predicate that yields a logical per row, so compare each
# column with its own 95th percentile instead of returning the quantile value.
filter(if_all(CPU:Heap_B, \(v) v > quantile(v, 0.95)))

相关问题