R: optimizing steps to expand sequence data, intersect, and compute by group (data.table)

Asked by raogr8fs on 2023-01-15

I have a huge dataset for which I use the data.table package because of its fast computation. The data looks like this:

library(data.table)
library(dplyr)

dt <- data.table(
  gr1 = rep(LETTERS[1:2], each = 4),
  gr2 = rep(letters[3:6], each = 2),
  date1 = as.Date(c('2020-01-01', '2020-02-01', '2020-02-01', '2020-02-04', '2020-01-01', '2020-02-01', '2020-02-01', '2020-02-04')),
  date2 = as.Date(c('2020-01-05', '2020-02-05', '2020-02-02', '2020-02-07', '2020-01-05', '2020-02-05', '2020-02-02', '2020-02-07')),
  value = 1:8
)
dt

   gr1 gr2      date1      date2 value
1:   A   c 2020-01-01 2020-01-05     1
2:   A   c 2020-02-01 2020-02-05     2
3:   A   d 2020-02-01 2020-02-02     3
4:   A   d 2020-02-04 2020-02-07     4
5:   B   e 2020-01-01 2020-01-05     5
6:   B   e 2020-02-01 2020-02-05     6
7:   B   f 2020-02-01 2020-02-02     7
8:   B   f 2020-02-04 2020-02-07     8

I want to sum the value column over the dates (resulting from expanding the date-range sequences) that are shared across all gr2 groups belonging to the same gr1 (computations are independent between gr1 groups).
My approach is:

1. Create a dataset with a date column holding the expanded date ranges (date1 to date2):

dt2 <- dt[, .(gr1, gr2, date = seq(date1, date2, by = 'day'), value), by = 1:nrow(dt)]

2. Add an is_shared column, flagging whether a date exists in all gr2 groups within each gr1, using the Reduce/intersect approach found here:
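As a brief aside, Reduce(intersect, ...) folds intersect over a list of vectors, keeping only the elements common to all of them; a minimal standalone sketch:

Reduce(intersect, list(c('2020-02-01', '2020-02-02'),
                       c('2020-02-01', '2020-02-03'),
                       c('2020-02-01', '2020-02-04')))
#> [1] "2020-02-01"

Applied to the grouped data: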

dt2[, date := as.character(date)]

dt3 <- split(dt2, by = 'gr1') %>% lapply(function(x) {
  
  dates <- Reduce(intersect, x[, .(list(unique(date))), gr2]$V1)
  x[, is_shared := date %in% dates][]
  
}) %>% rbindlist()
dt3

    gr1 gr2       date value is_shared
 1:   A   c 2020-01-01     1     FALSE
 2:   A   c 2020-01-02     1     FALSE
 3:   A   c 2020-01-03     1     FALSE
 4:   A   c 2020-01-04     1     FALSE
 5:   A   c 2020-01-05     1     FALSE
 6:   A   c 2020-02-01     2      TRUE
 7:   A   c 2020-02-02     2      TRUE
 8:   A   c 2020-02-03     2     FALSE
 9:   A   c 2020-02-04     2      TRUE
10:   A   c 2020-02-05     2      TRUE
11:   A   d 2020-02-01     3      TRUE
12:   A   d 2020-02-02     3      TRUE
13:   A   d 2020-02-04     4      TRUE
14:   A   d 2020-02-05     4      TRUE
15:   A   d 2020-02-06     4     FALSE
16:   A   d 2020-02-07     4     FALSE
17:   B   e 2020-01-01     5     FALSE
18:   B   e 2020-01-02     5     FALSE
19:   B   e 2020-01-03     5     FALSE
20:   B   e 2020-01-04     5     FALSE
21:   B   e 2020-01-05     5     FALSE
22:   B   e 2020-02-01     6      TRUE
23:   B   e 2020-02-02     6      TRUE
24:   B   e 2020-02-03     6     FALSE
25:   B   e 2020-02-04     6      TRUE
26:   B   e 2020-02-05     6      TRUE
27:   B   f 2020-02-01     7      TRUE
28:   B   f 2020-02-02     7      TRUE
29:   B   f 2020-02-04     8      TRUE
30:   B   f 2020-02-05     8      TRUE
31:   B   f 2020-02-06     8     FALSE
32:   B   f 2020-02-07     8     FALSE

3. Filter the *shared dates* and aggregate by gr1:

dt4 <- dt3[is_shared == TRUE][, .(value = sum(value)), by = .(gr1, date)]
dt4

   gr1       date value
1:   A 2020-02-01     5
2:   A 2020-02-02     5
3:   A 2020-02-04     6
4:   A 2020-02-05     6
5:   B 2020-02-01    13
6:   B 2020-02-02    13
7:   B 2020-02-04    14
8:   B 2020-02-05    14

Problems:

  • The huge size of dt2
  • The split/lapply step crashes my system (15 GB RAM and 4 GB swap)

Possible optimizations:

  • Avoid the dt2 and dt3 intermediate objects created when expanding the dates from the date ranges.
  • I tried using .I to create the by-row date sequences, but I got the error 'from' must be of length 1, so I switched to 1:nrow(dt), which creates an unnecessary column named nrow (dropped in later computations).
  • Avoid converting date to character class in dt2 (needed for the %in% lookup in the lapply step). A sketch addressing the last two points follows this list.
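A minimal sketch of such an expansion (one possible shape, an assumption rather than a benchmarked solution): build everything in a single j expression, so no helper grouping column is created and date keeps class Date:

dt2b <- dt[, {
  n <- as.integer(date2 - date1) + 1L   # number of days in each range
  .(gr1   = rep(gr1, n),                # repeat group ids once per day
    gr2   = rep(gr2, n),
    value = rep(value, n),
    date  = as.Date(unlist(Map(seq, date1, date2, by = 'day')),
                    origin = '1970-01-01'))
}]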
**EDIT:** adding a case closer to the real data:
dt <- data.table(
  id1 = c(rep(1, 8), rep(2, 4)),
  id2 = rep(c(10, 20, 30), each = 4),
  id3 = rep(rep(LETTERS[1:2], each = 2), 3),
  gr  = rep(1:2, 6),
  date1 = as.Date(rep(c('2020-01-01', '2020-01-05'), 6)),
  date2 = as.Date(rep(c('2020-01-10', '2020-01-12'), 6)),
  value = 1:12
)

dt

    id1 id2 id3 gr      date1      date2 value
 1:   1  10   A  1 2020-01-01 2020-01-10     1
 2:   1  10   A  2 2020-01-05 2020-01-12     2
 3:   1  10   B  1 2020-01-01 2020-01-10     3
 4:   1  10   B  2 2020-01-05 2020-01-12     4
 5:   1  20   A  1 2020-01-01 2020-01-10     5
 6:   1  20   A  2 2020-01-05 2020-01-12     6
 7:   1  20   B  1 2020-01-01 2020-01-10     7
 8:   1  20   B  2 2020-01-05 2020-01-12     8
 9:   2  30   A  1 2020-01-01 2020-01-10     9
10:   2  30   A  2 2020-01-05 2020-01-12    10
11:   2  30   B  1 2020-01-01 2020-01-10    11
12:   2  30   B  2 2020-01-05 2020-01-12    12
**The aim is to:**

  • Create/expand a dataset with a date column holding the dates between date1 and date2
  • For each combination of id1-id2-id3, filter the *shared* dates present across all gr groups
  • Sum the corresponding value

omjgkv6w (answer 1)

Another possible solution, benchmarked against the OP's and @r2evans's solutions:

library(data.table)
library(collapse) # for the function "fndistinct"
library(dplyr)

fun <- function(d1, d2, v, g2) {
  # r2evans's solution, reproduced here for benchmarking
  tmp <- as.data.table(tidyr::unnest(
    cbind(data.table(v=v, date=Map(seq, d1, d2, by = "days")), g2=g2),
    date))
  allg3 <- unique(g2)
  tmp[, .SD[all(allg3 %in% g2),], by = date][, .(value = sum(v)), by = date]
}

The OP's solution:

f1 <- function(dt) {
  # OP solution
  dt2 <- dt[, .(gr1, gr2, date = seq(date1, date2, by = 'day'), value), by = 1:nrow(dt)]
  
  dt3 <- split(dt2, by = 'gr1') %>% lapply(function(x) {
    
    dates <- Reduce(intersect, x[, .(list(unique(date))), gr2]$V1)
    x[, is_shared := date %in% dates][]
    
  }) %>% rbindlist()
  dt3[is_shared == TRUE][, .(value = sum(value)), by = .(gr1, date)]
}

The alternative, f2:

f2 <- function(dt, idcols = grep("id", names(dt)), grcol = grep("gr", names(dt))) {
  d <- as.integer(dt$date2 - dt$date1) + 1L
  setDT(
    c(
      lapply(dt[, ..idcols], rep.int, times = d),
      list(
        date = as.Date(sequence(d, dt$date1), origin = "1970-01-01"),
        value = rep.int(dt$value, d),
        # the count of unique gr by ids
        n = rep.int(fndistinct(dt[[grcol]], dt[, ..idcols], 1), d)
      )
    )
  )[
    # aggregate value by id and date
    # keep only if the count is equal to the number of unique gr
    , .(value = if (n[1] == .N) sum(value) else value[0]),
    c(names(dt)[idcols], "date")
  ]
}
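For reference, collapse::fndistinct(x, g, TRA) counts the distinct values of x per group g; the third argument 1 selects the "fill" transformation, which broadcasts each group's count back to its rows. A minimal illustration on toy vectors (not the answer's data):

library(collapse)
x <- c(1, 2, 1, 1, 1)
g <- c('a', 'a', 'b', 'b', 'b')
fndistinct(x, g)           # per-group distinct counts: a = 2, b = 1
fndistinct(x, g, TRA = 1)  # "fill": counts broadcast to rows: 2 2 1 1 1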

Benchmark:

microbenchmark::microbenchmark(f1 = f1(dt),
                               r2evans = dt[, fun(date1, date2, value, gr2), by = gr1],
                               f2 = f2(dt, 1L, 2L),
                               check = "identical")
#> Unit: microseconds
#>     expr     min       lq      mean   median       uq     max neval
#>       f1  5042.1  5390.60  6225.412  6182.55  6763.70  9123.5   100
#>  r2evans 11061.0 11836.35 12555.882 12145.10 12865.85 19127.9   100
#>       f2   694.9   835.70   885.403   879.80   932.80  1127.3   100

Data:

dt <- data.table(
  gr1 = rep(LETTERS[1:2], each = 4),
  gr2 = rep(letters[3:6], each = 2),
  date1 = as.Date(c('2020-01-01', '2020-02-01', '2020-02-01', '2020-02-04', '2020-01-01', '2020-02-01', '2020-02-01', '2020-02-04')),
  date2 = as.Date(c('2020-01-05', '2020-02-05', '2020-02-02', '2020-02-07', '2020-01-05', '2020-02-05', '2020-02-02', '2020-02-07')),
  value = 1:8
)

For the "real case" data:

f2(dt)
#>     id1 id2 id3       date value
#>  1:   1  10   A 2020-01-05     3
#>  2:   1  10   A 2020-01-06     3
#>  3:   1  10   A 2020-01-07     3
#>  4:   1  10   A 2020-01-08     3
#>  5:   1  10   A 2020-01-09     3
#>  6:   1  10   A 2020-01-10     3
#>  7:   1  10   B 2020-01-05     7
#>  8:   1  10   B 2020-01-06     7
#>  9:   1  10   B 2020-01-07     7
#> 10:   1  10   B 2020-01-08     7
#> 11:   1  10   B 2020-01-09     7
#> 12:   1  10   B 2020-01-10     7
#> 13:   1  20   A 2020-01-05    11
#> 14:   1  20   A 2020-01-06    11
#> 15:   1  20   A 2020-01-07    11
#> 16:   1  20   A 2020-01-08    11
#> 17:   1  20   A 2020-01-09    11
#> 18:   1  20   A 2020-01-10    11
#> 19:   1  20   B 2020-01-05    15
#> 20:   1  20   B 2020-01-06    15
#> 21:   1  20   B 2020-01-07    15
#> 22:   1  20   B 2020-01-08    15
#> 23:   1  20   B 2020-01-09    15
#> 24:   1  20   B 2020-01-10    15
#> 25:   2  30   A 2020-01-05    19
#> 26:   2  30   A 2020-01-06    19
#> 27:   2  30   A 2020-01-07    19
#> 28:   2  30   A 2020-01-08    19
#> 29:   2  30   A 2020-01-09    19
#> 30:   2  30   A 2020-01-10    19
#> 31:   2  30   B 2020-01-05    23
#> 32:   2  30   B 2020-01-06    23
#> 33:   2  30   B 2020-01-07    23
#> 34:   2  30   B 2020-01-08    23
#> 35:   2  30   B 2020-01-09    23
#> 36:   2  30   B 2020-01-10    23

bxjv4tth (answer 2)

Try this:

fun <- function(d1, d2, v, g2) {
  tmp <- as.data.table(tidyr::unnest(
    cbind(data.table(v=v, d=Map(seq, d1, d2, by = "days")), g2=g2),
    d))
  allg3 <- unique(g2)
  tmp[, .SD[all(allg3 %in% g2),], by = d][, .(value = sum(v)), by = d]
}

dt[, fun(date1, date2, value, gr2), by = gr1]
#       gr1          d value
#    <char>     <Date> <int>
# 1:      A 2020-02-01     5
# 2:      A 2020-02-02     5
# 3:      A 2020-02-04     6
# 4:      A 2020-02-05     6
# 5:      B 2020-02-01    13
# 6:      B 2020-02-02    13
# 7:      B 2020-02-04    14
# 8:      B 2020-02-05    14

There is not (yet) a data.table-internal unnest function (see data.table#2146 and data.table#3672); the discussion in those issues suggests tidyr::unnest is efficient, enough at this point to keep from jumping into writing one.
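As a small illustration of the list column that Map(seq, ...) builds and tidyr::unnest then flattens (a toy sketch, not part of the answer itself):

x <- data.table(v = 1:2,
                d = Map(seq,
                        as.Date(c('2020-01-01', '2020-01-04')),
                        as.Date(c('2020-01-02', '2020-01-06')),
                        by = 'days'))
x                    # d is a list column of Date sequences
tidyr::unnest(x, d)  # one row per (v, date) pair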


sbdsn5lh (answer 3)

The approach below uses a data.table non-equi join and chaining to avoid materializing any intermediate results. I'm not sure whether it could still run into memory issues on the full-size dataset, but I believe it should be performant.

## Generate an index of all dates included in ranges
Index <- data.table(date = seq.Date(from = dt[,min(date1)],
                                    to = dt[,max(date2)],
                                    by = 1))

## Create an extra column since data.table dt[dt2, ...] style
## joins modify the original key columns
Index[, dateKey := date]    

## Set keys ahead of time
setkey(dt,date1,date2)
setkey(Index,dateKey)

## Perform a non-equijoin to expand to the equivalent of dt2
Index[dt, on = .(dateKey >= date1, dateKey <= date2)][
  ## Determine whether dates are shared
  , is_shared := .N > 1, keyby = .(gr1,date)][
    ## Subset only shared dates and sum values by gr1 and date
    is_shared == TRUE, .(value = sum(value)), keyby = .(gr1,date)]
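On the question's small example this chain reproduces dt4; a quick check (assuming dt and dt4 from the question are still in scope) might be:

res <- Index[dt, on = .(dateKey >= date1, dateKey <= date2)][
  , is_shared := .N > 1, keyby = .(gr1, date)][
    is_shared == TRUE, .(value = sum(value)), keyby = .(gr1, date)]
## dt4$date is character in the question, so align classes before comparing
all.equal(res[, .(gr1, date = as.character(date), value)], dt4,
          check.attributes = FALSE)  # should be TRUE on this example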

As for benchmarking, the results shown so far by @jblood94 *(measured in microseconds)* are generally unlikely to be representative of performance on much larger datasets. The dataset you provided is great for demonstrating the required inputs and outputs in an easy-to-understand way, but a much larger generated synthetic dataset, with group cardinalities, date-range lengths, and an overall time span similar to the actual data, is necessary to truly compare performance.
The code below demonstrates one way to generate an arbitrarily large dataset and compares the methods proposed so far, running each only once. Both the data.table approach outlined in this answer and @jblood94's f2() solution scale well. However, I noticed that the different methods return different results on this synthetic data, and I'm not sure which one is "correct". *I'm fairly certain I'm running into artifacts of comparing methods written against different edited versions of the question and answers.*
Additionally, a second variant shows how converting the date columns to data.table's IDate class slightly improves performance.

library(data.table)
library(collapse) # for the function "fndistinct"
library(dplyr)

## Number of rows in synthetic data
N = 1e5

## Larger date ranges and mean duration will impact resulting
## intermediate result sizes significantly
start <- as.Date("2020-01-01")
end <- as.Date("2022-12-31")
range <- seq.Date(from = start,
                  to = end,
                  by = 1)
meanDuration <- 60
sdDuration <- meanDuration/2

## Cardinality of groups will also impact results
gr1Set <- as.character(seq_len(100))
gr2Set <- as.character(seq_len(20))

dt <- data.table(
  gr1 = sample(gr1Set, N, replace = T),
  gr2 = sample(gr2Set, N, replace = T),
  date1 = sample(range, N, replace = T),
  date2 = as.Date(NA),
  value = sample(1:10, N, replace = T)
)[order(gr1,gr2,date1)]

dt[, date2 := date1 + ceiling(pmax(1,rnorm(N, mean = meanDuration,sd = sdDuration)))]
str(dt)
## Create copies so functions each get a "clean" set of data
## Since data.table modifies by reference
dt1 <- copy(dt)
dt2 <- copy(dt)
dt3 <- copy(dt)
dt4 <- copy(dt)
dt5 <- copy(dt)

Function definitions:

fun <- function(d1, d2, v, g2) {
  # r2evans's solution, reproduced here for benchmarking
  tmp <- as.data.table(tidyr::unnest(
    cbind(data.table(v=v, date=Map(seq, d1, d2, by = "days")), g2=g2),
    date))
  allg3 <- unique(g2)
  tmp[, .SD[all(allg3 %in% g2),], by = date][, .(value = sum(v)), by = date]
}

f1 <- function(dt) {
  # OP solution
  dt2 <- dt[, .(gr1, gr2, date = seq(date1, date2, by = 'day'), value), by = 1:nrow(dt)]
  
  dt3 <- split(dt2, by = 'gr1') %>% lapply(function(x) {
    
    dates <- Reduce(intersect, x[, .(list(unique(date))), gr2]$V1)
    x[, is_shared := date %in% dates][]
    
  }) %>% rbindlist()
  dt3[is_shared == TRUE][, .(value = sum(value)), by = .(gr1, date)]
}

f2 <- function(dt, idcols, grcol) {
  d <- as.integer(dt$date2 - dt$date1) + 1L
  setDT(
    c(
      lapply(dt[, ..idcols], rep.int, times = d),
      list(
        date = as.Date(sequence(d, dt$date1), origin = "1970-01-01"),
        value = rep.int(dt$value, d),
        # the count of unique gr by ids
        n = rep.int(fndistinct(dt[[grcol]], dt[, ..idcols], 1), d)
      )
    )
  )[
    # aggregate value by id and date
    # keep only if the count is equal to the number of unique gr
    , .(value = if (n[1] == .N) sum(value) else value[0]), 
    c(names(dt)[idcols], "date")
  ]
}

matt <- function(dt){
  ## Generate an index of all dates included in ranges
  Index <- data.table(date = seq.Date(from = dt[,min(date1)],
                                      to = dt[,max(date2)],
                                      by = 1))
  ## Create an extra column since data.table dt[dt2, ...] style
  ## joins modify the original key columns
  Index[, dateKey := date]
  
  ## Set keys ahead of time
  setkey(dt,date1,date2)
  setkey(Index,dateKey)
  
  ## Perform a non-equijoin to expand to the equivalent of dt2
  Index[dt, on = .(dateKey >= date1, dateKey <= date2)][
    ## Determine whether dates are shared
    , is_shared := .N > 1, keyby = .(gr1,date)][
      ## Subset only shared dates and sum values by gr1 and date
      is_shared == TRUE, .(value = sum(value)), keyby = .(gr1,date)][]
  
}

mattIDate <- function(dt){
  ## Generate an index of all dates included in ranges
  Index <- data.table(date = as.IDate(seq.Date(from = dt[,min(date1)],
                                               to = dt[,max(date2)],
                                               by = 1)))
  ## Create an extra column since data.table dt[dt2, ...] style
  ## joins modify the original key columns
  Index[, dateKey := date]
  
  ## Convert to data.table IDates for performance
  dt[,date1 := as.IDate(date1)]
  dt[,date2 := as.IDate(date2)]
  ## Set keys ahead of time
  setkey(dt,date1,date2)
  setkey(Index,dateKey)
  
  ## Perform a non-equijoin to expand to the equivalent of dt2
  Index[dt, on = .(dateKey >= date1, dateKey <= date2)][
    ## Determine whether dates are shared
    , is_shared := .N > 1, keyby = .(gr1,date)][
      ## Subset only shared dates and sum values by gr1 and date
      is_shared == TRUE, .(value = sum(value)), keyby = .(gr1,date)][]
  
}

Benchmark with 100,000 input rows:

microbenchmark::microbenchmark(f1 = f1(dt1),
                               r2evans = dt2[, fun(date1, date2, value, gr2), by = gr1],
                               f2 = f2(dt3, 1L, 2L),
                               matt = matt(dt4),
                               mattIDate = mattIDate(dt5),
                               #check = "identical",
                               times =  1)

# Unit: milliseconds
#     expr        min         lq       mean     median         uq        max neval
#        f1  8269.9858  8269.9858  8269.9858  8269.9858  8269.9858  8269.9858     1
#   r2evans 37229.5579 37229.5579 37229.5579 37229.5579 37229.5579 37229.5579     1
#        f2   821.1306   821.1306   821.1306   821.1306   821.1306   821.1306     1
#      matt  1335.4477  1335.4477  1335.4477  1335.4477  1335.4477  1335.4477     1
# mattIDate  1037.2043  1037.2043  1037.2043  1037.2043  1037.2043  1037.2043     1
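Since check = "identical" had to be disabled, one way to inspect how the methods disagree after the fact is data.table::fsetequal, which compares contents ignoring row order (using fresh copies, because several of the functions modify their input by reference):

r1 <- f1(copy(dt))
r4 <- matt(copy(dt))
fsetequal(r1, r4)  # likely FALSE on this synthetic data, per the note above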
