R语言 遇到0时重置的累积和

v64noz0r  于 2023-04-03  发布在  其他
关注(0)|答案(5)|浏览(186)

我想做一个字段的累计和,但每当遇到0时重置聚合值。
下面是我想要的一个例子:

data.frame(campaign = letters[1:4] , 
       date=c("jan","feb","march","april"),
       b = c(1,0,1,1) ,
       whatiwant = c(1,0,1,2)
       )

 campaign  date b whatiwant
1        a   jan 1         1
2        b   feb 0         0
3        c march 1         1
4        d april 1         2
hts6caw3

hts6caw31#

另一个基地

with(df, ave(b, cumsum(b == 0), FUN = cumsum))
## [1] 1 0 1 2

这将根据0外观将列b划分为组,并计算每个组的b的累积和
另一种解决方案使用最新的data.table版本(v 1.9.6+)

library(data.table) ## v 1.9.6+
setDT(df)[, whatiwant := cumsum(b), by = rleid(b == 0L)]
#    campaign  date b whatiwant
# 1:        a   jan 1         1
# 2:        b   feb 0         0
# 3:        c march 1         1
# 4:        d april 1         2

根据评论的一些基准

set.seed(123)
x <- sample(0:1e3, 1e7, replace = TRUE)
system.time(res1 <- ave(x, cumsum(x == 0), FUN = cumsum))
# user  system elapsed 
# 1.54    0.24    1.81 
system.time(res2 <- Reduce(function(x, y) if (y == 0) 0 else x+y, x, accumulate=TRUE))
# user  system elapsed 
# 33.94    0.39   34.85 
library(data.table)
system.time(res3 <- data.table(x)[, whatiwant := cumsum(x), by = rleid(x == 0L)])
# user  system elapsed 
# 0.20    0.00    0.21 

identical(res1, as.integer(res2))
## [1] TRUE
identical(res1, res3$whatiwant)
## [1] TRUE
9rnv2umw

9rnv2umw2#

另一个迟来的想法:

ff = function(x)
{
    cs = cumsum(x)
    cs - cummax((x == 0) * cs)
}
ff(c(0, 1, 3, 0, 0, 5, 2))
#[1] 0 1 4 0 0 5 7

并进行比较:

library(data.table)
ffdt = function(x) 
    data.table(x)[, whatiwant := cumsum(x), by = rleid(x == 0L)]$whatiwant

x = as.numeric(x) ##because 'cumsum' causes integer overflow
identical(ff(x), ffdt(x))
#[1] TRUE
microbenchmark::microbenchmark(ff(x), ffdt(x), times = 25)
#Unit: milliseconds
#    expr      min       lq   median       uq      max neval
#   ff(x) 315.8010 362.1089 372.1273 386.3892 405.5218    25
# ffdt(x) 374.6315 407.2754 417.6675 447.8305 534.8153    25
7d7tgy0s

7d7tgy0s3#

您可以将Reduce函数与一个自定义函数一起使用,该函数在遇到的新值为0时返回0,否则将新值添加到累积值中:

Reduce(function(x, y) if (y == 0) 0 else x+y, c(1, 0, 1, 1), accumulate=TRUE)
# [1] 1 0 1 2
mwyxok5s

mwyxok5s4#

hutilscpp::cumsum_reset就是为此目的而设计的。第一个参数是一个逻辑向量,指示累积和何时应该继续。第二个参数是累积和本身的输入

library(hutilscpp)
b <- c(1, 0, 1, 1)
cumsum_reset(as.logical(b), b)

在我的机器上,与上面的data.table函数相比,使用cumsum_reset的速度大约快3倍。

qrjkbowd

qrjkbowd5#

另一个变体使用rep的值在地方重置。这只使用cumsum一次,但缺点将导致大的数字,并可能导致整数溢出或不准确的数值,并将传播NA到所有以下组。

x <- cumsum(DF$b)
i <- which(DF$b == 0)
x - rep(c(0, x[i]), diff(c(1L, i, length(x)+1L)))
#[1] 1 0 1 2

另一种方法是使用Rcpp-在本例中为 integer

Rcpp::cppFunction('IntegerVector csrA(const IntegerVector x, int z=0) {
  IntegerVector out(no_init(x.size()));
  int init = z == NA_INTEGER ? 0 : z;
  int s = 0;
  for(int i = 0; i < x.size(); ++i) {
    if(x[i] == z) s = init;
    else [[likely]] s += x[i];
    out[i] = s;
  }
  return out;
}')
csrA(DF$b)
#[1] 1 0 1 2

一个也处理NA的变体可能看起来像:

Rcpp::cppFunction('IntegerVector csr(const IntegerVector x, int z=0) {
  IntegerVector out(no_init(x.size()));
  int init = z == NA_INTEGER ? 0 : z;
  LogicalVector isNA = is_na(x);
  int s = 0;
  for(int i = 0; i < x.size(); ++i) {
    if(x[i] == z) s = init;
    else [[likely]] if(isNA[i] || s == NA_INTEGER) s = NA_INTEGER;
      else [[likely]] s += x[i];
    out[i] = s;
  }
  return out;
}')

csr(c(2,4,3,0,3,5), 0)
#[1] 2 6 9 0 3 8

csr(c(2,NA,3,0,3,5), 0)
#[1]  2 NA NA  0  3  8

csr(c(2,4,3,1,3,5), 1)
#[1] 2 6 9 1 4 9

csr(c(2,4,3,NA,3,5), NA)
#[1] 2 6 9 0 3 8

数据

DF <- data.frame(campaign = letters[1:4] , 
                 date=c("jan","feb","march","april"),
                 b = c(1,0,1,1) ,
                 whatiwant = c(1,0,1,2)
                 )

基准-基于@大卫Arenburg

set.seed(123)
#Using 1e3 instead of 1e2 would lead to an integer overflow for whichRep and cummax
x <- sample(0:1e2, 1e7, TRUE)

library(data.table)

bench::mark(
ave = ave(x, cumsum(x == 0), FUN = cumsum),
data.table = data.table(x)[, whatiwant := cumsum(x), by = rleid(x == 0L)]$whatiwant,
cummax = {cs = cumsum(x)
  cs - cummax((x == 0) * cs)},
whichRep = {y <- cumsum(x)
i <- which(x == 0)
y - rep(c(0, y[i]), diff(c(1L, i, length(x)+1L)))},
RcppNA = csr(x),
RcppSimple = csrA(x)
)

结果

expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_…¹
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl> <bch:tm>
1 ave           1.06s    1.06s     0.945   751.8MB     3.78     1     4    1.06s
2 data.table 199.01ms 266.26ms     3.76    231.9MB     1.88     2     1 532.53ms
3 cummax      90.57ms  93.76ms    10.4     152.6MB     6.92     6     4  578.4ms
4 whichRep     74.5ms  77.05ms    12.9     195.6MB    11.1      7     6 541.63ms
5 RcppNA      39.55ms  40.84ms    24.2      76.3MB     5.60    13     3  536.1ms
6 RcppSimple  29.73ms  30.59ms    32.3      38.1MB     3.80    17     2  526.1ms

相关问题