R语言 按组对某些行求和,但将结果写入每组的所有行

blpfk2vs  于 2023-07-31  发布在  其他
关注(0)|答案(4)|浏览(85)

假设我有这个数据.table:

library(data.table)

dt <- data.table("value" = c(10,15,30,55,20,10,40,70,80,20,40,140),
                 "id" = c(1,1,1,1,2,2,2,2,3,3,3,3),
                 "variable" = c("V1","V1","V1","Total","V1","V1","V1","Total",
                                "V1","V1","V1","Total"))
value id variable
 1:    10  1       V1
 2:    15  1       V1
 3:    30  1       V1
 4:    55  1    Total
 5:    20  2       V1
 6:    10  2       V1
 7:    40  2       V1
 8:    70  2    Total
 9:    80  3       V1
10:    20  3       V1
11:    40  3       V1
12:   140  3    Total

如何计算valueid的和,但仅当variable=="V1"同时在variable为Total的行中显示该和时。
我知道,我可以子集行,然后计算总和,并创建一个新的列,但然后总和不存在于行中的变量是“总”。

dt[variable != "Total", sum_value := sum(value), by = id]
value id variable sum_value
 1:    10  1       V1        55
 2:    15  1       V1        55
 3:    30  1       V1        55
 4:    55  1    Total        NA
 5:    20  2       V1        70
 6:    10  2       V1        70
 7:    40  2       V1        70
 8:    70  2    Total        NA
 9:    80  3       V1       140
10:    20  3       V1       140
11:    40  3       V1       140
12:   140  3    Total        NA

我想得到的是:

value id variable sum_value
 1:    10  1       V1        55
 2:    15  1       V1        55
 3:    30  1       V1        55
 4:    55  1    Total        55
 5:    20  2       V1        70
 6:    10  2       V1        70
 7:    40  2       V1        70
 8:    70  2    Total        70
 9:    80  3       V1       140
10:    20  3       V1       140
11:    40  3       V1       140
12:   140  3    Total       140

qq24tv8q

qq24tv8q1#

dt[, sum_value := sum(value[variable != "Total"]), by = id]
# or if data is ordered:
dt[, sum_value := sum(first(value, -1)), by = id]

#     value    id variable sum_value
#     <num> <num>   <char>     <num>
#  1:    10     1       V1        55
#  2:    15     1       V1        55
#  3:    30     1       V1        55
#  4:    55     1    Total        55
#  5:    20     2       V1        70
#  6:    10     2       V1        70
#  7:    40     2       V1        70
#  8:    70     2    Total        70
#  9:    80     3       V1       140
# 10:    20     3       V1       140
# 11:    40     3       V1       140
# 12:   140     3    Total       140

字符串

7cwmlq89

7cwmlq892#

您可以按如下方式使用fill

library(dplyr)

dt[variable != "Total", sum_value := sum(value), by = id] %>% fill(sum_value)

    value id variable sum_value
 1:    10  1       V1        55
 2:    15  1       V1        55
 3:    30  1       V1        55
 4:    55  1    Total        55
 5:    20  2       V1        70
 6:    10  2       V1        70
 7:    40  2       V1        70
 8:    70  2    Total        70
 9:    80  3       V1       140
10:    20  3       V1       140
11:    40  3       V1       140
12:   140  3    Total       140

字符串

fbcarpbf

fbcarpbf3#

我会这样做作为一个自我结合。也就是说,计算你想要的东西,然后把它加回原来的东西。

setkey(dt, id)
dt <- dt[dt[variable!='Total',.(sum_value=sum(value)),by='id']]

字符串
或者,您可以使用ifelsevalue设置为0,当variable=='Total'像这样

dt[,sum_value:=sum(ifelse(variable=='Total', 0, value)),by='id']


不管怎样

value id variable sum_value
1:    10  1       V1        55
2:    15  1       V1        55
3:    30  1       V1        55
4:    55  1    Total        55
5:    20  2       V1        70
6:    10  2       V1        70
7:    40  2       V1        70
8:    70  2    Total        70
9:    80  3       V1       140
10:    20  3       V1       140
11:    40  3       V1       140
12:   140  3    Total       140

cigdeys3

cigdeys34#

下面是一个dplyr和一个base r选项:

#dplyr
df |>
  dplyr::group_by(id) |>
  dplyr::mutate(sum_value = value[variable == "Total"])
#> # A tibble: 12 x 4
#> # Groups:   id [3]
#>    value    id variable sum_value
#>    <dbl> <dbl> <chr>        <dbl>
#>  1    10     1 V1              55
#>  2    15     1 V1              55
#>  3    30     1 V1              55
#>  4    55     1 Total           55
#>  5    20     2 V1              70
#>  6    10     2 V1              70
#>  7    40     2 V1              70
#>  8    70     2 Total           70
#>  9    80     3 V1             140
#> 10    20     3 V1             140
#> 11    40     3 V1             140
#> 12   140     3 Total          140

#base
split(df, df$id) |>
  Map(f = \(x) transform(x, sum_value = value[variable == "Total"])) |>
  Reduce(f = rbind.data.frame)
#>    value id variable sum_value
#> 1     10  1       V1        55
#> 2     15  1       V1        55
#> 3     30  1       V1        55
#> 4     55  1    Total        55
#> 5     20  2       V1        70
#> 6     10  2       V1        70
#> 7     40  2       V1        70
#> 8     70  2    Total        70
#> 9     80  3       V1       140
#> 10    20  3       V1       140
#> 11    40  3       V1       140
#> 12   140  3    Total       140

字符串
或更简单的基r:

#base
by(df, df$id, \(x) transform(x, sum_value = value[variable == "Total"]))|>
  Reduce(f = rbind.data.frame)
#>    value id variable sum_value
#> 1     10  1       V1        55
#> 2     15  1       V1        55
#> 3     30  1       V1        55
#> 4     55  1    Total        55
#> 5     20  2       V1        70
#> 6     10  2       V1        70
#> 7     40  2       V1        70
#> 8     70  2    Total        70
#> 9     80  3       V1       140
#> 10    20  3       V1       140
#> 11    40  3       V1       140
#> 12   140  3    Total       140


或者一个更简单的dplyr解决方案:

dplyr::mutate(df, sum_value  = max(value), .by = id)
#>    value id variable sum_value
#> 1     10  1       V1        55
#> 2     15  1       V1        55
#> 3     30  1       V1        55
#> 4     55  1    Total        55
#> 5     20  2       V1        70
#> 6     10  2       V1        70
#> 7     40  2       V1        70
#> 8     70  2    Total        70
#> 9     80  3       V1       140
#> 10    20  3       V1       140
#> 11    40  3       V1       140
#> 12   140  3    Total       140

相关问题