高效地对R中的多个列求和

igetnqfo  于 2023-03-27  发布在  其他
关注(0)|答案(8)|浏览(101)

我有以下压缩数据集:

# Toy data set: a year column plus two numeric source columns.
a <- data.frame(year = 2000:2005, Col1 = 1:6, Col2 = seq(2, 12, 2))

# Derive Var_k = k * Colk for each of the two source columns.
for (k in seq_len(2)) {
  a[[paste0("Var_", k)]] <- k * a[[paste0("Col", k)]]
}

我想对Var_1和Var_2两列求和,我使用的是:

# Element-wise total of the two derived columns, stored as column "sum".
a[["sum"]] <- a[["Var_1"]] + a[["Var_2"]]

实际上,我的数据集要大得多-我想从Var_1到Var_n求和(n最多可以是20)。必须有一种比以下方法更有效的方法:

a$sum<-a$Var_1 + ... + a$Var_n
a5g8bdjr

a5g8bdjr1#

这里有一个使用tidyverse的解决方案。您可以使用select()函数在mutate()中选择适当的列,将其扩展到任意多个列。

library(tidyverse)

# Example data: years 2000-2005 with two numeric source columns.
a <- data.frame(year = 2000:2005, Col1 = 1:6, Col2 = seq(2, 12, 2))

# Var_1 = 1 * Col1 and Var_2 = 2 * Col2, created in one vectorised step.
idx <- 1:2
a[paste0("Var_", idx)] <- Map(`*`, idx, a[paste0("Col", idx)])
a
#>   year Col1 Col2 Var_1 Var_2
#> 1 2000    1    2     1     4
#> 2 2001    2    4     2     8
#> 3 2002    3    6     3    12
#> 4 2003    4    8     4    16
#> 5 2004    5   10     5    20
#> 6 2005    6   12     6    24

# Tidyverse approach: select the Var_1..Var_2 range and store its row totals
# (missing values ignored) in a new column named Total.
a %>%
    mutate(Total = rowSums(select(., Var_1:Var_2), na.rm = TRUE))
#>   year Col1 Col2 Var_1 Var_2 Total
#> 1 2000    1    2     1     4     5
#> 2 2001    2    4     2     8    10
#> 3 2002    3    6     3    12    15
#> 4 2003    4    8     4    16    20
#> 5 2004    5   10     5    20    25
#> 6 2005    6   12     6    24    30

由reprex包(v0.2.1)于2019-01-01创建

disbfnqx

disbfnqx2#

您可以使用colSums(a[,c("Var_1", "Var_2")])或rowSums(a[,c("Var_1", "Var_2")])。在您的情况下,您需要后者。

0pizxfdo

0pizxfdo3#

使用dplyr可以使用

# For each row, add Col1 and Col2, ignoring missing values.
# rowwise() makes sum() operate on one row at a time instead of whole columns.
a %>%
  rowwise() %>%
  mutate(sum = sum(Col1, Col2, na.rm = TRUE))

或更有效地

# For each row, total every column whose name starts with "Col", ignoring
# missing values. c_across() is the rowwise counterpart of across(): it
# collects the selected columns of the current row into a single vector.
a %>%
  rowwise() %>%
  mutate(sum = sum(c_across(starts_with("Col")), na.rm = TRUE))
myzjeezk

myzjeezk4#

如果您正在处理非常大的数据集,rowSums可能会很慢。
另一种方法是Rfast包中的rowsums函数。这需要您在过程中将数据转换为matrix,并使用列索引而不是名称。下面是基于您的代码的示例:

## load Rfast
library(Rfast)

## build the example data frame
a <- data.frame(year = 2000:2005)
a[["Col1"]] <- 1:6
a[["Col2"]] <- seq(from = 2, to = 12, by = 2)

## add Var_1 = 1 * Col1 and Var_2 = 2 * Col2
for (idx in seq_len(2)) {
  src <- sprintf("Col%d", idx)
  dst <- sprintf("Var_%d", idx)
  a[[dst]] <- idx * a[[src]]
}

## get column indices based on exact names
## (grep() does regex partial matching, so "Var_1" would also match
## "Var_10" ... "Var_19" once the data has more Var_ columns)
col_st <- match("Var_1", colnames(a))  # index of "Var_1" col
col_en <- match("Var_2", colnames(a))  # index of "Var_2" col
stopifnot(!is.na(col_st), !is.na(col_en))  # fail early if a column is missing
cols   <- c(col_st:col_en)  # indices of all cols from "Var_1" to "Var_2"

## row-wise sum over the selected columns (columns 4 to 5 here);
## drop = FALSE keeps a data frame even if only one column is selected
a$Total <- rowsums(as.matrix(a[, cols, drop = FALSE]))
qvsjd97n

qvsjd97n5#

基准测试似乎表明普通的Reduce('+', ...)是最快的,库只是让它(至少稍微)慢一点,至少对于mtcars来说是这样,即使我把它扩展到很大。

Unit: milliseconds
         expr        min         lq       mean     median         uq        max
      rowSums   8.672061   9.014344  13.708022   9.602312  10.672726  148.47183
       Reduce   2.994240   3.157500   6.331503   3.223612   3.616555   99.49181
        apply 524.488376 651.549401 771.095002 743.286441 857.993418 1235.53153
        Rfast   5.649006   5.901787  11.110896   6.387990   9.727408   66.03151
   DT_rowSums   9.209539   9.566574  20.955033  10.131163  12.967030  294.32911
    DT_Reduce   3.590719   3.774761  10.595256   3.924592   4.259343  340.52855
 tidy_rowSums  15.532917  15.997649  33.736883  17.316108  27.072343  343.21254
  tidy_Reduce   8.627810   8.960008  12.271105   9.603124  11.089334   79.98853

基准测试代码:

library('data.table')
library('tidyverse')
library('Rfast')
# Benchmark fixtures: mtcars stacked 10000 times, kept as
#   DFcars  - plain data.frame for the base R / Rfast cases
#   DT_cars - data.table copy (converted in place by setDT) for the DT_ cases
#   DFcars2 - separate data.frame copy for the tidyverse cases
DFcars <- data.table::copy(mtcars)
DFcars <- do.call("rbind", replicate(10000, DFcars, simplify = FALSE))
DT_cars <- data.table::copy(DFcars)
DFcars2 <- data.table::copy(DFcars)
setDT(DT_cars)

# Columns to be summed row-wise in every benchmark case.
colnms <- c("mpg", "cyl", "disp", "hp", "drat")

# Each case writes the row totals to new_col and returns them as a plain
# numeric vector, so that check = 'equivalent' can confirm all cases agree.
microbenchmark::microbenchmark(
    rowSums =
        {
            DFcars$new_col <- rowSums(DFcars[, colnms])
            (as.numeric(DFcars$new_col))
        },
    Reduce =
        {
            DFcars$new_col <- Reduce('+', DFcars[, colnms])
            (as.numeric(DFcars$new_col))
        },
    apply =
        {
            # select by name like the other cases (same five columns as 1:5)
            DFcars$new_col <- apply(DFcars[, colnms], 1, sum)
            (as.numeric(DFcars$new_col))
        },
    Rfast =
        {
            DFcars$new_col <- rowsums(as.matrix(DFcars[, colnms]))
            (as.numeric(DFcars$new_col))
        },
    DT_rowSums =
        {
            DT_cars[, new_col := rowSums(.SD), .SDcols = colnms]
            (as.numeric(DT_cars$new_col))
        },
    DT_Reduce =
        {
            DT_cars[, new_col := Reduce('+', .SD), .SDcols = colnms]
            (as.numeric(DT_cars$new_col))
        },
    tidy_rowSums =
        {
            # all_of() marks colnms as an external character vector of names;
            # passing it bare to select() is deprecated by tidyselect
            DFcars2 <- DFcars2 %>%
                mutate(new_col = select(., all_of(colnms)) %>% rowSums())
            (as.numeric(DFcars2$new_col))
        },
    tidy_Reduce =
        {
            DFcars2 <- DFcars2 %>%
                mutate(new_col = select(., all_of(colnms)) %>% Reduce('+', .))
            (as.numeric(DFcars2$new_col))
        },
    check = 'equivalent'
)
xjreopfe

xjreopfe6#

您可以使用以下命令:

library(dplyr)
# Row-wise total of every column whose name starts with "Var_".
# select() already returns the wanted columns as a data frame, so it is passed
# straight to rowSums(); using that data frame as an index into a[, ...] fails.
a$Sum <- rowSums(select(a, starts_with("Var_")))
6tqwzwtp

6tqwzwtp7#

在Base R中:

你可以使用sapply

# Group columns by their name with any trailing digits removed
# (e.g. "Var_1" and "Var_10" both belong to group "Var_"; "year" stays "year"),
# then compute the row sums of each group. The result is a numeric matrix with
# one column per group.
col_groups <- sub("[0-9]+$", "", colnames(a))
vapply(
  unique(col_groups),
  # exact group match, so one group name being a prefix of another is harmless
  function(x) rowSums(a[col_groups == x]),
  numeric(nrow(a))
)

这种方法很稳健,也很通用:它按列名前缀对列分组,并对每一组分别按行求和。

a5g8bdjr

a5g8bdjr8#

另一个解决方案是janitor包:

# Add a totals column (where = "col") restricted to the columns Var_1 through
# Var_2. NOTE(review): the column range is passed via a named `... =`
# argument; confirm against the installed janitor version that this form is
# accepted.
janitor::adorn_totals(a, where = "col", ... = Var_1:Var_2)

或者等效地,使用紧凑tidyselect语法,

# Same totals column, selecting the columns to total with the tidyselect
# helper starts_with("Var") instead of an explicit range.
# NOTE(review): relies on the same named `... =` form; confirm against the
# installed janitor version.
janitor::adorn_totals(a, where = "col", ... = starts_with("Var"))

请注意,janitor::adorn_totals(a, where = "col")会将除第一列之外的所有数值列相加。

相关问题