有没有一种有效的方法来分组和计算R中的 Dataframe 中的多个列?

des4xlb0  于 2023-04-27  发布在  其他
关注(0)|答案(3)|浏览(128)

有:\

date        age1 age2 age3 age4 age5 age6 useditems total items
11/10/2021   1     2   1    2    1    1     15         20
11/11/2021   2     1   2    4    1    2      5         30
09/16/2022   1     2   1    5    3    1      3         10
09/17/2022   2     3   1    3    1    1      2         15
04/05/2021   1     1   2    1    2    1      3          9
04/06/2021   2     1   2    1    3    2      1          9

需求:\

Week        age_sum useditems_sum total_sum Week
11/07/2021    20     20              50
09/11/2022    24      5              25
04/04/2021    19      4              18

我想按周对我的数据进行分组,就像在“想要”中一样,然后计算各行的总和,这样我就有了每周的总和。
我是R的新手,正在努力弄清楚这一点。任何帮助或指导都是值得的。

t40tm48m

t40tm48m1#

下面是一个基于R的解(比dplyr解快大约三倍)。
首先,将日期转换为"Date"

dat$date <- as.Date(dat$date, format='%m/%d/%Y')

然后拆分by周,找到周的第一天1并添加计算:

by(dat, strftime(dat$date, '%Y-%U'), \(x) {
  w <- seq.Date(x$date[1] - 6, x$date[1], by='day')
  data.frame(week=w[weekdays(w) == 'Sunday']) |>
    cbind(t(c(age_sum=sum(x[grep('^age\\d+', names(x))]), sum=colSums(x[c("useditems", "total_items")]))))
}) |> do.call(what=rbind)
#               week age_sum sum.useditems sum.total_items
# 2021-14 2021-04-04      19             4              18
# 2021-45 2021-11-07      20            20              50
# 2022-37 2022-09-11      24             5              25

请注意,这遵循美国惯例。对于ISO,您可能希望使用此版本:

by(dat, strftime(dat$date, '%Y-%V'), \(x) {
  w <- seq.Date(x$date[1] - 6, x$date[1], by='day')
  data.frame(week=w[weekdays(w) == 'Monday']) |>
    cbind(t(c(age_sum=sum(x[grep('^age\\d+', names(x))]), sum=colSums(x[c("useditems", "total_items")]))))
}) |> do.call(what=rbind)
#               week age_sum sum.useditems sum.total_items
# 2021-14 2021-04-05      19             4              18
# 2021-45 2021-11-08      20            20              50
# 2022-37 2022-09-12      24             5              25
  • 数据:*
dat <- structure(list(date = c("11/10/2021", "11/11/2021", "09/16/2022", 
"09/17/2022", "04/05/2021", "04/06/2021"), age1 = c(1L, 2L, 1L, 
2L, 1L, 2L), age2 = c(2L, 1L, 2L, 3L, 1L, 1L), age3 = c(1L, 2L, 
1L, 1L, 2L, 2L), age4 = c(2L, 4L, 5L, 3L, 1L, 1L), age5 = c(1L, 
1L, 3L, 1L, 2L, 3L), age6 = c(1L, 2L, 1L, 1L, 1L, 2L), useditems = c(15L, 
5L, 3L, 2L, 3L, 1L), total_items = c(20L, 30L, 10L, 15L, 9L, 
9L)), class = "data.frame", row.names = c(NA, -6L))
tyu7yeag

tyu7yeag2#

我们也可以做

library(dplyr) # >= 1.1.0
library(lubridate)
df1 %>% 
  mutate(date = mdy(date),
  Week = format(as.Date(format(date, '%Y-%U-0'), 
     format = '%Y-%W-%w'), "%m/%d/%Y")) %>% 
  reframe(age_sum = sum(pick(starts_with("age"))), 
   across(c(useditems, totalitems), ~ sum(.x), .names = "{.col}_sum"),
    .by = Week)
  • 输出
Week age_sum useditems_sum totalitems_sum
1 11/07/2021      20            20             50
2 09/11/2022      24             5             25
3 04/04/2021      19             4             18

数据

df1 <- structure(list(date = c("11/10/2021", "11/11/2021", "09/16/2022", 
"09/17/2022", "04/05/2021", "04/06/2021"), age1 = c(1L, 2L, 1L, 
2L, 1L, 2L), age2 = c(2L, 1L, 2L, 3L, 1L, 1L), age3 = c(1L, 2L, 
1L, 1L, 2L, 2L), age4 = c(2L, 4L, 5L, 3L, 1L, 1L), age5 = c(1L, 
1L, 3L, 1L, 2L, 3L), age6 = c(1L, 2L, 1L, 1L, 1L, 2L), useditems = c(15L, 
5L, 3L, 2L, 3L, 1L), totalitems = c(20L, 30L, 10L, 15L, 9L, 9L
)), class = "data.frame", row.names = c(NA, -6L))
aiazj4mn

aiazj4mn3#

我们可以这样做:

library(dplyr)

df %>%
  mutate(date = as.Date(date, format = "%m/%d/%Y"),
         week = format(date, format = "%U")) %>% 
  group_by(week) %>%
  summarise(age_sum = sum(age1 + age2 + age3 + age4 + age5 + age6),
            useditems_sum = sum(useditems),
            total_sum = sum(total_items)) %>% 
  arrange(-total_sum)
week  age_sum useditems_sum total_sum
  <chr>   <int>         <int>     <int>
1 45         20            20        50
2 37         24             5        25
3 14         19             4        18

相关问题