计算 Dataframe 中每个组相对于前一个组新增和移除的元素数量

ozxc1zmp  于 2023-05-11  发布在  其他
关注(0)|答案(4)|浏览(118)

我有一个 Dataframe

# Example data: one row per person present in a given quarter.
df <- data.frame(
  Quarter = rep(
    c("Q1 2019", "Q2 2019", "Q3 2019", "Q4 2019"),
    times = c(3, 4, 4, 2)
  ),
  Name = c(
    "Ram", "John", "Jack",          # Q1 2019
    "Ram", "Rach", "Will", "John",  # Q2 2019
    "Ram", "Rach", "Will", "John",  # Q3 2019
    "Rach", "John"                  # Q4 2019
  ),
  stringsAsFactors = FALSE
)

我需要计算每个季度与上一季度相比新增和离开的人数。
预期输出为

quarterYear status Count
1    Q1 2019 Added   3
2    Q1 2019 Left    0
3    Q2 2019 Added   2
4    Q2 2019 Left    1
5    Q3 2019 Added   0
6    Q3 2019 Left    0
7    Q4 2019 Added   0
8    Q4 2019 Left    2

我不知道如何比较两组并得到计数。
如何在R中实现预期的输出?

deyfvvtc

deyfvvtc1#

不确定对速度有何影响,但这个问题本质上主要是比较相邻季度的计数,所以我想到了 diff。

# Presence table (quarter x name): 1 when the person appears in that quarter.
# unique() collapses repeated Quarter/Name rows first; otherwise a count of 2
# would produce differences other than +1 / -1 and be missed below.
# NOTE: table() orders quarters by sorted label, which is chronological only
# while all quarters belong to the same year.
tab <- table(unique(df[c("Quarter", "Name")]))
# Keep the first quarter as-is (everyone present is new) and replace every
# later row by its change against the previous quarter.
tab <- rbind(tab[1, , drop = FALSE], diff(tab))
# +1 = appeared in this quarter, -1 = disappeared since the previous one.
out <- rbind(added = rowSums(tab == 1), left = rowSums(tab == -1))

#      Q1 2019 Q2 2019 Q3 2019 Q4 2019
#added       3       2       0       0
#left        0       1       0       2

如果您需要长输出:

# Long format: one row per (status, quarter) pair with its count.
setNames(
  as.data.frame(as.table(out)),
  c("status", "quarter", "count")
)
#  status quarter count
#1  added Q1 2019     3
#2   left Q1 2019     0
#3  added Q2 2019     2
#4   left Q2 2019     1
#5  added Q3 2019     0
#6   left Q3 2019     0
#7  added Q4 2019     0
#8   left Q4 2019     2
8e2ybdfx

8e2ybdfx2#

先用 split 按季度拆分出一个列表,再用 Map 将每个季度与其前一个季度两两比较,得到互不相同的元素的个数,即

# Names grouped by quarter (split() orders the groups by sorted quarter label).
l1 <- split(df$Name, df$Quarter)

# Pair every quarter with its predecessor and count the set differences:
# names only in the current quarter were added, names only in the previous
# quarter have left. The first quarter has no predecessor and is not reported.
do.call(
  rbind,
  Map(
    function(current, previous) {
      data.frame(
        Added = length(setdiff(current, previous)),
        Left = length(setdiff(previous, current))
      )
    },
    l1[-1],
    l1[-length(l1)]
  )
)

#        Added Left
#Q2 2019     2    1
#Q3 2019     0    0
#Q4 2019     0    2

您可以按照自己的方式整理输出

busg9geu

busg9geu3#

下面的做法是:首先按 Quarter 分组,把 Name 列汇总为每个季度的姓名列表,然后使用 purrr::map2_int 将每个季度与前一个季度进行比较。最后,使用 tidyr::pivot_longer 将新增的两列 Added 和 Left 透视为长格式。

library(tidyverse)

df %>%
  group_by(Quarter) %>%
  # One row per quarter holding the list of names seen in it.
  summarise(names = list(Name)) %>%
  mutate(
    # Names of the preceding quarter; an empty list for the first quarter.
    previous = lag(names, default = list(list())),
    Added = map2_int(
      names, previous,
      function(now, before) length(setdiff(now, before))
    ),
    Left = map2_int(
      names, previous,
      function(now, before) length(setdiff(before, now))
    )
  ) %>%
  # Drop the helper list columns before reshaping to long format.
  select(Quarter, Added, Left) %>%
  pivot_longer(c(Added, Left), names_to = "status", values_to = "Count")

结果:

# A tibble: 8 x 3
  Quarter status Count
  <chr>   <chr>  <int>
1 Q1 2019 Added      3
2 Q1 2019 Left       0
3 Q2 2019 Added      2
4 Q2 2019 Left       1
5 Q3 2019 Added      0
6 Q3 2019 Left       0
7 Q4 2019 Added      0
8 Q4 2019 Left       2
wfsdck30

wfsdck304#

这里有一种方法可以保持数据的宽度。我们将数据拆分为基于Quarter的 Dataframe 列表。使用map2,我们将每个季度与其上一个季度进行比较,并计算每个季度新增和离开的人数。第一季度的值单独计算,然后与其余季度的结果绑定在一起。

library(tidyverse)

# One tibble per quarter; group_split() orders them by sorted Quarter value.
list_df <- df %>% group_split(Quarter)

# The first quarter has no predecessor: everyone counts as added, nobody left.
# It is built with tibble() because returning several rows per group from
# summarise() is deprecated since dplyr 1.1.0.
tibble(
  quarterYear = list_df[[1]]$Quarter[1],
  status = c("Added", "Left"),
  Count = c(n_distinct(list_df[[1]]$Name), 0)
) %>%
  bind_rows(
    map2_df(
      list_df[-1], list_df[-length(list_df)],
      # .x is the current quarter, .y the previous one. setdiff() counts
      # distinct people, so a name repeated within a quarter is counted once.
      ~ tibble(
        quarterYear = .x$Quarter[1],
        status = c("Added", "Left"),
        Count = c(
          length(setdiff(.x$Name, .y$Name)),
          length(setdiff(.y$Name, .x$Name))
        )
      )
    )
  )

# A tibble: 8 x 3
#  quarterYear status Count
#  <chr>       <chr>  <dbl>
#1 Q1 2019     Added      3
#2 Q1 2019     Left       0
#3 Q2 2019     Added      2
#4 Q2 2019     Left       1
#5 Q3 2019     Added      0
#6 Q3 2019     Left       0
#7 Q4 2019     Added      0
#8 Q4 2019     Left       2

在基础 R(base R)中使用相同的逻辑:

# One data frame per quarter; split() orders them by sorted Quarter label.
list_df <- split(df, df$Quarter)
temp <- list_df[[1]]

# Stack the first quarter and every later quarter-vs-previous comparison in a
# single rbind(). unique()/setdiff() count distinct people rather than rows,
# and make.row.names = FALSE yields plain 1..n row names instead of labels
# such as "Q2 2019.1".
do.call(rbind, c(
  # First quarter has no predecessor: everyone is added, nobody left.
  list(data.frame(quarterYear = temp$Quarter[1],
                  status = c("Added", "Left"),
                  Count = c(length(unique(temp$Name)), 0))),
  # x is the current quarter, y the previous one.
  unname(Map(function(x, y)
    data.frame(quarterYear = x$Quarter[1],
               status = c("Added", "Left"),
               Count = c(length(setdiff(x$Name, y$Name)),
                         length(setdiff(y$Name, x$Name)))),
    list_df[-1], list_df[-length(list_df)])),
  list(make.row.names = FALSE)
))

相关问题