R语言:在一次 group_by + summarise 中按不同的过滤条件统计各组内的不同值个数

fnvucqvd  于 2023-01-18  发布在  其他
关注(0)|答案(2)|浏览(88)

我想得到某个变量的不同取值个数,同时得到在另一个变量满足某个条件时该变量的不同取值个数,并且这些都要在同一个 group_by + summarise 操作中完成。假设这是我的 df:

library(tidyverse)

# Fix the RNG seed so the sampled example data is reproducible.
set.seed(1)

# Number of rows in the example data.
sample_size <- 20

# Example data: one row per id; the other columns are sampled with replacement.
df <- tibble(
  id = 1:sample_size,
  var_1 = sample(1:5, sample_size, replace = TRUE),
  var_2 = sample(c("yes", "no", "maybe"), sample_size, replace = TRUE),
  month = sample(1:12, sample_size, replace = TRUE),
  year = sample(c(2022, 2023), sample_size, replace = TRUE)
)

# A tibble: 6 × 5
     id var_1 var_2 month  year
  <int> <int> <chr> <int> <dbl>
1     1     1 yes       8  2023
2     2     4 no        6  2022
3     3     1 no       10  2022
4     4     2 no        7  2022
5     5     5 yes       3  2023
6     6     3 maybe    10  2022

以下是每个 (month, year) 分组中 var_1 的不同取值个数:

# Number of distinct var_1 values within each (month, year) group.
df %>%
  group_by(month, year) %>%
  summarise(distinct_var_1 = n_distinct(var_1)) %>%
  ungroup()

# A tibble: 6 × 3
  month  year distinct_var_1
  <int> <dbl>          <int>
1     1  2023              1
2     2  2022              1
3     2  2023              1
4     3  2022              2
5     3  2023              1
6     6  2022              2

如果只考虑 var_2 == 'maybe' 的行,则每个分组中 var_1 的不同取值个数为:

# Keep only the rows where var_2 is "maybe", then count the distinct var_1
# values within each remaining (month, year) group.
df %>%
  filter(var_2 == "maybe") %>%
  group_by(month, year) %>%
  summarise(distinct_2 = n_distinct(var_1)) %>%
  ungroup()

# A tibble: 4 × 3
  month  year distinct_2
  <int> <dbl>      <int>
1     1  2023          1
2     6  2023          1
3     8  2022          2
4    10  2022          1

我希望能够在一个group_by + summarise中同时执行这两个操作。有什么想法吗?

uqcuzwp8

uqcuzwp81#

你可以试试

# For each (month, year) group, count the distinct var_1 values overall and
# among the rows where var_2 is "maybe", in a single summarise() call.
df %>%
  group_by(month, year) %>%
  summarise(
    distinct_var_1 = n_distinct(var_1),
    # %in% never yields NA, so rows with a missing var_2 are excluded (as
    # filter() would do) instead of indexing var_1 with NA and adding a
    # spurious NA value to the distinct count.
    distinct_2 = n_distinct(var_1[var_2 %in% "maybe"])
  ) %>%
  ungroup()

# # A tibble: 13 × 4
#    month  year distinct_var_1 distinct_2
#    <int> <dbl>          <int>      <int>
#  1     1  2023              1          1
#  2     2  2022              1          0
#  3     2  2023              1          0
#  4     3  2022              2          0
#  5     3  2023              1          0
#  6     6  2022              2          0
#  7     6  2023              2          1
#  8     7  2022              1          0
#  9     7  2023              1          0
# 10     8  2022              2          2
# 11     8  2023              1          0
# 12    10  2022              2          1
# 13    12  2022              1          0
guicsvcw

guicsvcw2#

您可以在 n_distinct() 函数内部对 var_1 取子集,只保留 var_2 等于 "maybe" 的那些值:

# For each (month, year) group, count the distinct var_1 values overall and
# among the rows where var_2 is "maybe", in a single summarise() call.
df %>%
  group_by(month, year) %>%
  summarise(
    distinct_var_1 = n_distinct(var_1),
    # %in% never yields NA, so rows with a missing var_2 are excluded (as
    # filter() would do) instead of indexing var_1 with NA and adding a
    # spurious NA value to the distinct count.
    distinct_var_2 = n_distinct(var_1[var_2 %in% "maybe"])
  ) %>%
  ungroup()

返回结果:

# A tibble: 13 × 4
   month  year distinct_var_1 distinct_var_2
   <int> <dbl>          <int>          <int>
 1     1  2023              1              1
 2     2  2022              1              0
 3     2  2023              1              0
 4     3  2022              2              0
 5     3  2023              1              0
 6     6  2022              2              0
 7     6  2023              2              1
 8     7  2022              1              0
 9     7  2023              1              0
10     8  2022              2              2
11     8  2023              1              0
12    10  2022              2              1
13    12  2022              1              0

相关问题