R:以其他不同列为条件的多个变量的分组汇总统计量

yftpprvb  于 2023-04-18  发布在  其他
关注(0)|答案(2)|浏览(105)

我试图获得多个变量的分组汇总统计量,条件是其他不同的列。例如,我有三个总差异变量(n.diff.total)和三个变量,指示每个变量是否应从汇总统计量中排除(na.diff.total = 1,这意味着应将观察结果从计算中排除)。汇总统计量按id1变量分组。有没有比下面的代码更好、更有效的方法来获取这些值?

示例 Dataframe

# Reproducible example data: 20 rows split into two id1 groups, three count
# columns (n.diff.total_*) and three matching 0/1 flag columns
# (na.diff.total_*); a flag of 1 means the row is excluded from the statistics.
set.seed(100)
df <-
  data.frame(
    id1 = c(rep("A", 10), rep("B", 10)),
    # Namespace-qualified so the snippet runs without stringi being attached.
    id2 = stringi::stri_rand_strings(20, 1),
    n.diff.total_rare = sample(0:30, 20, replace = TRUE),
    n.diff.total_general = sample(0:30, 20, replace = TRUE),
    n.diff.total_specialty = sample(0:30, 20, replace = TRUE),
    na.diff.total_rare = sample(0:1, 20, replace = TRUE),
    na.diff.total_general = sample(0:1, 20, replace = TRUE),
    na.diff.total_specialty = sample(0:1, 20, replace = TRUE)
  )

当前代码及示例输出

# dplyr provides %>%, select(), filter(), mutate(), group_by(), summarise()
# and bind_rows() used below.
library(dplyr)

# Per-id1 summary of n.diff.total_rare, using only rows whose flag
# na.diff.total_rare is 0.
output_rare <-
  df %>%
  select(id1, id2, n.diff.total_rare, na.diff.total_rare) %>%
  filter(na.diff.total_rare == 0) %>%
  # 1 marks a row whose value is exactly zero, so sum(zero) counts the zeros.
  mutate(zero = ifelse(n.diff.total_rare == 0, 1, 0)) %>%
  group_by(id1) %>%
  summarise(
    min = min(n.diff.total_rare, na.rm = TRUE),
    max = max(n.diff.total_rare, na.rm = TRUE),
    sd = sd(n.diff.total_rare, na.rm = TRUE),
    mean = mean(n.diff.total_rare, na.rm = TRUE),
    zeros = sum(zero, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(variable = "n.diff.total_rare")

# Per-id1 summary of n.diff.total_specialty, using only rows whose flag
# na.diff.total_specialty is 0.
output_specialty <-
  df %>%
  select(id1, id2, n.diff.total_specialty, na.diff.total_specialty) %>%
  filter(na.diff.total_specialty == 0) %>%
  mutate(zero = ifelse(n.diff.total_specialty == 0, 1, 0)) %>%
  group_by(id1) %>%
  summarise(
    min = min(n.diff.total_specialty, na.rm = TRUE),
    max = max(n.diff.total_specialty, na.rm = TRUE),
    sd = sd(n.diff.total_specialty, na.rm = TRUE),
    mean = mean(n.diff.total_specialty, na.rm = TRUE),
    zeros = sum(zero, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(variable = "n.diff.total_specialty")

# Per-id1 summary of n.diff.total_general, using only rows whose flag
# na.diff.total_general is 0.
output_general <-
  df %>%
  select(id1, id2, n.diff.total_general, na.diff.total_general) %>%
  filter(na.diff.total_general == 0) %>%
  mutate(zero = ifelse(n.diff.total_general == 0, 1, 0)) %>%
  group_by(id1) %>%
  summarise(
    min = min(n.diff.total_general, na.rm = TRUE),
    max = max(n.diff.total_general, na.rm = TRUE),
    sd = sd(n.diff.total_general, na.rm = TRUE),
    mean = mean(n.diff.total_general, na.rm = TRUE),
    zeros = sum(zero, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(variable = "n.diff.total_general")

# Stack the three summaries in one call; row order is rare, specialty, general.
output <- bind_rows(output_rare, output_specialty, output_general)
u1ehiz5o

u1ehiz5o1#

要将这三个步骤合并为一个步骤并一次性输出单个 Dataframe,可以使用 pivot_* 系列函数(如 pivot_longer、pivot_wider)把数据转换为长格式,然后按 id1 和 variable 分组汇总:

library(tidyverse)

# Example data: id2 is drawn from `letters` without replacement, so every row
# has a distinct id2 value.
set.seed(100)
df <-
  tibble(
    id1 = c(rep("A", 10), rep("B", 10)),
    id2 = sample(letters, 20),
    n.diff.total_rare = sample(0:30, 20, replace = TRUE),
    n.diff.total_general = sample(0:30, 20, replace = TRUE),
    n.diff.total_specialty = sample(0:30, 20, replace = TRUE),
    na.diff.total_rare = sample(0:1, 20, replace = TRUE),
    na.diff.total_general = sample(0:1, 20, replace = TRUE),
    na.diff.total_specialty = sample(0:1, 20, replace = TRUE)
  )

df |>
  # ".value" turns the part of each column name before "_" (n.diff.total /
  # na.diff.total) into its own column and stores the part after "_" in
  # `variable`. Each value stays paired with its flag on one row, with no
  # pivot_wider() step that would depend on the id columns being unique.
  pivot_longer(
    !starts_with("id"),
    names_to = c(".value", "variable"),
    names_sep = "_"
  ) |>
  # Keep only observations whose flag is 0.
  filter(na.diff.total == 0) |>
  # 1 marks a row whose value is exactly zero, so sum(zero) counts the zeros.
  mutate(zero = ifelse(n.diff.total == 0, 1, 0)) |>
  group_by(id1, variable) |>
  summarise(
    min = min(n.diff.total, na.rm = TRUE),
    max = max(n.diff.total, na.rm = TRUE),
    sd = sd(n.diff.total, na.rm = TRUE),
    mean = mean(n.diff.total, na.rm = TRUE),
    zeros = sum(zero, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(variable, id1)
#> # A tibble: 6 × 7
#>   id1   variable    min   max    sd  mean zeros
#>   <chr> <chr>     <int> <int> <dbl> <dbl> <dbl>
#> 1 A     general      22    30  3.42  26.5     0
#> 2 B     general       6    23  7.37  12.7     0
#> 3 A     rare          1    29 10.6   18.2     0
#> 4 B     rare          1    29 10.1   15.2     0
#> 5 A     specialty     8    23  6.76  15.5     0
#> 6 B     specialty     3    29 10.9   15.7     0

它给出的值与上面的绑定 Dataframe 相同。

5cnsuln7

5cnsuln72#

这可能更容易在长格式中使用。
首先,添加一列以指示行号,因为在筛选中会引用同一行中的值。
然后,使用pivot_longer将其转换为长格式。函数extract可以将标签分隔为“n”或“na”以及total的类型(例如,“total_rare”)。
希望这能帮上忙。

library(tidyverse)

df %>%
  # Tag every original row so a value and its flag can be matched up again
  # after reshaping, even if id1/id2 combinations repeat.
  mutate(rn = row_number()) %>%
  pivot_longer(cols = -c(id1, id2, rn)) %>%
  # Split e.g. "na.diff.total_rare" into diff = "na" and total = "total_rare".
  # The dots are escaped so they match literal dots only.
  extract(
    col = name,
    into = c("diff", "total"),
    regex = "^(\\w+)\\.diff\\.(\\w+)"
  ) %>%
  # Keep an observation only when its flag row (diff == "na") has value 0.
  filter(any(value == 0 & diff == "na"), .by = c(id1, id2, total, rn)) %>%
  # Drop the flag rows; only the values themselves are summarised.
  filter(diff == "n") %>%
  summarise(
    min = min(value, na.rm = TRUE),
    max = max(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    mean = mean(value, na.rm = TRUE),
    zeros = sum(value == 0, na.rm = TRUE),
    .by = c(id1, total)
  )

输出

id1   total             min   max    sd  mean zeros
  <chr> <chr>           <int> <int> <dbl> <dbl> <int>
1 A     total_rare          2    24  8.31  15.5     0
2 A     total_general       1    29 10.1   15.2     0
3 A     total_specialty     6    23  6.95  12       0
4 B     total_rare          1    29 11.6   16.8     0
5 B     total_general      22    30  3.42  26.5     0
6 B     total_specialty     3    23  8.77  14.2     0

相关问题