在 tidyverse 中识别二元变量的两两组合

gr8qqesn  于 2023-01-28  发布在  其他
关注(0)|答案(2)|浏览(135)

我有一个数据框（data frame），其中包含若干取值为“是/否”（1/0）的二元变量（我的实际数据中有 23 个，下面的示例数据只包含 7 个），我想识别这些变量两两成对的组合。

# Example data: 25 rows of seven binary (0 = no, 1 = yes) variables.
library(dplyr)  # provides tibble()/as_tibble() and the %>% pipe

# Fix the RNG seed so the random example data is the same on every run.
set.seed(2023)

# Sampling weights for each variable, given as c(P(0), P(1)).
yes_no_probs <- list(
  V1 = c(0.6, 0.4),
  V2 = c(0.6, 0.4),
  V3 = c(0.8, 0.2),
  V4 = c(0.7, 0.3),
  V5 = c(0.8, 0.2),
  V6 = c(0.8, 0.2),
  V7 = c(0.8, 0.2)
)

# Draw one column per entry, in order, and assemble them into a tibble.
df <- as_tibble(lapply(
  yes_no_probs,
  \(p) sample(c(0, 1), 25, replace = TRUE, prob = p)
))

如果我想标识数据框中每一种唯一的取值组合（即所有列取值都相同的行归为一组），可以使用 cur_group_id()，如下所示：

# Give every distinct combination of values across all columns its own
# integer id, then drop the grouping so that later verbs are not silently
# applied per group.
df %>%
  group_by(across(everything())) %>%
  mutate(combo_id = cur_group_id()) %>%
  ungroup()

但我实际想要识别的是取值为“是”的变量两两成对的组合。例如，我想找出 V1 == 1 & V2 == 1 的行，而不管其他列的取值是什么。
所以基本上我想做的是：

# Label the rows in which both V1 and V2 equal 1; every other row gets NA
# because no case_when() condition matches it.
mutate(
  df,
  combo_id = case_when(
    V1 == 1 & V2 == 1 ~ "V1_V2"
  )
)

但我希望能够把这种做法应用到数据框中所有变量的每一种两两组合上。
也许这适合用 map() 来完成？我卡住了。

dohp0rv5

dohp0rv51#

我们可以使用combn

# Build the "<a>_<b>" label for one pair of column names.
pair_label <- function(pair) paste(pair, collapse = "_")

# For each pair of columns, return the pair's label on rows whose row sum
# over the two columns is 2 (both are 1 for 0/1 data) and NA elsewhere.
# combn() binds the per-pair vectors into a character matrix with one
# column per pair.
out <- combn(
  names(df), 2,
  FUN = function(pair) {
    both_yes <- rowSums(df[pair]) == 2
    case_when(both_yes ~ pair_label(pair))
  }
)

# Name the matrix columns "combo_id_<a>_<b>", in combn()'s pair order.
colnames(out) <- paste0("combo_id_", combn(names(df), 2, FUN = pair_label))

# Append the pair columns to the right of the original data.
df2 <- cbind(df, out)
  • 输出
> head(df2, 2)
  V1 V2 V3 V4 V5 V6 V7 combo_id_V1_V2 combo_id_V1_V3 combo_id_V1_V4 combo_id_V1_V5 combo_id_V1_V6 combo_id_V1_V7 combo_id_V2_V3
1  1  0  0  1  1  0  1           <NA>           <NA>          V1_V4          V1_V5           <NA>          V1_V7           <NA>
2  1  0  0  1  0  0  0           <NA>           <NA>          V1_V4           <NA>           <NA>           <NA>           <NA>
  combo_id_V2_V4 combo_id_V2_V5 combo_id_V2_V6 combo_id_V2_V7 combo_id_V3_V4 combo_id_V3_V5 combo_id_V3_V6 combo_id_V3_V7
1           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>
2           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>
  combo_id_V4_V5 combo_id_V4_V6 combo_id_V4_V7 combo_id_V5_V6 combo_id_V5_V7 combo_id_V6_V7
1          V4_V5           <NA>          V4_V7           <NA>          V5_V7           <NA>
2           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>

或者使用tidyverse

library(dplyr)
library(purrr)
library(stringr)
# For every two-column subset of df, build a single column named
# "combo_id_<a>_<b>" that holds the label "<a>_<b>" on rows whose row sum
# over the two columns is 2 and NA elsewhere; then bind all of these
# columns to the right of df.
df %>%
  combn(2, simplify = FALSE) %>%
  map_dfc(\(pair_df) {
    pair_label <- str_c(names(pair_df), collapse = "_")
    col_name <- sprintf("combo_id_%s", pair_label)
    transmute(
      pair_df,
      !!col_name := case_when(rowSums(pair_df) == 2 ~ pair_label)
    )
  }) %>%
  bind_cols(df, .)
b1zrtrql

b1zrtrql2#

下面是另一种方法：先把数据转换成长格式，再按行号将其与自身连接，从而找出同一行中同时为 1 的变量对，最后再转换回宽格式。

library(dplyr); library(tidyr)
# Long format: one record per (row, variable) whose value is 1.
# `pos` keeps each variable's column position so that a pair can be ordered
# by position instead of alphabetically (which would put V10 before V2).
df_r <- df %>%
  mutate(row = row_number()) %>%
  pivot_longer(-row) %>%
  mutate(pos = match(name, names(df))) %>%
  filter(value == 1)

# Every unordered pair of variables, labelled "<a>_<b>" in column order.
all_combos <- combn(names(df), 2, FUN = paste, collapse = "_")

# Self-join on the row id to pair up the variables that are 1 in the same
# row. The many-to-many match is intentional (recent dplyr versions warn
# about it).
df_r %>%
  left_join(df_r, by = "row") %>%
  # Keep each pair once: this drops self-pairs and the mirrored duplicate
  # (V2_V1 when V1_V2 is already present).
  filter(pos.x < pos.y) %>%
  transmute(row, combo = paste(name.x, name.y, sep = "_"), value = 1) %>%
  # Fill in 0 for every original row and every possible pair, so rows with
  # no pair and pairs that never occur are still part of the result.
  complete(
    row = seq_len(nrow(df)),
    combo = all_combos,
    fill = list(value = 0)
  ) %>%
  # pivot_wider() orders columns by first appearance, so sort by pair order.
  arrange(row, match(combo, all_combos)) %>%
  pivot_wider(names_from = combo, values_from = value)

结果

# A tibble: 16 × 37
     row V1_V2 V1_V3 V1_V4 V1_V5 V1_V6 V1_V7 V2_V1 V2_V3 V2_V4 V2_V5 V2_V6 V2_V7 V3_V1 V3_V2 V3_V4 V3_V5
   <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1     0     0     0     0     0     0     0     0     0     1     1     0     0     0     0     0
 2     2     0     0     0     0     0     0     0     0     1     0     0     0     0     0     0     0
 3     3     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
 4     4     0     0     0     0     0     0     0     0     1     0     0     0     0     0     0     0
 5     6     1     0     0     0     0     0     1     0     0     0     0     0     0     0     0     0
 6     7     0     1     0     0     0     1     0     0     0     0     0     0     1     0     0     0
 7     8     1     0     0     0     0     0     1     0     0     0     0     0     0     0     0     0
 8     9     1     0     1     0     0     0     1     0     1     0     0     0     0     0     0     0
 9    11     0     0     0     0     0     0     0     0     0     0     0     0     0     0     1     0
10    12     1     0     0     0     0     0     1     0     0     0     0     0     0     0     0     0
11    13     0     0     0     0     0     0     0     0     0     0     0     1     0     0     0     0
12    15     0     0     0     0     0     0     0     1     0     0     0     1     0     1     0     0
13    16     1     0     1     0     1     0     1     0     1     0     1     0     0     0     0     0
14    19     0     0     0     0     0     0     0     1     1     1     1     0     0     1     1     1
15    20     0     0     0     0     0     0     0     1     0     0     0     0     0     1     0     0
16    21     0     0     0     1     0     0     0     0     0     0     0     0     0     0     0     0
# … with 20 more variables: V3_V6 <dbl>, V3_V7 <dbl>, V4_V1 <dbl>, V4_V2 <dbl>, V4_V3 <dbl>, V4_V5 <dbl>,
#   V4_V6 <dbl>, V5_V1 <dbl>, V5_V2 <dbl>, V5_V3 <dbl>, V5_V4 <dbl>, V5_V6 <dbl>, V6_V1 <dbl>, V6_V2 <dbl>,
#   V6_V3 <dbl>, V6_V4 <dbl>, V6_V5 <dbl>, V7_V1 <dbl>, V7_V2 <dbl>, V7_V3 <dbl>
# ℹ Use `colnames()` to see all variable names

相关问题