group_by如果两列中的任何一列形成一个组

i86rm4rw  于 2023-10-13  发布在  其他
关注(0)|答案(2)|浏览(131)

我想组建一个团体。应始终考虑两列(ID、ADDRESS),然后考虑NAME_1或NAME_2来形成组(基于是否存在相同的条目)。下面是一个简单的例子:

library(dplyr)

# Sample input data frame
df <- data.frame(
  ID = c("1234", "1234", "1234", "1234", "1234", "1234", "2345"),
  ADDRESS = c("Weg 1", "Weg 1", "Weg 1", "Weg 1", "Weg 1", "Weg 1", "Weg 1234"),
  NAME_1 = c("Müller", "Peter", "Hirn", "Hirn", "Test", "Test", "Müller"),
  NAME_2 = c("Meier", "Meier", "Mensch", "Maler", "Hallo", "Velo", "Meier")
)

# Create a grouping variable based on conditions
df_grouped <- df %>%
  group_by(ID, ADDRESS, EITHER(NAME_1, NAME_2)) %>% 
  mutate(GRP = ...)

# Desired output
df <- data.frame(
  ID = c("1234", "1234", "1234", "1234", "1234", "1234", "2345"),
  ADDRESS = c("Weg 1", "Weg 1", "Weg 1", "Weg 1", "Weg 1", "Weg 1", "Weg 1234"),
  NAME_1 = c("Müller", "Peter", "Hirn", "Hirn", "Test", "Test", "Müller"),
  NAME_2 = c("Meier", "Meier", "Mensch", "Maler", "Hallo", "Velo", "Meier"),
  GRP = c(1, 1, 2, 2, 3, 3, 4)
)
)

有办法解决吗?
我尝试了一些嵌套分组,但还没有找到解决方案。

9rbhqvlz

9rbhqvlz1#

如果你允许NAME_1NAME_2中的组重叠,你可以构造一个无向图并提取它的组件:

library(tidyverse)
library(igraph)

# create data with overlapping groups
df <- tribble(
  ~ID,   ~ADDRESS,  ~NAME_1,  ~NAME_2,
  "1234",    "Weg 1", "Müller",  "Meier",
  "1234",    "Weg 1",   "Hirn",  "Meier",
  "1234",    "Weg 1",   "Test",  "Hallo",
  "1234",    "Weg 1",   "Test",   "Velo",
  "1234",    "Weg 1",   "Hirn", "Mensch",
  "2345", "Weg 1234", "Müller",  "Meier"
)

# add group identifiers for NAME_1 and NAME_2
df <- df |>
  mutate(id_1 = cur_group_id(),
         .by = c(ID, ADDRESS, NAME_1)) |>
  mutate(id_2 = nrow(df) + cur_group_id(),
         .by = c(ID, ADDRESS, NAME_2))

# find components of undirected graph
components <- df |> 
  select(id_1, id_2) |> 
  igraph::graph_from_data_frame(directed = FALSE) |> 
  igraph::components() |> 
  pluck("membership")

df |> mutate(GRP = components[id_1])
#> # A tibble: 6 × 7
#>   ID    ADDRESS  NAME_1 NAME_2  id_1  id_2   GRP
#>   <chr> <chr>    <chr>  <chr>  <int> <int> <dbl>
#> 1 1234  Weg 1    Müller Meier      1     7     1
#> 2 1234  Weg 1    Hirn   Meier      2     7     1
#> 3 1234  Weg 1    Test   Hallo      3     8     2
#> 4 1234  Weg 1    Test   Velo       3     9     2
#> 5 1234  Weg 1    Hirn   Mensch     2    10     1
#> 6 2345  Weg 1234 Müller Meier      4    11     3

创建于2023-10-07附带reprex v2.0.2
NAME_1NAME_2不重叠的简单情况下,您可以为每个观察选择较大的组:

library(tidyverse)

# Sample input data frame
df <- tibble(
  ID = c("1234", "1234", "1234", "1234", "1234", "1234", "2345"),
  ADDRESS = c("Weg 1", "Weg 1", "Weg 1", "Weg 1", "Weg 1", "Weg 1", "Weg 1234"),
  NAME_1 = c("Müller", "Peter", "Hirn", "Hirn", "Test", "Test", "Müller"),
  NAME_2 = c("Meier", "Meier", "Mensch", "Maler", "Hallo", "Velo", "Meier")
)

# number NAME_1 and NAME_2
name_1 <- df |> 
  distinct(ID, ADDRESS, NAME_1) |> 
  mutate(id_1 = row_number())

name_2 <- df |> 
  distinct(ID, ADDRESS, NAME_2) |> 
  mutate(id_2 = nrow(df) + row_number())

# merge and choose group ID of larger group
df |> 
  left_join(name_1) |> 
  left_join(name_2) |> 
  add_count(id_1, name = "n_1") |> 
  add_count(id_2, name = "n_2") |> 
  mutate(grp = if_else(n_2 > n_1, id_2, id_1)) |> 
  mutate(GRP = cur_group_id(),
         .by = c(ID, ADDRESS, grp))
#> Joining with `by = join_by(ID, ADDRESS, NAME_1)`
#> Joining with `by = join_by(ID, ADDRESS, NAME_2)`
#> # A tibble: 7 × 10
#>   ID    ADDRESS  NAME_1 NAME_2  id_1  id_2   n_1   n_2   grp   GRP
#>   <chr> <chr>    <chr>  <chr>  <int> <int> <int> <int> <int> <int>
#> 1 1234  Weg 1    Müller Meier      1     8     1     2     8     1
#> 2 1234  Weg 1    Peter  Meier      2     8     1     2     8     1
#> 3 1234  Weg 1    Hirn   Mensch     3     9     2     1     3     2
#> 4 1234  Weg 1    Hirn   Maler      3    10     2     1     3     2
#> 5 1234  Weg 1    Test   Hallo      4    11     2     1     4     3
#> 6 1234  Weg 1    Test   Velo       4    12     2     1     4     3
#> 7 2345  Weg 1234 Müller Meier      5    13     1     1     5     4

创建于2023-10-07带有reprex v2.0.2

brgchamk

brgchamk2#

另一个tidyverse选项可能是:

df %>%
 group_by(ID, ADDRESS) %>%
 mutate(GRP = map_chr(seq_along(ID), 
                      function(names) {
                       set1 <- toString(which(NAME_1 %in% NAME_1[names]))
                       set2 <- toString(which(NAME_2 %in% NAME_2[names]))
                       res = if_else(nchar(set1) > nchar(set2), set1, set2)
                       return(res)
                       })) %>%
 ungroup() %>%
 mutate(GRP = consecutive_id(GRP))

  ID    ADDRESS  NAME_1 NAME_2   GRP
  <chr> <chr>    <chr>  <chr>  <int>
1 1234  Weg 1    Müller Meier      1
2 1234  Weg 1    Peter  Meier      1
3 1234  Weg 1    Hirn   Mensch     2
4 1234  Weg 1    Hirn   Maler      2
5 1234  Weg 1    Test   Hallo      3
6 1234  Weg 1    Test   Velo       3
7 2345  Weg 1234 Müller Meier      4

相关问题