我们如何在R中从性别和家庭id以及个人id列中创建兄弟姐妹组合列

yacmzcpb  于 2023-11-14  发布在  其他
关注(0)|答案(2)|浏览(148)

我将建议的代码应用于原始数据集。但它没有在siblings_composition列中产生所需的结果,例如至少1个男性兄弟姐妹为1,至少1个女性兄弟姐妹为2,男性和女性兄弟姐妹均为3,没有兄弟姐妹为0。在原始数据集中,BIRIMNO用于family_id,CINSIYET表示性别,id表示individual_id。作为示例,我提供了由以下代码产生的结果:
head(data)

# A tibble: 6 × 4
# Groups:   BIRIMNO [5]
  BIRIMNO CINSIYET       id siblings_composition
    <dbl> <fct>       <dbl>                <int>
1  144003 F        14400307                    3
2  144003 M        14400306                    3
3  144009 F        14400903                    3
4  144014 M        14401409                    3
5  144015 M        14401501                    2
6  144016 M        14401603                    3

字符串
为了便于重现,以下是原始数据集前100行的dput输出:
dput(head(data,100))

structure(list(BIRIMNO = c(144003, 144003, 144009, 144014, 144015, 
144016, 144020, 144020, 144021, 144025, 144025, 144025, 144028, 
144028, 144029, 144031, 144034, 144036, 144039, 144040, 144042, 
144042, 144046, 144047, 144047, 144049, 144054, 144056, 144056, 
144060, 144061, 144069, 144071, 144071, 144071, 144071, 144073, 
144074, 144074, 144077, 144079, 144080, 144084, 144084, 144084, 
144088, 144088, 144090, 144092, 144092, 144092, 144094, 144113, 
144118, 144120, 144122, 144123, 144123, 144123, 144124, 144127, 
144127, 144129, 144129, 144130, 144134, 144137, 144138, 144151, 
144152, 144154, 144158, 144162, 144162, 144162, 144162, 144163, 
144163, 144163, 144167, 144172, 144172, 144176, 144176, 144181, 
144181, 144183, 144185, 144189, 144202, 144202, 144214, 144215, 
144217, 144219, 144224, 144224, 144247, 144247, 144249), CINSIYET = structure(c(2L, 
1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 
1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 
1L, 1L, 2L), levels = c("M", "F"), class = "factor"), id = c(14400307, 
14400306, 14400903, 14401409, 14401501, 14401603, 14402003, 14402004, 
14402103, 14402503, 14402505, 14402506, 14402803, 14402804, 14402904, 
14403104, 14403404, 14403603, 14403903, 14404003, 14404205, 14404204, 
14404603, 14404703, 14404704, 14404905, 14405403, 14405603, 14405604, 
14406004, 14406103, 14406903, 14407109, 14407112, 14407111, 14407110, 
14407303, 14407403, 14407404, 14407706, 14407908, 14408006, 14408405, 
14408404, 14408403, 14408803, 14408804, 14409004, 14409204, 14409205, 
14409203, 14409405, 14411303, 14411804, 14412003, 14412203, 14412304, 
14412306, 14412305, 14412407, 14412704, 14412705, 14412906, 14412905, 
14413003, 14413403, 14413703, 14413804, 14415103, 14415203, 14415404, 
14415803, 14416207, 14416204, 14416206, 14416205, 14416306, 14416307, 
14416308, 14416704, 14417204, 14417205, 14417603, 14417604, 14418104, 
14418103, 14418303, 14418503, 14418903, 14420204, 14420203, 14421403, 
14421503, 14421704, 14421903, 14422403, 14422404, 14424704, 14424703, 
14424903), siblings_composition = c(3L, 3L, 3L, 3L, 2L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L)), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -100L), groups = structure(list(
    BIRIMNO = c(144003, 144009, 144014, 144015, 144016, 144020, 
    144021, 144025, 144028, 144029, 144031, 144034, 144036, 144039, 
    144040, 144042, 144046, 144047, 144049, 144054, 144056, 144060, 
    144061, 144069, 144071, 144073, 144074, 144077, 144079, 144080, 
    144084, 144088, 144090, 144092, 144094, 144113, 144118, 144120, 
    144122, 144123, 144124, 144127, 144129, 144130, 144134, 144137, 
    144138, 144151, 144152, 144154, 144158, 144162, 144163, 144167, 
    144172, 144176, 144181, 144183, 144185, 144189, 144202, 144214, 
    144215, 144217, 144219, 144224, 144247, 144249), .rows = structure(list(
        1:2, 3L, 4L, 5L, 6L, 7:8, 9L, 10:12, 13:14, 15L, 16L, 
        17L, 18L, 19L, 20L, 21:22, 23L, 24:25, 26L, 27L, 28:29, 
        30L, 31L, 32L, 33:36, 37L, 38:39, 40L, 41L, 42L, 43:45, 
        46:47, 48L, 49:51, 52L, 53L, 54L, 55L, 56L, 57:59, 60L, 
        61:62, 63:64, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 
        73:76, 77:79, 80L, 81:82, 83:84, 85:86, 87L, 88L, 89L, 
        90:91, 92L, 93L, 94L, 95L, 96:97, 98:99, 100L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -68L), .drop = TRUE))


非常感谢

xkftehaa

xkftehaa1#

我假设你的意思是,用“2”表示至少有“一个”女性兄弟姐妹的个体(而不是要求有2个或更多女性兄弟姐妹)。
除了上面关于使用“data”作为data.frame名称以及函数参数的评论外,您可以尝试以下方法。
使用tidyverse,你可以按family_id分组,然后用purrr中的map_int遍历家庭中的每一行。row_number()给出当前行在家庭内的序号,也就是要从家庭中排除的那一行,这样就可以只考察该成员的兄弟姐妹,看其中是否有男性和女性。因此,gender[-.x]实际上包含给定家庭中除当前行(由row_number()确定)之外的所有行。
为了重现性,我会用case_when明确地定义每种情况。请告诉我这是否给出了预期的输出。

library(tidyverse)

# Code each person's sibling composition within their family:
#   3 = male and female siblings, 2 = female only, 1 = male only, 0 = none.
data |>
  group_by(family_id) |>
  mutate(siblings_composition = map_int(
    # row_number() is each person's position inside their family group.
    row_number(),
    # gender[-.x] is the family's gender vector with the current person
    # removed, i.e. the genders of that person's siblings.
    ~ case_when(
      all(c("M", "F") %in% gender[-.x]) ~ 3L,
      "F" %in% gender[-.x] ~ 2L,
      "M" %in% gender[-.x] ~ 1L,
      # Integer literals throughout: map_int() must receive integers.
      .default = 0L
    )
  )) |>
  # Drop the grouping so later steps do not silently run per family.
  ungroup()

字符串

输出

individual_id family_id gender siblings_composition
          <dbl>     <dbl> <chr>                 <int>
1             1         1 M                         2
2             2         1 F                         1
3             3         2 M                         2
4             4         2 F                         1
5             5         3 M                         0
6             6         4 M                         2
7             7         4 F                         1
8             8         5 M                         2
9             9         5 F                         1

编辑(11/7/23):

原始帖子中提供了额外的数据。代码已被修改以匹配列名。

# Same coding applied to the original column names (BIRIMNO = family id,
# CINSIYET = gender):
#   3 = male and female siblings, 2 = female only, 1 = male only, 0 = none.
data |>
  group_by(BIRIMNO) |>
  mutate(siblings_composition = map_int(
    # row_number() is each person's position inside their family group.
    row_number(),
    # CINSIYET[-.x] is the family's gender vector with the current person
    # removed, i.e. the genders of that person's siblings.
    ~ case_when(
      all(c("M", "F") %in% CINSIYET[-.x]) ~ 3L,
      "F" %in% CINSIYET[-.x] ~ 2L,
      "M" %in% CINSIYET[-.x] ~ 1L,
      # Integer literals throughout: map_int() must receive integers.
      .default = 0L
    )
  )) |>
  # Drop the grouping so later steps do not silently run per family.
  ungroup()

输出

BIRIMNO CINSIYET       id siblings_composition
     <dbl> <fct>       <dbl>                <int>
 1  144003 F        14400307                    1
 2  144003 M        14400306                    2
 3  144009 F        14400903                    0
 4  144014 M        14401409                    0
 5  144015 M        14401501                    0
 6  144016 M        14401603                    0
 7  144020 F        14402003                    2
 8  144020 F        14402004                    2
 9  144021 F        14402103                    0
10  144025 F        14402503                    3

dfddblmv

dfddblmv2#

我猜你的“2”表示至少有一个女性。

library(dplyr)
# One row per family, coding which genders occur in it:
#   1 = males only, 2 = females only, 3 = both.
# Note: this describes the whole family, so each individual is counted too
# (a single-member male family gets 1, not 0).
count_data <- data |>
  # Only the presence of a gender in a family matters, not how many members.
  distinct(family_id, gender) |>
  mutate(type = case_when(
    gender == "M" ~ 1L,
    gender == "F" ~ 2L,
    # Missing or other gender values add nothing rather than turning the
    # family's sum into NA.
    .default = 0L
  )) |>
  summarise(class = sum(type), .by = family_id)

# Attach the family-level code to every individual.
left_join(data, count_data, by = "family_id")

#  individual_id family_id gender class
# 1             1         1      M     3
# 2             2         1      F     3
# 3             3         2      M     3
# 4             4         2      F     3
# 5             5         3      M     1
# 6             6         4      M     3
# 7             7         4      F     3
# 8             8         5      M     3
# 9             9         5      F     3

字符串

相关问题