高级 Dataframe 合并:忽略NA

5f0d552i  于 2023-04-18  发布在  其他
关注(0)|答案(2)|浏览(125)

Context

我的问题与这两个类似:

但是,如果两行之间存在“冲突”,我希望保留多行。(这些问题也是在合并 Dataframe 之后开始的。对于我的问题,可以提供修改合并过程的答案。我可以保证,每个 Dataframe 的ID都是唯一的。当然,也可以提供更通用的解决方案,在合并之后开始。)

问题

我想合并两个 Dataframe ,但忽略包含的NA。

示例

鉴于

> my_data1
  ID m n    o    p q
1  A K U <NA> <NA> K
2  B L V    Q    A L
3  C M W <NA>    S M
4  D N X    W    D N
5  E O Y <NA> <NA> O

> my_data2
  ID m n    o    p r
1  A K U    E <NA> U
2  B L V <NA>    B V
3  C M W    R <NA> W
4  D N X <NA> <NA> X
5  E O Y    T    F Y

合并它们返回

> my_data
   ID m n    o    p    q    r
1   A K U    E <NA> <NA>    U
2   A K U <NA> <NA>    K <NA>
3   B L V    Q    A    L <NA>
4   B L V <NA>    B <NA>    V
5   C M W    R <NA> <NA>    W
6   C M W <NA>    S    M <NA>
7   D N X    W    D    N <NA>
8   D N X <NA> <NA> <NA>    X
9   E O Y    T    F <NA>    Y
10  E O Y <NA> <NA>    O <NA>

但我想得到的是

> my_data
   ID m n    o    p    q    r
1   A K U    E <NA>    K    U
2   B L V    Q    A    L <NA>
3   B L V <NA>    B <NA>    V
4   C M W    R    S    M    W
5   D N X    W    D    N    X
6   E O Y    T    F    O    Y
  • 由于列p,无法合并ID == B
  • ID == A非常特殊,因为列p中有两个NA。
  • IDs C、D和E相对简单,可以通过下面的代码解决。

代码

my_data1 <- data.frame(ID = LETTERS[1:5],
                       m = LETTERS[11:15],
                       n = LETTERS[21:25],
                       o = c(NA, 'Q', NA, 'W', NA),
                       p = c(NA, 'A', 'S', 'D', NA),
                       q = LETTERS[11:15])

my_data2 <- data.frame(ID = LETTERS[1:5],
                       m = LETTERS[11:15],
                       n = LETTERS[21:25],
                       o = c('E', NA, 'R', NA, 'T'),
                       p = c(NA, 'B', NA, NA, 'F'),
                       r = LETTERS[21:25])

my_data <- merge(my_data1, my_data2, all = TRUE)
# or
my_data <- bind_rows(my_data1, my_data2) |>
 arrange(ID)

# Solution (incomplete)

my_data <- my_data |>
 filter(ID != 'A', ID != 'B') |> # Error without this line
 select(-m, -n) |> # Error without this line
 group_by(ID) |>
 summarise(across(.cols = everything(),
                  .fns = ~ na.omit(.x)))

my_data |>
 view()
w9apscun

w9apscun1#

我认为你需要一个bind_rows,然后重新排列列,将非NA移动到顶部(对于每个组),然后过滤以保留最后四列中具有一些非NA的行。

library(dplyr)
bind_rows(my_data1, my_data2) %>%
  arrange(across(everything())) %>%
  group_by(ID) %>%
  mutate(across(everything(), ~ .[order(is.na(.))])) %>%
  ungroup() %>%
  filter(rowSums(!is.na(cur_data()[c("o","p","q","r")])) > 0)
# # A tibble: 6 × 7
#   ID    m     n     o     p     q     r    
#   <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 A     K     U     E     <NA>  K     U    
# 2 B     L     V     Q     A     L     V    
# 3 B     L     V     <NA>  B     <NA>  <NA> 
# 4 C     M     W     R     S     M     W    
# 5 D     N     X     W     D     N     X    
# 6 E     O     Y     T     F     O     Y
ukxgm1gy

ukxgm1gy2#

最后,我们可能不想以重复的ID结束,我们可以考虑一个打破平局的规则,或者如果不同意,设置为NA,我使用后者。
我们可以使用pmaxna.rm=TRUE来比较列。如果两者都不是NA并且存在分歧,则结果为NA
(* 注意:* 我将my_data1[4, 2]设置为不同的值以进行演示)

f <- \(data1, data2, id='ID', verbose=TRUE) {
  stopifnot(identical(data1[[id]], data2[[id]]))
  cn <- intersect(names(data1), names(data2))
  m1 <- lapply(setNames(cn, cn), \(x, ...) {
    u <- data1[[x]]; v <- data2[[x]]
    pm <- pmax(u, v, na.rm=TRUE)
    w <- which(u != v)
    if (verbose && length(w) > 0) {
      message(sprintf('Disagreement row %s, column "%s", set to NA', w, x))
    }
    pm[w] <- NA
    pm
  }) |> as.data.frame()
  idn <- which(cn == id)
  m2 <- merge(data1[setdiff(names(data1), cn[-idn])], 
              data2[setdiff(names(data2), cn[-idn])])
  merge(m1, m2)
}

f(data1=my_data1, data2=my_data2, id='ID', verbose=TRUE)
# Disagreement row 4, column "m", set to NA
# Disagreement row 2, column "p", set to NA
#   ID    m n o    p q r
# 1  A    K U E <NA> K U
# 2  B    L V Q <NA> L V
# 3  C    M W R    S M W
# 4  D <NA> X W    D N X
# 5  E    O Y T    F O Y
  • 数据:*
my_data1 <- structure(list(ID = c("A", "B", "C", "D", "E"), m = c("K", "L", 
"M", "L", "O"), n = c("U", "V", "W", "X", "Y"), o = c(NA, "Q", 
NA, "W", NA), p = c(NA, "A", "S", "D", NA), q = c("K", "L", "M", 
"N", "O")), class = "data.frame", row.names = c("1", "2", "3", 
"4", "5"))

my_data2 <- structure(list(ID = c("A", "B", "C", "D", "E"), m = c("K", "L", 
"M", "N", "O"), n = c("U", "V", "W", "X", "Y"), o = c("E", NA, 
"R", NA, "T"), p = c(NA, "B", NA, NA, "F"), r = c("U", "V", "W", 
"X", "Y")), class = "data.frame", row.names = c("1", "2", "3", 
"4", "5"))

相关问题