根据与另一个 Dataframe 中出现次数的匹配,重复行n次

7bsow1i6  于 2023-06-19  发布在  其他
关注(0)|答案(1)|浏览(109)

`meth` 和 `exp` 的行名分别与 `distal.dmr.cpgs` 的 `probeID` 列和 `target_symbol` 列匹配。我想根据 `distal.dmr.cpgs` 中相同的 (`probeID`, `target_symbol`) 组合出现的次数,来复制 `meth` 和 `exp` 中对应的行。

# Goal: duplicate rows of `meth` and `exp` n times, where n is the number of
# occurrences of each (`probeID`, `target_symbol`) pair in `distal.dmr.cpgs`.
# `probeID` matches `rownames(meth)`; `target_symbol` matches `rownames(exp)`.

# One row per distinct (probeID, target_symbol) pair; the count is in column `n`.
n.occur <- distal.dmr.cpgs %>% count(probeID, target_symbol)
# NOTE: `==` compares the two vectors element by element and returns a logical
# vector, but `if` needs a single TRUE/FALSE -- this is the line that raises
# the "the condition has length > 1" error quoted below.
if(n.occur$probeID == rownames(meth)) {
  # NOTE: each iteration repeats *every* row of `meth` and `exp` `n` times
  # (not only the rows of the matching pair), and the effect compounds across
  # iterations because `meth` and `exp` are overwritten each time.
  for (n in n.occur$n){
    meth <- meth[rep(seq_len(nrow(meth)), each=n), ]
    exp <- exp[rep(seq_len(nrow(exp)), each=n), ]
  }
}

错误信息:

Error in if (n.occur$probeID == rownames(meth)) { : 
  the condition has length > 1

输入:

> dput(meth[1:5,1:5])
structure(c(0.965846994535519, 0.574436129165911, 0.8722836745901, 
0.101103884990587, 0.408252566147303, 0.945927883796075, 0.480092866016333, 
0.853431761902537, 0.0755436985006079, 0.448047437384109, 0.95713326619671, 
0.8891500945397, 0.926905435737289, 0.203369962221719, 0.644251635185525, 
0.961321856037767, 0.501968996835947, 0.865640773821373, 0.11543134872418, 
0.472688968983197, 0.941828841874757, 0.484359313077939, 0.9011216776396, 
0.155463949005687, 0.566371097999143), dim = c(5L, 5L), dimnames = list(
    c("cg03477043", "cg00926926", "cg00488747", "cg04452095", 
    "cg04658243"), c("TCGA.Y8.A8S1.01", "TCGA.Y8.A8S0.01", "TCGA.Y8.A8RZ.01", 
    "TCGA.Y8.A8RY.01", "TCGA.Y8.A897.01")))

> dput(exp[1:5,1:5])
structure(c(8.8764930371619, 7.10418439636105, 5.82248600796462, 
9.6336088827523, 7.12980348328146, 8.50488041264077, 7.4053080406626, 
6.10447768176044, 9.82618201799457, 7.25526533976169, 8.91383556515162, 
7.69822939472376, 6.293732617918, 10.0591928112391, 7.25510044887476, 
8.05139916296298, 7.19502387866043, 6.05119750390752, 9.29733149115023, 
7.55517805926009, 9.13478579142527, 7.38961451449646, 6.09188514895038, 
9.67121455892562, 7.28978856899351), dim = c(5L, 5L), dimnames = list(
    c("SRRM2", "ANKRD11", "RPTOR", "HSP90AA1", "RER1"), c("TCGA.2K.A9WE.01", 
    "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", "TCGA.2Z.A9J5.01", 
    "TCGA.2Z.A9J6.01")))

> dput(distal.dmr.cpgs[,c("probeID","target_symbol")][1:5,])
structure(list(probeID = c("cg03477043", "cg03477043", "cg00926926", 
"cg00926926", "cg00926926"), target_symbol = c("SRRM2", "SRRM2", 
"ANKRD11", "ANKRD11", "ANKRD11")), row.names = c(785L, 786L, 
866L, 867L, 868L), class = "data.frame")

预期输出:
`cg03477043` 和 `SRRM2` 对应的行将重复两次(分别在 `meth` 和 `exp` 中),而 `cg00926926` 和 `ANKRD11` 对应的行将重复三次。

b4lqfgs4

b4lqfgs41#

将 `meth` 和 `exp` 转换为数据框,并把行名保存为一列;按列将它们绑定为单个数据框;再与 `distal.dmr.cpgs` 连接以复制行。然后,您可以将结果拆分为两个单独的数据框,或者保留为单个数据框,后者可能更有用。

library(dplyr)
library(tidyr)

# Move the row names into explicit key columns so they can be used in a join:
# `probeID` for the methylation matrix, `target_symbol` for the expression one.
meth <- as_tibble(meth, rownames = "probeID")
exp <- as_tibble(exp, rownames = "target_symbol")

# bind_cols() pairs rows by position, so row i of `meth` must correspond to
# row i of `exp`. Joining on both keys then yields one output row per matching
# (probeID, target_symbol) row of `distal.dmr.cpgs`; pairs without a match are
# still kept once because this is a left join.
# The join keys are spelled out so the result does not depend on which other
# columns happen to share a name, and no "Joining with" message is printed.
# NOTE(review): if `meth` and `exp` share sample column names, bind_cols() is
# expected to rename the duplicates, and the select() calls below would then
# not find them -- confirm against the full data.
methexp <- bind_cols(meth, exp) %>%
  left_join(distal.dmr.cpgs, by = c("probeID", "target_symbol"))

# Split the combined table back into its two parts, using each table's own
# column names (key column included).
meth <- select(methexp, all_of(names(meth)))
exp <- select(methexp, all_of(names(exp)))

结果如下:

#>meth
# A tibble: 8 × 6
  probeID    TCGA.Y8.A8S1.01 TCGA.Y8.A8S0.01 TCGA.Y8.A8RZ.01 TCGA.Y8.A8RY.01
  <chr>                <dbl>           <dbl>           <dbl>           <dbl>
1 cg03477043           0.966          0.946            0.957           0.961
2 cg03477043           0.966          0.946            0.957           0.961
3 cg00926926           0.574          0.480            0.889           0.502
4 cg00926926           0.574          0.480            0.889           0.502
5 cg00926926           0.574          0.480            0.889           0.502
6 cg00488747           0.872          0.853            0.927           0.866
7 cg04452095           0.101          0.0755           0.203           0.115
8 cg04658243           0.408          0.448            0.644           0.473
# ℹ 1 more variable: TCGA.Y8.A897.01 <dbl>

#> exp
# A tibble: 8 × 6
  target_symbol TCGA.2K.A9WE.01 TCGA.2Z.A9J1.01 TCGA.2Z.A9J3.01 TCGA.2Z.A9J5.01
  <chr>                   <dbl>           <dbl>           <dbl>           <dbl>
1 SRRM2                    8.88            8.50            8.91            8.05
2 SRRM2                    8.88            8.50            8.91            8.05
3 ANKRD11                  7.10            7.41            7.70            7.20
4 ANKRD11                  7.10            7.41            7.70            7.20
5 ANKRD11                  7.10            7.41            7.70            7.20
6 RPTOR                    5.82            6.10            6.29            6.05
7 HSP90AA1                 9.63            9.83           10.1             9.30
8 RER1                     7.13            7.26            7.26            7.56
# ℹ 1 more variable: TCGA.2Z.A9J6.01 <dbl>

注意,这里假设您希望保留在 `distal.dmr.cpgs` 中没有匹配项的行;如果不需要保留,请改用 `inner_join()`。

相关问题