将 Dataframe 中的子字符串与另一个 Dataframe 进行匹配

rvpgvaaj  于 2023-04-27  发布在  其他
关注(0)|答案(2)|浏览(142)

rst.mva.one.leave.out["cpg_ids"]列包含可能与methreg$probeID对应的cg*子字符串。对于所有匹配项,我希望检索rst.mva.one.leave.out中相应的行并将其存储为rst.subset

# Attempted approach; it fails with the "non-character argument" error shown
# in the traceback below.
# `rst.mva.one.leave.out["cpg_ids"]` uses single-bracket indexing, so it is a
# one-column data.frame rather than a character vector; the trailing `[i]`
# then selects column i of that data.frame, and strsplit() is handed a
# data.frame instead of strings.
# NOTE(review): `rst.subset` is reassigned on every iteration, so even with a
# character input only the last iteration's result would be kept.
for (i in 1:nrow(rst.mva.one.leave.out)) {
  rst.subset <- rst.mva.one.leave.out[strsplit(rst.mva.one.leave.out["cpg_ids"][i], ",", fixed=T) %in% methreg$probeID,]
}

回溯:

Error in h(simpleError(msg, call)) : 
  error in evaluating the argument 'x' in selecting a method for function '%in%': non-character argument

数据:
dput(rst.mva.one.leave.out[1:20,2:8])

structure(list(sample = c("TCGA.A3.3357.01", "TCGA.A3.3357.01", 
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3357.01", 
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3357.01", 
"TCGA.A3.3357.01", "TCGA.A3.3357.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01", 
"TCGA.A3.3367.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01", "TCGA.A3.3367.01", 
"TCGA.A3.3367.01", "TCGA.A3.3367.01"), chromosome = c("7", "3", 
"7", "6", "8", "8", "14", "16", "1", "19", "16", "6", "1", "17", 
"16", "5", "7", "5", "6", "3"), start = c(25989524L, 50275446L, 
134142981L, 146348486L, 116679763L, 145747140L, 37641261L, 73125573L, 
27874258L, 37407041L, 57770390L, 43043691L, 46769035L, 26925852L, 
2652948L, 43039592L, 79082898L, 131563095L, 26195697L, 36985996L
), end = c(25989763L, 50275694L, 134144288L, 146348913L, 116680346L, 
145748059L, 37641600L, 73126120L, 27874526L, 37407284L, 57771195L, 
43043815L, 46769320L, 26926511L, 2653839L, 43040885L, 79083753L, 
131563764L, 26197071L, 36986642L), sz = c(239L, 248L, 1307L, 
427L, 583L, 919L, 339L, 547L, 268L, 243L, 805L, 124L, 285L, 659L, 
891L, 1293L, 855L, 669L, 1374L, 646L), cpg_n = c(3, 3, 4, 3, 
3, 4, 3, 3, 3, 3, 3, 3, 5, 6, 6, 6, 6, 3, 5, 6), cpg_ids = c("cg08767938,cg07184013,cg03853208", 
"cg06508783,cg00813746,cg05060704", "cg00174851,cg06864853,cg02215070,cg04663564", 
"cg06121352,cg09179079,cg03478199", "cg06368590,cg03615269,cg05998283", 
"cg03949978,cg01099220,cg02479773,cg08872590", "cg01168865,cg07535928,cg03611555", 
"cg08992827,cg08187089,cg00293191", "cg00178877,cg05590948,cg09060489", 
"cg03584288,cg02370417,cg02459604", "cg00816177,cg03029127,cg01656750", 
"cg02057561,cg08747889,cg00252032", "cg02537149,cg03979582,cg04410181,cg06961071,cg00122254", 
"cg01626899,cg00449941,cg05564086,cg06774283,cg01724566,cg06329022", 
"cg03846022,cg08981282,cg02512202,cg01195053,cg03314158,cg00433159", 
"cg04122815,cg08205639,cg05551979,cg03723510,cg01313313,cg00257271", 
"cg04671932,cg04652097,cg08641579,cg02523844,cg05270344,cg07448060", 
"cg01211041,cg09140281,cg05501285", "cg08117800,cg02612650,cg03181300,cg03785755,cg02902477", 
"cg06686826,cg06163735,cg00325599,cg01756288,cg03450370,cg07696485"
)), row.names = c(NA, 20L), class = "data.frame")

dput(methreg[,1:3])

structure(list(regionID = c("chr7:87152539-87152540", "chr19:51905083-51905084", 
"chr19:36687587-36687588", "chr12:53985031-53985032", "chr8:85177989-85177990", 
"chr1:40161274-40161275", "chr12:51083289-51083290", "chr7:87152539-87152540", 
"chr19:55581239-55581240", "chr19:49927656-49927657", "chr19:55581239-55581240", 
"chr18:75212075-75212076", "chr7:99505269-99505270", "chr20:2692357-2692358", 
"chr2:173964147-173964148", "chr5:179024077-179024078", "chr19:52369920-52369921"
), probeID = c("cg08767938", "cg04425820", "cg09307868", "cg08737755", 
"cg04950789", "cg08707192", "cg04396637", "cg07560681", "cg06560912", 
"cg07469215", "cg06560912", "cg05448504", "cg00155700", "cg03227128", 
"cg07040405", "cg04681963", "cg03005603"), target_symbol = c("DMTF1", 
"ZNF649", "ZNF567", "HOXC10", "E2F5", "RLF", "CSRNP2", "DMTF1", 
"ZNF579", "ATF5", "ZNF579", "TSHZ1", "ZNF394", "EBF4", "SP3", 
"ZNF879", "ZNF880")), class = "data.frame", row.names = c("14", 
"87", "78", "43", "23", "1", "40", "13", "94", "83", "92", "62", 
"15", "53", "4", "131", "51"))

预期输出:

structure(list(sample = "TCGA.A3.3357.01", chromosome = "7", 
    start = 25989524L, end = 25989763L, sz = 239L, cpg_n = 3, 
    cpg_ids = "cg08767938,cg07184013,cg03853208"), row.names = 1L, class = "data.frame")
nfs0ujit

nfs0ujit1#

# Keep every row of rst.mva.one.leave.out whose comma-separated cpg_ids
# contain at least one probe ID listed in methreg$probeID.
rst.subset <- rst.mva.one.leave.out[
  # vapply() always returns a logical vector (logical(0) for zero rows);
  # sapply() would return an empty list there, which is not a valid row index.
  vapply(
    # fixed = TRUE: the comma is a literal separator, not a regular expression.
    strsplit(rst.mva.one.leave.out$cpg_ids, ",", fixed = TRUE),
    function(ids) any(ids %in% methreg$probeID),
    logical(1)
  ),
]

rst.subset

#           sample chromosome    start      end  sz cpg_n                          cpg_ids
#1 TCGA.A3.3357.01          7 25989524 25989763 239     3 cg08767938,cg07184013,cg03853208

这会按逗号拆分rst.mva.one.leave.out$cpg_ids中的每个字符串;只要拆分得到的任一值与methreg$probeID中的某个值匹配,就选中该行。
我们可以使用的另一种方法是使用grepl的模式匹配,前提是methreg$probeID不是很大。感谢@Gwang-Jin Kim对正则表达式的改进。

# Alternative: match the probe IDs against the raw cpg_ids strings with a
# single regular expression. Each ID is wrapped in \b word boundaries so only
# whole IDs match, and the IDs are joined with "|" into one alternation.
probe_ids <- unique(methreg$probeID)  # duplicates add nothing to the pattern
probe_pattern <- paste0("\\b", probe_ids, "\\b", collapse = "|")
rst.mva.one.leave.out[
  # With zero probe IDs paste0() still returns "\\b\\b", which would match
  # every row; the length guard makes an empty probe list select no rows.
  length(probe_ids) > 0 &
    grepl(probe_pattern, rst.mva.one.leave.out$cpg_ids),
]
vuktfyat

vuktfyat2#

可以使用fuzzyjoin

# Attach the fuzzyjoin package, which supplies regex_left_join().
library(fuzzyjoin)
# NOTE(review): df_2 / df_1 and the DF_2 / DF_1 column names are placeholders
# that are not defined anywhere in this thread; presumably they stand for the
# question's data frames and their ID columns. Confirm which table holds the
# strings and which holds the patterns before use.
# NOTE(review): a left join presumably keeps unmatched rows as well, so an
# extra filtering step may be needed to get only the matching rows. TODO confirm.
regex_left_join(df_2, df_1, by = c("DF_2"= "DF_1"))

相关问题