如何根据公共列值对多个 Dataframe 进行子集划分?

vngu2lb8  于 2023-05-04  发布在  其他
关注(0)|答案(2)|浏览(108)

我在df.list中有一个 Dataframe 列表,我想根据external_gene_name列中的共享值将它们相交。

# Collect the four data frames into a single (unnamed) list.
df.list <- list(df.x3utr, df.x5utr, df.cds, df.promoter)
# Fold over the list, intersecting the running vector with each data frame's
# external_gene_name column.
# NOTE(review): the fold is seeded with names(df.list[[1]]), i.e. the COLUMN
# names of the first data frame (seqnames, start, ...), not its gene names.
# Column names and gene names share no values, so the running intersection is
# already empty after the first step and the result is character(0).
# Seeding with df.list[[1]]$external_gene_name instead would start the fold
# from actual gene names.
common.names <- Reduce(function(x, y){intersect(x, y$external_gene_name)}, df.list, init = names(df.list[[1]]))

输出:

> common.names
character(0)

输出的 df 应该只包含 df.list 中每个 Dataframe 的 adj.P.Val 列(只保留共有基因对应的行):

# Restrict every data frame to the genes shared by all of them.
# Rows are looked up through external_gene_name (the column common.names is
# built from), not through adj.P.Val, and the trailing comma makes `[` select
# rows instead of columns. match() returns the rows in the order of
# common.names, so row i refers to the same gene in all four data frames.
# If a gene occurs more than once in a data frame, only its first row is kept.
df.x3utr <- df.x3utr[
  match(common.names, df.x3utr$external_gene_name), , drop = FALSE
]
df.x5utr <- df.x5utr[
  match(common.names, df.x5utr$external_gene_name), , drop = FALSE
]
df.cds <- df.cds[
  match(common.names, df.cds$external_gene_name), , drop = FALSE
]
df.promoter <- df.promoter[
  match(common.names, df.promoter$external_gene_name), , drop = FALSE
]

# One adj.P.Val column per source data frame, one row per shared gene.
# Building a data.frame with named columns replaces rbind() + names<-, which
# would produce an unlabelled matrix with one ROW per source instead.
df <- data.frame(
  X3UTR = df.x3utr$adj.P.Val,
  X5UTR = df.x5utr$adj.P.Val,
  CDS = df.cds$adj.P.Val,
  promCore = df.promoter$adj.P.Val,
  row.names = common.names
)

数据:

> dput(df.list)
list(structure(list(seqnames = structure(c(7L, 17L, 1L, 11L, 
14L, 2L, 2L, 15L, 20L, 7L), levels = c("chr1", "chr2", "chr3", 
"chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", 
"chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", 
"chr19", "chr20", "chr21", "chr22", "chrX", "chrY"), class = "factor"), 
    start = c(122073549L, 7217125L, 1292390L, 44065925L, 23058564L, 
    65227753L, 113890063L, 58588809L, 62302093L, 151085831L), 
    end = c(122144255L, 7225266L, 1309609L, 44084237L, 23095614L, 
    65271253L, 113962596L, 58749791L, 62308862L, 151144436L), 
    width = c(70707L, 8142L, 17220L, 18313L, 37051L, 43501L, 
    72534L, 160983L, 6770L, 58606L), strand = structure(c(2L, 
    1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L), levels = c("+", "-", 
    "*"), class = "factor"), ensembl_gene_id = c("ENSG00000008311", 
    "ENSG00000072778", "ENSG00000131584", "ENSG00000110455", 
    "ENSG00000100813", "ENSG00000138071", "ENSG00000115091", 
    "ENSG00000137845", "ENSG00000130706", "ENSG00000133612"), 
    external_gene_name = c("AASS", "ACADVL", "ACAP3", "ACCS", 
    "ACIN1", "ACTR2", "ACTR3", "ADAM10", "ADRM1", "AGAP3"), adj.P.Val = c(4.6737332542265e-10, 
    1.27392687635188e-09, 2.67749562291447e-09, 4.30421108534489e-09, 
    6.18032947977852e-09, 8.5958306820173e-09, 9.28539096250232e-09, 
    1.03280085009177e-08, 2.13672442292269e-08, 2.22031576028495e-08
    ), annot.seqnames = structure(c(7L, 17L, 1L, 11L, 14L, 2L, 
    2L, 15L, 20L, 7L), levels = c("chr1", "chr2", "chr3", "chr4", 
    "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", 
    "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", 
    "chr19", "chr20", "chr21", "chr22", "chrX", "chrY", "chrM", 
    "chr1_gl000191_random", "chr1_gl000192_random", "chr4_ctg9_hap1", 
    "chr4_gl000193_random", "chr4_gl000194_random", "chr6_apd_hap1", 
    "chr6_cox_hap2", "chr6_dbb_hap3", "chr6_mann_hap4", "chr6_mcf_hap5", 
    "chr6_qbl_hap6", "chr6_ssto_hap7", "chr7_gl000195_random", 
    "chr8_gl000196_random", "chr8_gl000197_random", "chr9_gl000198_random", 
    "chr9_gl000199_random", "chr9_gl000200_random", "chr9_gl000201_random", 
    "chr11_gl000202_random", "chr17_ctg5_hap1", "chr17_gl000203_random", 
    "chr17_gl000204_random", "chr17_gl000205_random", "chr17_gl000206_random", 
    "chr18_gl000207_random", "chr19_gl000208_random", "chr19_gl000209_random", 
    "chr21_gl000210_random", "chrUn_gl000211", "chrUn_gl000212", 
    "chrUn_gl000213", "chrUn_gl000214", "chrUn_gl000215", "chrUn_gl000216", 
    "chrUn_gl000217", "chrUn_gl000218", "chrUn_gl000219", "chrUn_gl000220", 
    "chrUn_gl000221", "chrUn_gl000222", "chrUn_gl000223", "chrUn_gl000224", 
    "chrUn_gl000225", "chrUn_gl000226", "chrUn_gl000227", "chrUn_gl000228", 
    "chrUn_gl000229", "chrUn_gl000230", "chrUn_gl000231", "chrUn_gl000232", 
    "chrUn_gl000233", "chrUn_gl000234", "chrUn_gl000235", "chrUn_gl000236", 
    "chrUn_gl000237", "chrUn_gl000238", "chrUn_gl000239", "chrUn_gl000240", 
    "chrUn_gl000241", "chrUn_gl000242", "chrUn_gl000243", "chrUn_gl000244", 
    "chrUn_gl000245", "chrUn_gl000246", "chrUn_gl000247", "chrUn_gl000248", 
    "chrUn_gl000249"), class = "factor"), annot.start = c(122114435L, 
    7218951L, 1309110L, 44081471L, 23072672L, 65248281L, 113890449L, 
    58727100L, 62305446L, 151109122L)), row.names = c(296L, 678L, 
745L, 882L, 990L, 1422L, 1759L, 1833L, 2061L, 2219L), class = "data.frame"), 
    structure(list(seqnames = structure(c(12L, 3L, 17L, 1L, 11L, 
    14L, 2L, 2L, 15L, 8L), levels = c("chr1", "chr2", "chr3", 
    "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", 
    "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", 
    "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY"
    ), class = "factor"), start = c(53307456L, 51971426L, 7217125L, 
    1292390L, 44065925L, 23058564L, 65227753L, 113890063L, 58588809L, 
    38996869L), end = c(53324864L, 51981199L, 7225266L, 1309609L, 
    44084237L, 23095614L, 65271253L, 113962596L, 58749791L, 39105261L
    ), width = c(17409L, 9774L, 8142L, 17220L, 18313L, 37051L, 
    43501L, 72534L, 160983L, 108393L), strand = structure(c(2L, 
    1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L), levels = c("+", "-", 
    "*"), class = "factor"), ensembl_gene_id = c("ENSG00000094914", 
    "ENSG00000248487", "ENSG00000072778", "ENSG00000131584", 
    "ENSG00000110455", "ENSG00000100813", "ENSG00000138071", 
    "ENSG00000115091", "ENSG00000137845", "ENSG00000168615"), 
        external_gene_name = c("AAAS", "ABHD14A", "ACADVL", "ACAP3", 
        "ACCS", "ACIN1", "ACTR2", "ACTR3", "ADAM10", "ADAM9"), 
        adj.P.Val = c(1.83731245162161e-12, 1.01276101635279e-09, 
        1.27392687635188e-09, 2.67749562291447e-09, 4.30421108534489e-09, 
        6.18032947977852e-09, 8.5958306820173e-09, 9.28539096250232e-09, 
        1.03280085009177e-08, 1.31456095388164e-08), annot.seqnames = structure(c(12L, 
        3L, 17L, 1L, 11L, 14L, 2L, 2L, 15L, 8L), levels = c("chr1", 
        "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", 
        "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", 
        "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", 
        "chr21", "chr22", "chrX", "chrY", "chrM", "chr1_gl000191_random", 
        "chr1_gl000192_random", "chr4_ctg9_hap1", "chr4_gl000193_random", 
        "chr4_gl000194_random", "chr6_apd_hap1", "chr6_cox_hap2", 
        "chr6_dbb_hap3", "chr6_mann_hap4", "chr6_mcf_hap5", "chr6_qbl_hap6", 
        "chr6_ssto_hap7", "chr7_gl000195_random", "chr8_gl000196_random", 
        "chr8_gl000197_random", "chr9_gl000198_random", "chr9_gl000199_random", 
        "chr9_gl000200_random", "chr9_gl000201_random", "chr11_gl000202_random", 
        "chr17_ctg5_hap1", "chr17_gl000203_random", "chr17_gl000204_random", 
        "chr17_gl000205_random", "chr17_gl000206_random", "chr18_gl000207_random", 
        "chr19_gl000208_random", "chr19_gl000209_random", "chr21_gl000210_random", 
        "chrUn_gl000211", "chrUn_gl000212", "chrUn_gl000213", 
        "chrUn_gl000214", "chrUn_gl000215", "chrUn_gl000216", 
        "chrUn_gl000217", "chrUn_gl000218", "chrUn_gl000219", 
        "chrUn_gl000220", "chrUn_gl000221", "chrUn_gl000222", 
        "chrUn_gl000223", "chrUn_gl000224", "chrUn_gl000225", 
        "chrUn_gl000226", "chrUn_gl000227", "chrUn_gl000228", 
        "chrUn_gl000229", "chrUn_gl000230", "chrUn_gl000231", 
        "chrUn_gl000232", "chrUn_gl000233", "chrUn_gl000234", 
        "chrUn_gl000235", "chrUn_gl000236", "chrUn_gl000237", 
        "chrUn_gl000238", "chrUn_gl000239", "chrUn_gl000240", 
        "chrUn_gl000241", "chrUn_gl000242", "chrUn_gl000243", 
        "chrUn_gl000244", "chrUn_gl000245", "chrUn_gl000246", 
        "chrUn_gl000247", "chrUn_gl000248", "chrUn_gl000249"), class = "factor"), 
        annot.start = c(53320234L, 51976361L, 7218571L, 1293885L, 
        44069531L, 23067147L, 65228582L, 113931560L, 58702775L, 
        39078454L)), row.names = c(4L, 333L, 462L, 709L, 799L, 
    926L, 1358L, 1516L, 1805L, 1878L), class = "data.frame"), 
    structure(list(seqnames = structure(c(12L, 17L, 7L, 3L, 17L, 
    1L, 11L, 16L, 14L, 20L), levels = c("chr1", "chr2", "chr3", 
    "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", 
    "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", 
    "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY"
    ), class = "factor"), start = c(53307456L, 42950526L, 122073549L, 
    51971426L, 7217125L, 1292390L, 44065925L, 67657512L, 23058564L, 
    45841721L), end = c(53324864L, 42964498L, 122144255L, 51981199L, 
    7225266L, 1309609L, 44084237L, 67660815L, 23095614L, 45857405L
    ), width = c(17409L, 13973L, 70707L, 9774L, 8142L, 17220L, 
    18313L, 3304L, 37051L, 15685L), strand = structure(c(2L, 
    2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L), levels = c("+", "-", 
    "*"), class = "factor"), ensembl_gene_id = c("ENSG00000094914", 
    "ENSG00000266967", "ENSG00000008311", "ENSG00000248487", 
    "ENSG00000072778", "ENSG00000131584", "ENSG00000110455", 
    "ENSG00000102977", "ENSG00000100813", "ENSG00000101473"), 
        external_gene_name = c("AAAS", "AARSD1", "AASS", "ABHD14A", 
        "ACADVL", "ACAP3", "ACCS", "ACD", "ACIN1", "ACOT8"), 
        adj.P.Val = c(1.83731245162161e-12, 2.64248727238285e-11, 
        4.6737332542265e-10, 1.01276101635279e-09, 1.27392687635188e-09, 
        2.67749562291447e-09, 4.30421108534489e-09, 4.6721922893073e-09, 
        6.18032947977852e-09, 6.2518699283833e-09), annot.seqnames = structure(c(12L, 
        17L, 7L, 3L, 17L, 1L, 11L, 16L, 14L, 20L), levels = c("chr1", 
        "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", 
        "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", 
        "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", 
        "chr21", "chr22", "chrX", "chrY", "chrM", "chr1_gl000191_random", 
        "chr1_gl000192_random", "chr4_ctg9_hap1", "chr4_gl000193_random", 
        "chr4_gl000194_random", "chr6_apd_hap1", "chr6_cox_hap2", 
        "chr6_dbb_hap3", "chr6_mann_hap4", "chr6_mcf_hap5", "chr6_qbl_hap6", 
        "chr6_ssto_hap7", "chr7_gl000195_random", "chr8_gl000196_random", 
        "chr8_gl000197_random", "chr9_gl000198_random", "chr9_gl000199_random", 
        "chr9_gl000200_random", "chr9_gl000201_random", "chr11_gl000202_random", 
        "chr17_ctg5_hap1", "chr17_gl000203_random", "chr17_gl000204_random", 
        "chr17_gl000205_random", "chr17_gl000206_random", "chr18_gl000207_random", 
        "chr19_gl000208_random", "chr19_gl000209_random", "chr21_gl000210_random", 
        "chrUn_gl000211", "chrUn_gl000212", "chrUn_gl000213", 
        "chrUn_gl000214", "chrUn_gl000215", "chrUn_gl000216", 
        "chrUn_gl000217", "chrUn_gl000218", "chrUn_gl000219", 
        "chrUn_gl000220", "chrUn_gl000221", "chrUn_gl000222", 
        "chrUn_gl000223", "chrUn_gl000224", "chrUn_gl000225", 
        "chrUn_gl000226", "chrUn_gl000227", "chrUn_gl000228", 
        "chrUn_gl000229", "chrUn_gl000230", "chrUn_gl000231", 
        "chrUn_gl000232", "chrUn_gl000233", "chrUn_gl000234", 
        "chrUn_gl000235", "chrUn_gl000236", "chrUn_gl000237", 
        "chrUn_gl000238", "chrUn_gl000239", "chrUn_gl000240", 
        "chrUn_gl000241", "chrUn_gl000242", "chrUn_gl000243", 
        "chrUn_gl000244", "chrUn_gl000245", "chrUn_gl000246", 
        "chrUn_gl000247", "chrUn_gl000248", "chrUn_gl000249"), class = "factor"), 
        annot.start = c(53320196L, 42963953L, 122131366L, 51977372L, 
        7217807L, 1293836L, 44069587L, 67660458L, 23067216L, 
        45855946L)), row.names = c(3L, 11L, 144L, 311L, 420L, 
    696L, 772L, 888L, 909L, 1003L), class = "data.frame"), structure(list(
        seqnames = structure(c(12L, 3L, 17L, 1L, 17L, 11L, 14L, 
        2L, 15L, 8L), levels = c("chr1", "chr2", "chr3", "chr4", 
        "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", 
        "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", 
        "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", 
        "chrY"), class = "factor"), start = c(53307456L, 51971426L, 
        7217125L, 1292390L, 45132600L, 44065925L, 23058564L, 
        113890063L, 58588809L, 38996869L), end = c(53324864L, 
        51981199L, 7225266L, 1309609L, 45144181L, 44084237L, 
        23095614L, 113962596L, 58749791L, 39105261L), width = c(17409L, 
        9774L, 8142L, 17220L, 11582L, 18313L, 37051L, 72534L, 
        160983L, 108393L), strand = structure(c(2L, 1L, 1L, 2L, 
        1L, 1L, 2L, 1L, 2L, 1L), levels = c("+", "-", "*"), class = "factor"), 
        ensembl_gene_id = c("ENSG00000094914", "ENSG00000248487", 
        "ENSG00000072778", "ENSG00000131584", "ENSG00000181513", 
        "ENSG00000110455", "ENSG00000100813", "ENSG00000115091", 
        "ENSG00000137845", "ENSG00000168615"), external_gene_name = c("AAAS", 
        "ABHD14A", "ACADVL", "ACAP3", "ACBD4", "ACCS", "ACIN1", 
        "ACTR3", "ADAM10", "ADAM9"), adj.P.Val = c(1.83731245162161e-12, 
        1.01276101635279e-09, 1.27392687635188e-09, 2.67749562291447e-09, 
        2.90826962635755e-09, 4.30421108534489e-09, 6.18032947977852e-09, 
        9.28539096250232e-09, 1.03280085009177e-08, 1.31456095388164e-08
        ), annot.seqnames = structure(c(12L, 3L, 17L, 1L, 17L, 
        11L, 14L, 2L, 15L, 8L), levels = c("chr1", "chr2", "chr3", 
        "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", 
        "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", 
        "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", 
        "chrX", "chrY", "chrM", "chr1_gl000191_random", "chr1_gl000192_random", 
        "chr4_ctg9_hap1", "chr4_gl000193_random", "chr4_gl000194_random", 
        "chr6_apd_hap1", "chr6_cox_hap2", "chr6_dbb_hap3", "chr6_mann_hap4", 
        "chr6_mcf_hap5", "chr6_qbl_hap6", "chr6_ssto_hap7", "chr7_gl000195_random", 
        "chr8_gl000196_random", "chr8_gl000197_random", "chr9_gl000198_random", 
        "chr9_gl000199_random", "chr9_gl000200_random", "chr9_gl000201_random", 
        "chr11_gl000202_random", "chr17_ctg5_hap1", "chr17_gl000203_random", 
        "chr17_gl000204_random", "chr17_gl000205_random", "chr17_gl000206_random", 
        "chr18_gl000207_random", "chr19_gl000208_random", "chr19_gl000209_random", 
        "chr21_gl000210_random", "chrUn_gl000211", "chrUn_gl000212", 
        "chrUn_gl000213", "chrUn_gl000214", "chrUn_gl000215", 
        "chrUn_gl000216", "chrUn_gl000217", "chrUn_gl000218", 
        "chrUn_gl000219", "chrUn_gl000220", "chrUn_gl000221", 
        "chrUn_gl000222", "chrUn_gl000223", "chrUn_gl000224", 
        "chrUn_gl000225", "chrUn_gl000226", "chrUn_gl000227", 
        "chrUn_gl000228", "chrUn_gl000229", "chrUn_gl000230", 
        "chrUn_gl000231", "chrUn_gl000232", "chrUn_gl000233", 
        "chrUn_gl000234", "chrUn_gl000235", "chrUn_gl000236", 
        "chrUn_gl000237", "chrUn_gl000238", "chrUn_gl000239", 
        "chrUn_gl000240", "chrUn_gl000241", "chrUn_gl000242", 
        "chrUn_gl000243", "chrUn_gl000244", "chrUn_gl000245", 
        "chrUn_gl000246", "chrUn_gl000247", "chrUn_gl000248", 
        "chrUn_gl000249"), class = "factor"), annot.start = c(53320254L, 
        51975361L, 7218648L, 1293916L, 45137516L, 44068531L, 
        23066147L, 113930560L, 58701775L, 39077454L)), row.names = c(1L, 
    305L, 412L, 686L, 756L, 764L, 899L, 1427L, 1790L, 1836L), class = "data.frame"))
d4so4syb

d4so4syb1#

在 base R 中,我们先用 lapply 遍历 list,通过 `[[` 提取每个 Dataframe 的 external_gene_name 列,再用 Reduce 对提取出的各列依次应用 intersect,得到 common.names。然后用 subset 按 common.names 筛选每个数据集,最后用 Reduce 配合 merge(by = 'external_gene_name')把列表合并成单个 data.frame。

# Gene names present in every data frame of df.list: pull the
# external_gene_name column out of each element, then intersect them pairwise.
common.names <- Reduce(
  intersect,
  lapply(df.list, function(d) d[["external_gene_name"]])
)
# Keep only the shared genes and the two columns of interest in each data
# frame, then join the pieces one after another on external_gene_name.
# The result has one adj.P.Val column per input; merge() labels them with its
# default .x/.y suffixes.
Reduce(
  function(left, right) merge(left, right, by = "external_gene_name"),
  lapply(df.list, function(d) {
    subset(
      d,
      external_gene_name %in% common.names,
      select = c(external_gene_name, adj.P.Val)
    )
  })
)
  • 输出
external_gene_name  adj.P.Val.x  adj.P.Val.y  adj.P.Val.x  adj.P.Val.y
1             ACADVL 1.273927e-09 1.273927e-09 1.273927e-09 1.273927e-09
2              ACAP3 2.677496e-09 2.677496e-09 2.677496e-09 2.677496e-09
3               ACCS 4.304211e-09 4.304211e-09 4.304211e-09 4.304211e-09
4              ACIN1 6.180329e-09 6.180329e-09 6.180329e-09 6.180329e-09
dced5bon

dced5bon2#

一种方法:

library(dplyr)

# For every data frame keep the gene name and its adjusted p-value, chain
# left joins on the gene name across the list, and finally drop every gene
# that is missing an adj.P.Val in at least one of the joined inputs.
df.list |>
  lapply(function(d) select(d, external_gene_name, adj.P.Val)) |>
  Reduce(
    f = function(acc, nxt) left_join(acc, nxt, by = "external_gene_name")
  ) |>
  filter(if_all(starts_with("adj.P.Val"), \(p) !is.na(p)))
external_gene_name  adj.P.Val.x  adj.P.Val.y adj.P.Val.x.x adj.P.Val.y.y
1             ACADVL 1.273927e-09 1.273927e-09  1.273927e-09  1.273927e-09
2              ACAP3 2.677496e-09 2.677496e-09  2.677496e-09  2.677496e-09
3               ACCS 4.304211e-09 4.304211e-09  4.304211e-09  4.304211e-09
4              ACIN1 6.180329e-09 6.180329e-09  6.180329e-09  6.180329e-09

相关问题