R语言 按组和条件筛选列

cpjpxq1n  于 2023-04-18  发布在  其他
关注(0)|答案(5)|浏览(160)

我有一个简单的任务,但还是想不出来,我有一个csv二元矩阵,基因为行,样本为列,其中1表示基因存在,0表示基因不存在,样本属于不同的聚类,如:cluster1 = c(sampleA,sampleB,sampleC,sampleD)cluster2 = c(sampleE,sampleF,sampleG)
| 基因|样品A|样品B|样品C|样品D|样品E|样品F|样品G|
| --------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|
| 基因1|1|0|0|1|0|0|0|
| 基因2|0|0|0|0|1|1|0|
| 基因3|0|0|0|0|0|0|1|
| 基因4|0|1|0|0|0|0|0|
| 基因5|1|1|1|1|0|0|0|
| 基因6|1|1|1|1|0|0|0|
| 基因7|0|0|0|1|1|0|0|
| 基因8|0|0|0|0|0|0|0|
| 基因9|1|0|0|1|0|0|0|
| 基因10|0|0|0|0|0|0|0|
| 基因11|1|0|0|1|0|0|0|
| 基因12|0|0|0|0|1|1|1|
| 基因13|0|0|0|0|1|1|1|
| 基因14|0|0|0|0|0|0|0|
| 基因15|0|0|0|0|0|0|0|
| 基因16|1|0|0|0|0|0|0|
| 基因17|1|0|0|1|0|0|0|
| 基因18|1|0|0|1|0|0|0|
| 基因19|1|0|0|1|0|0|0|
| 基因20|1|0|0|1|0|0|0|
我需要找出哪些基因只出现在一个聚类中,而在所有其他聚类中都不存在,这意味着,我想根据基因只出现在一个聚类中来对列进行子集化/过滤,如下所示:
集群1:
| 基因|样品A|样品B|样品C|样品D|样品E|样品F|样品G|
| --------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|
| 基因5|1|1|1|1|0|0|0|
| 基因6|1|1|1|1|0|0|0|
集群2:
| 基因|样品A|样品B|样品C|样品D|样品E|样品F|样品G|
| --------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|
| 基因12|0|0|0|0|1|1|1|
| 基因13|0|0|0|0|1|1|1|
我的原始数据集包含8212个基因和23个样本,分为8个不同的簇:

# Sample-to-cluster assignment for the full data set (23 samples, 8 clusters).
# cluster1 holds the fifteen samples sampleA .. sampleO; every other cluster
# holds one or two samples.
cluster1 <- paste0("sample", LETTERS[1:15])
cluster2 <- "sampleP"
cluster3 <- paste0("sample", c("Q", "R"))
cluster4 <- "sampleS"
cluster5 <- "sampleT"
cluster6 <- "sampleU"
cluster7 <- "sampleV"
cluster8 <- "sampleZ"

Here is an example file
有没有简单的方法用R或bash?谢谢

wydwbb8l

wydwbb8l1#

更新4/13/23:

# Names of the variables that hold each cluster's sample column names.
clusterno <- paste0("cluster", 1:8)

# Return the rows of `pangenome` whose gene is present (1) in every sample of
# the named cluster and absent (0) from every other sample column.
#
# cluster: character scalar naming a variable that holds the cluster's sample
#          column names, e.g. "cluster1".
# Returns: the matching rows of `pangenome`, with all of its columns kept.
# Errors:  stops when no variable with that name exists, instead of silently
#          matching against an empty sample set.
getmatch <- function(cluster) {
  samples <- get0(cluster)
  if (is.null(samples)) {
    stop("no cluster vector named '", cluster, "' was found", call. = FALSE)
  }

  # Every column that is neither a sample of this cluster nor the gene label.
  is_other <- !names(pangenome) %in% c(samples, "Gene")

  # drop = FALSE keeps a data frame even when a single column is selected;
  # without it rowSums() fails on the resulting plain vector.
  in_all <- rowSums(pangenome[, samples, drop = FALSE]) == length(samples)
  in_none <- rowSums(pangenome[, is_other, drop = FALSE]) == 0

  pangenome[in_all & in_none, ]
}

# Stack the per-cluster matches into a single data frame.
do.call("rbind", lapply(X = clusterno, FUN = getmatch))  # output has 280 rows

这将保留所有其他附加列

cluster1 <- c("sampleA", "sampleB", "sampleC", "sampleD")
cluster2 <- c("sampleE", "sampleF", "sampleG")

# A gene is kept for a cluster only when it is present (1) in every sample of
# that cluster AND absent (0) from every sample of the other cluster.
# drop = FALSE keeps rowSums() working when a cluster has a single sample.
rbind(
  df[rowSums(df[, cluster1, drop = FALSE]) == length(cluster1) &
       rowSums(df[, cluster2, drop = FALSE]) == 0, ],
  df[rowSums(df[, cluster2, drop = FALSE]) == length(cluster2) &
       rowSums(df[, cluster1, drop = FALSE]) == 0, ]
)

     Gene sampleA sampleB sampleC sampleD sampleE sampleF sampleG
5   gene5       1       1       1       1       0       0       0
6   gene6       1       1       1       1       0       0       0
12 gene12       0       0       0       0       1       1       1
13 gene13       0       0       0       0       1       1       1

广义:

  • lapply
# Names of the variables that hold each cluster's sample column names.
clusterno <- c("cluster1", "cluster2")

# Return the rows of `df` whose gene is present (1) in every sample of the
# named cluster and absent (0) from every other sample column.
#
# cluster: character scalar naming a variable that holds the cluster's sample
#          column names, e.g. "cluster1".
# Returns: the matching rows of `df`, with all of its columns kept.
# Errors:  stops when no variable with that name exists.
getmatch <- function(cluster) {
  samples <- get0(cluster)
  if (is.null(samples)) {
    stop("no cluster vector named '", cluster, "' was found", call. = FALSE)
  }

  # Every column that is neither a sample of this cluster nor the gene label.
  is_other <- !names(df) %in% c(samples, "Gene")

  # drop = FALSE keeps a data frame even when a single column is selected;
  # without it rowSums() fails on the resulting plain vector.
  in_all <- rowSums(df[, samples, drop = FALSE]) == length(samples)
  in_none <- rowSums(df[, is_other, drop = FALSE]) == 0

  df[in_all & in_none, ]
}

# Stack the per-cluster matches into a single data frame.
do.call("rbind", lapply(X = clusterno, FUN = getmatch))

     Gene sampleA sampleB sampleC sampleD sampleE sampleF sampleG
5   gene5       1       1       1       1       0       0       0
6   gene6       1       1       1       1       0       0       0
12 gene12       0       0       0       0       1       1       1
13 gene13       0       0       0       0       1       1       1

-for循环

clusterno <- c("cluster1", "cluster2")

# Collect one result per cluster in a preallocated list and bind once at the
# end, rather than growing a data frame with rbind() on every iteration.
matches <- vector("list", length(clusterno))
for (i in seq_along(clusterno)) {
  samples <- get0(clusterno[i])
  # Every column that is neither a sample of this cluster nor the gene label.
  is_other <- !names(df) %in% c(samples, "Gene")
  # Keep genes present in every sample of the cluster and absent elsewhere;
  # drop = FALSE keeps rowSums() working for single-column selections.
  keep <- rowSums(df[, samples, drop = FALSE]) == length(samples) &
    rowSums(df[, is_other, drop = FALSE]) == 0
  matches[[i]] <- df[keep, ]
}
out <- do.call(rbind, matches)

out
     Gene sampleA sampleB sampleC sampleD sampleE sampleF sampleG
5   gene5       1       1       1       1       0       0       0
6   gene6       1       1       1       1       0       0       0
12 gene12       0       0       0       0       1       1       1
13 gene13       0       0       0       0       1       1       1
jrcvhitl

jrcvhitl2#

首先创建两个变量来保存列名:

library(dplyr)  # provides %>% and filter()

set1 <- c("sampleA", "sampleB", "sampleC", "sampleD")

set2 <- c("sampleE", "sampleF", "sampleG")

# Keep a gene when it is present in every sample of exactly one set and absent
# from every sample of the other set. Without the "== 0" checks, a gene found
# in all of set1 and only part of set2 would slip through.
df %>%
  filter(xor(
    rowSums(.[set1]) == length(set1) & rowSums(.[set2]) == 0,
    rowSums(.[set2]) == length(set2) & rowSums(.[set1]) == 0
  ))

我使用的是 base R 和 dplyr 函数的组合。如果一个条件为真,另一个条件为假,xor返回真。如果两个条件都为真或都为假,则返回假。

zzwlnbp8

zzwlnbp83#

更新(删除之前的解决方案):以下是dplyr方法:

library(dplyr)

cluster1 <- c("sampleA", "sampleB", "sampleC", "sampleD")
cluster2 <- c("sampleE", "sampleF", "sampleG")

# For each cluster, keep the genes present (1) in all of its samples and
# absent (0) from all samples of the other cluster, then stack both results.
df %>%
  filter(
    if_all(all_of(cluster1), ~ . == 1),
    if_all(all_of(cluster2), ~ . == 0)
  ) %>%
  bind_rows(
    df %>%
      filter(
        if_all(all_of(cluster2), ~ . == 1),
        if_all(all_of(cluster1), ~ . == 0)
      )
  )

  sampleA sampleB sampleC sampleD sampleE sampleF sampleG
1       1       1       1       1       0       0       0
2       1       1       1       1       0       0       0
3       0       0       0       0       1       1       1
4       0       0       0       0       1       1       1
smdnsysy

smdnsysy4#

在分层聚类之后选择聚类,在本例中为5和7。

library(dplyr) # for "filter" and "select"

# Hierarchically cluster the rows on every column except the first, then cut
# the tree at height 0 so that rows at zero distance from each other (i.e.
# with the same presence/absence pattern) get the same cluster id.
# The ids 5 and 7 were picked by inspecting this particular data set; they
# have to be re-chosen for other data.
cbind(df, clust = cutree(hclust(dist(df[,-1])), h=0)) %>% 
  filter(clust %in% c(5,7)) %>% 
  # drop the helper column again so only the original columns are returned
  select(-clust)
    Gene sampleA sampleB sampleC sampleD sampleE sampleF sampleG
1  gene5       1       1       1       1       0       0       0
2  gene6       1       1       1       1       0       0       0
3 gene11       0       0       0       0       1       1       1
4 gene12       0       0       0       0       1       1       1

选择所需群集编号的直观帮助

# Label every row with its concatenated values from all columns but the first,
# followed by a space and the row index (e.g. "1001000 1"), so the cluster
# vector below carries readable names.
rownames(df) <- paste(apply(df[,-1], 1, paste0, collapse=""), 1:nrow(df))

# Cluster id per row, cutting the tree at height 0.
ctree <- cutree(hclust(dist(df[,-1])), h=0)

# Strip the " <row index>" suffix from the names and keep only the first row
# of each distinct pattern, giving one pattern -> cluster id entry per pattern.
ctree[!duplicated(sub(" .*", "", names(ctree)))]
 1001000 1  0000110 2  0000001 3  0100000 4  1111000 5  0000000 8 0000111 10 
         1          2          3          4          5          6          7 
0000010 13 1000000 14 
         8          9
iovurdzv

iovurdzv5#

这可能就是您要尝试做的事情,在每个Unix机器上的任何shell中使用任何awk:

$ cat tst.sh
#!/usr/bin/env bash
# Print the CSV header plus every row whose samples are all present in exactly
# one cluster and not present at all in any other cluster.
# Input files are taken from the arguments; with no arguments stdin ("-") is read.

awk '
    BEGIN {
        FS = ","
        # One entry per cluster: its sample column names, comma-separated.
        clusters[1] = "sampleA,sampleB,sampleC,sampleD"
        clusters[2] = "sampleE,sampleF,sampleG"
    }
    # Header line: remember the field number of every column name, echo the
    # header and move on to the next line.
    NR == 1 {
        for ( i=1; i<=NF; i++ ) {
            tag2val[$i] = i
        }
        print
        next
    }
    # Data line: count how many clusters are fully present and how many are
    # only partially present.
    {
        numClustersFull = numClustersPartial = 0
        for ( i in clusters ) {
            numSamplesPresent = 0
            # split() uses FS (",") and returns the number of samples.
            totSamples = split(clusters[i],samples)
            for ( j in samples ) {
                # Add the 0/1 value found in the column of this sample.
                numSamplesPresent += $(tag2val[samples[j]])
            }
            if ( numSamplesPresent == totSamples ) {
                numClustersFull++
            }
            else if ( numSamplesPresent != 0 ) {
                numClustersPartial++
            }
        }
    }
    # Pattern without an action: the line is printed when the condition holds.
    (numClustersFull == 1) && (numClustersPartial == 0)
' "${@:--}"
$ ./tst.sh file
Gene,sampleA,sampleB,sampleC,sampleD,sampleE,sampleF,sampleG
gene5,1,1,1,1,0,0,0
gene6,1,1,1,1,0,0,0
gene11,0,0,0,0,1,1,1
gene12,0,0,0,0,1,1,1

以上是针对此输入文件运行的:

$ cat file
Gene,sampleA,sampleB,sampleC,sampleD,sampleE,sampleF,sampleG
gene1,1,0,0,1,0,0,0
gene2,0,0,0,0,1,1,0
gene3,0,0,0,0,0,0,1
gene4,0,1,0,0,0,0,0
gene5,1,1,1,1,0,0,0
gene6,1,1,1,1,0,0,0
gene8,0,0,0,0,0,0,1
gene9,0,0,0,0,0,0,0
gene10,1,0,0,1,0,0,0
gene11,0,0,0,0,1,1,1
gene12,0,0,0,0,1,1,1
gene13,0,0,0,0,0,0,0
gene14,0,0,0,0,0,1,0
gene16,1,0,0,0,0,0,0
gene17,1,0,0,1,0,0,0
gene18,1,0,0,1,0,0,0
gene19,1,0,0,1,0,0,0
gene20,1,0,0,1,0,0,0

相关问题