统计并提取数据框中仅在某一列出现的唯一元素/基因

bxjv4tth  于 2023-03-15  发布在  其他
关注(0)|答案(2)|浏览(101)

我有一个非常大的 CSV 文件,其中记录了基因的存在/缺失(1/0)数据。我想找出只在某一列中出现的唯一基因,并统计它们的数量。我的数据如下:

# Example presence/absence table: column A holds the gene ID and columns B-E
# hold 1 (present) or 0 (absent) for that gene.
df <- data.frame(
  A = paste0("G", 1:10),
  B = c(1, 0, 1, 0, 1, 1, 1, 0, 0, 0),
  C = c(1, 0, 1, 0, 0, 0, 0, 1, 1, 0),
  D = c(1, 1, 0, 0, 0, 0, 0, 0, 0, 1),
  E = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0)
)

期望的输出如下:第一个是只包含唯一基因的数据框

# Expected result: only the genes whose B-E values add up to exactly 1,
# i.e. genes that are present in a single column.
df_uniq <- data.frame(
  A = paste0("G", 4:10),
  B = c(0, 1, 1, 1, 0, 0, 0),
  C = c(0, 0, 0, 0, 1, 1, 0),
  D = c(0, 0, 0, 0, 0, 0, 1),
  E = c(1, 0, 0, 0, 0, 0, 0)
)

感谢你的帮助,谢谢!

pn9klfpd

pn9klfpd1#

一个dplyr解决方案

library(dplyr)
library(purrr)
# Add up every column except A row by row and keep the rows whose total is
# exactly 1, i.e. genes present in a single column.
df_uniq <- filter(df, reduce(pick(-A), `+`) == 1)
> df_uniq
    A B C D E
1  G4 0 0 0 1
2  G5 1 0 0 0
3  G6 1 0 0 0
4  G7 1 0 0 0
5  G8 0 1 0 0
6  G9 0 1 0 0
7 G10 0 0 1 0
# Sum every column except the first to count the unique genes per column.
reframe(df_uniq, across(-1, sum))
  B C D E
1 3 2 1 1

或者使用 base R:

# Drop the first (ID) column, add the remaining columns together row by row,
# and keep the rows whose total is exactly 1.
df_uniq <- subset(df, Reduce(`+`, df[, -1, drop = FALSE]) == 1)
# Per-column totals of the retained rows.
colSums(df_uniq[, -1, drop = FALSE])
B C D E 
3 2 1 1
mlmc2os5

mlmc2os52#

dplyr解决方案:

library(dplyr)

# Keep the rows where the columns other than A sum to exactly 1, i.e. genes
# present in a single column.
df_uniq <- filter(df, rowSums(pick(-A)) == 1)

#      A B C D E
# 4   G4 0 0 0 1
# 5   G5 1 0 0 0
# 6   G6 1 0 0 0
# 7   G7 1 0 0 0
# 8   G8 0 1 0 0
# 9   G9 0 1 0 0
# 10 G10 0 0 1 0

# Sum every column except A to count the unique genes per column.
summarise(df_uniq, across(-A, sum))

#   B C D E
# 1 3 2 1 1

base R 中的等效写法:

# Keep the rows whose non-ID columns (everything but the first) sum to
# exactly 1, i.e. genes present in a single column.
df_uniq <- df[rowSums(df[-1]) == 1, ]
# Per-column totals of the retained rows. vapply() is used instead of sapply()
# so the result is always a named numeric vector; sapply() would return an
# empty list if there were no columns to sum.
vapply(df_uniq[-1], sum, numeric(1))

相关问题