R语言 对跨多个列和组出现的非重复值进行计数

eoxn13cs  于 2023-02-01  发布在  其他
关注(0)|答案(3)|浏览(215)

我有一个 Dataframe ,如下所示(在实际的数据集中,行数是几千,我有超过300个变量):

df <- data.frame (Gr = c("A","A","A","B","B","B","B","B","B"),
                  Var1 = c("a","b","c","e","a","a","c","e","b"),
                  Var2 = c("a","a","a","d","b","b","c","a","e"),
                  Var3 = c("e","a","b",NA,"a","b","c","d","a"),
                  Var4 = c("e",NA,"a","e","a","b","d","c",NA))

该函数返回:

Gr Var1 Var2 Var3 Var4
1  A    a    a    e    e
2  A    b    a    a <NA>
3  A    c    a    b    a
4  B    e    d <NA>    e
5  B    a    b    a    a
6  B    a    b    b    b
7  B    c    c    c    d
8  B    e    a    d    c
9  B    b    e    a <NA>

并希望获得每个值(a、b、c、d、e和NA)在每个变量和每个组中出现的次数。因此,输出应如下所示:

df1 <- data.frame(Vars = c("Var1","Var2","Var3","Var4"),
                  a = c(1,3,1,1),
                  b = c(1,0,1,0),
                  c = c(1,0,0,0),
                  d = c(0,0,0,0),
                  e = c(0,0,1,1),
                  na = c(0,0,0,1))

df2 <- data.frame(Vars = c("Var1","Var2","Var3","Var4"),
                  a = c(2,1,2,1),
                  b = c(0,2,1,1),
                  c = c(1,1,1,1),
                  d = c(0,1,1,1),
                  e = c(2,1,0,1),
                  na = c(0,0,1,1))
output <- list(df1,df2)
names(output) <- c("A","B")

它看起来像:

$A
  Vars a b c d e na
1 Var1 1 1 1 0 0  0
2 Var2 3 0 0 0 0  0
3 Var3 1 1 0 0 1  0
4 Var4 1 0 0 0 1  1

$B
  Vars a b c d e na
1 Var1 2 0 1 0 2  0
2 Var2 1 2 1 1 1  0
3 Var3 2 1 1 1 0  1
4 Var4 1 1 1 1 1  1

到目前为止,我还没有能够取得任何显著的进展,tidyverse解决方案是首选。

laximzn5

laximzn51#

我们可以在split ing之后使用mtabulate

library(qdapTools)
lapply(split(df[-1], df$Gr), mtabulate)

如果我们还需要na计数

lapply(split(replace(df[-1], is.na(df[-1]), "na"), df$Gr), mtabulate)
  • 输出
$A
     a b c e na
Var1 1 1 1 0  0
Var2 3 0 0 0  0
Var3 1 1 0 1  0
Var4 1 0 0 1  1

$B
     a b c d e na
Var1 2 1 1 0 2  0
Var2 1 2 1 1 1  0
Var3 2 1 1 1 0  1
Var4 1 1 1 1 1  1

或者使用tidyverse

library(dplyr)
library(tidyr)
df %>%
  pivot_longer(cols = -Gr, names_to = "Vars") %>%
  pivot_wider(names_from = value, values_from = value, 
    values_fn = length, values_fill = 0) %>%
  {split(.[-1], .$Gr)}
  • 输出
$A
# A tibble: 4 × 7
  Vars      a     e     b  `NA`     c     d
  <chr> <int> <int> <int> <int> <int> <int>
1 Var1      1     0     1     0     1     0
2 Var2      3     0     0     0     0     0
3 Var3      1     1     1     0     0     0
4 Var4      1     1     0     1     0     0

$B
# A tibble: 4 × 7
  Vars      a     e     b  `NA`     c     d
  <chr> <int> <int> <int> <int> <int> <int>
1 Var1      2     2     1     0     1     0
2 Var2      1     1     2     0     1     1
3 Var3      2     0     1     1     1     1
4 Var4      1     1     1     1     1     1
mnemlml8

mnemlml82#

使用colSumsNA保存base R方法

val <- sort(unique(unlist(df[-1])), na.last=T)

as.list(lapply(split(df[-1], df$Gr), function(dlist) 
  data.frame(sapply(val, function(x) 
    colSums(dlist == x | (is.na(dlist) & is.na(x)), na.rm=T)), check.names=F)))
$A
     a b c d e NA
Var1 1 1 1 0 0  0
Var2 3 0 0 0 0  0
Var3 1 1 0 0 1  0
Var4 1 0 0 0 1  1

$B
     a b c d e NA
Var1 2 1 1 0 2  0
Var2 1 2 1 1 1  0
Var3 2 1 1 1 0  1
Var4 1 1 1 1 1  1
pqwbnv8z

pqwbnv8z3#

reshape2::recast(df,Gr+variable~value,length,id.var = 'Gr')

  Gr variable a b c d e NA
1  A     Var1 1 1 1 0 0  0
2  A     Var2 3 0 0 0 0  0
3  A     Var3 1 1 0 0 1  0
4  A     Var4 1 0 0 0 1  1
5  B     Var1 2 1 1 0 2  0
6  B     Var2 1 2 1 1 1  0
7  B     Var3 2 1 1 1 0  1

如果必须拆分它们:

split(reshape2::recast(df,Gr+variable~value,length,id.var = 'Gr'), ~Gr)

$A
  Gr variable a b c d e NA
1  A     Var1 1 1 1 0 0  0
2  A     Var2 3 0 0 0 0  0
3  A     Var3 1 1 0 0 1  0
4  A     Var4 1 0 0 0 1  1

$B
  Gr variable a b c d e NA
5  B     Var1 2 1 1 0 2  0
6  B     Var2 1 2 1 1 1  0
7  B     Var3 2 1 1 1 0  1
8  B     Var4 1 1 1 1 1  1

以R为基:

ftable(cbind(df[1], stack(replace(df, is.na(df),'na'), -1)),col.vars = 2)

        values a b c d e na
Gr ind                     
A  Var1        1 1 1 0 0  0
   Var2        3 0 0 0 0  0
   Var3        1 1 0 0 1  0
   Var4        1 0 0 0 1  1
B  Var1        2 1 1 0 2  0
   Var2        1 2 1 1 1  0
   Var3        2 1 1 1 0  1
   Var4        1 1 1 1 1  1

相关问题