R语言:保留在至少一个预定义列子集(组)上行和大于阈值的行

fnvucqvd  于 2023-05-20  发布在  其他
关注(0)|答案(2)|浏览(119)

我有一个这样的dataframe:

# Example data: six observations (rows) measured across seven samples (columns).
df <- as.data.frame(list(
  sample1 = c(0, 1, 2, 0, 2, 1),
  sample2 = c(0.3, 3, 2, 0.4, 2, 3),
  sample3 = c(0.2, 1, 3, 0.1, 3, 3),
  sample4 = c(0.4, 2, 4, 0.3, 1, 1),
  sample5 = c(0.1, 2, 4, 0.2, 5, 3),
  sample6 = c(0.2, 3, 1, 0.1, 6, 3),
  sample7 = c(0.2, 1, 1, 0.4, 1, 1)
))

在这个df中定义的组:

# Lookup table assigning each sample column of `df` to a group:
# sample1-3 -> group1, sample4-5 -> group2, sample6-7 -> group3.
groups <- data.frame(
  samples = paste0("sample", 1:7),
  groups = rep(c("group1", "group2", "group3"), times = c(3, 2, 2))
)

使用R,我想只保留这样的行:至少在一个组内,该组各样本列之和大于0.5。因此结果df将是:

sample1 sample2 sample3 sample4 sample5 sample6 sample7
2       1       3       1       2       2       3       1
3       2       2       3       4       4       1       1
5       2       2       3       1       5       6       1
6       1       3       3       1       3       3       1
4szc88ey

4szc88ey1#

library(dplyr)
library(purrr)
library(tibble)

# Keep the rows of `df` whose values, summed over the samples of at least one
# group, exceed 0.5.
#
# `groups` maps each sample column (`samples`) to a group label (`groups`).
# split() turns that mapping into one vector of column names per group;
# as.character() guards against `samples` being stored as a factor.
samples_by_group <- split(as.character(groups$samples), groups$groups)

keep <- samples_by_group %>%
  # One logical vector per group: does the row sum over its columns exceed 0.5?
  map(~ rowSums(df[, .x, drop = FALSE]) > 0.5) %>%
  # A row qualifies if any group passes; `.init` covers an empty `groups`.
  reduce(`|`, .init = rep(FALSE, nrow(df)))

# which() treats NA (a group sum with missing values) as "not passed".
# Base subsetting keeps the original row names and does not collapse rows
# that are genuinely duplicated in `df`.
df[which(keep), , drop = FALSE]

#>   sample1 sample2 sample3 sample4 sample5 sample6 sample7
#> 2       1       3       1       2       2       3       1
#> 3       2       2       3       4       4       1       1
#> 5       2       2       3       1       5       6       1
#> 6       1       3       3       1       3       3       1
lnvxswe2

lnvxswe22#

以下是tidyverse版本。注意:在下面的输出中,只有一行(第4行)因不满足条件而被移除;该输出里第1行的 sample6 为 0.5,与问题中 df 的 0.2 不一致,若使用上面的 df,第1行同样会被移除:

library(dplyr)
library(tidyr)

# Keep the rows of `df` for which at least one group of samples sums to more
# than 0.5. The result is an ungrouped tibble with a `row` column holding the
# original row position, followed by the sample columns.
df %>%
  # Record the row position before reshaping so it survives the pivot.
  mutate(row = row_number()) %>%
  # Long format: one line per (row, sample) pair, sample name in `name`.
  pivot_longer(-row) %>%
  # Attach the group label of each sample.
  left_join(groups, by = c("name" = "samples")) %>%
  # Sum of the values of each original row within each group.
  mutate(sum_group = sum(value), .by = c(row, groups)) %>%
  # Keep every line of a row as soon as one of its group sums exceeds 0.5;
  # per-verb `.by` leaves no grouping behind.
  filter(any(sum_group > 0.5), .by = row) %>%
  select(row, name, value) %>%
  # Back to wide format: one line per original row.
  pivot_wider(names_from = name, values_from = value)

  row sample1 sample2 sample3 sample4 sample5 sample6 sample7
  <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1     1       0     0.3     0.2     0.4     0.1     0.5     0.2
2     2       1     3       1       2       2       3       1  
3     3       2     2       3       4       4       1       1  
4     5       2     2       3       1       5       6       1  
5     6       1     3       3       1       3       3       1

相关问题