如何根据列名中的模式计算data.frame中的均值?

jtoj6r0c  于 2023-09-27  发布在  其他
关注(0)|答案(6)|浏览(88)

我有一个21列的数据框(data.frame),每三列对应一个特定的时间点。我想计算每个时间点三次重复测定的平均值。

geneName  t11  t12  t13  t21  t22  t23  t31  t32  t33  t41  t42  t43  t51 t52  t53  t61  t62  t63
gene1 gene1 3296 5133 3466 2166 1759 2099 1916 1379 1570 2533 1794 1016  800  79  648   99   60  152
gene2 gene2 4210 5505 4173 2736 2748 3052 2409 1944 2237 1158 3475 1488 4023 102  940  265  365  124
...

在上面的例子中,我想分别计算 t1(t11、t12、t13)、t2、t3 等每个时间点三列的平均值。

rsl1atfo

rsl1atfo1#

这是一种基础 R(base R)的做法:

# Positions of the identifier column(s) that must not be averaged
columns_not_included <- 1
# Measurement columns only
tmp <- df[-columns_not_included]

# Drop the trailing replicate digit so t11, t12 and t13 all fall into group
# "t1", split the columns by that group, and take the row means per group.
# lapply() keeps one result vector per group, so the output stays one row per
# input row even for a single-row data frame, where sapply() would simplify
# the group means to a plain vector and break the shape of the result.
cbind.data.frame(df[columns_not_included],
                 lapply(split.default(tmp,
                                      sub('\\d$', '', names(tmp))),
                        rowMeans, na.rm = TRUE))

#  geneName       t1       t2       t3       t4       t5       t6
#1    gene1 3965.000 2008.000 1621.667 1781.000  509.000 103.6667
#2    gene2 4629.333 2845.333 2196.667 2040.333 1688.333 251.3333

sub('\\d$', '', names(tmp)) 会删除列名的最后一位数字,因此 t11、t12、t13 都会变成 t1,其他各组也是如此。这些组名被传给 split.default,由它按组把数据框的列拆分成一个由数据框组成的列表。对于每一组,我们再用 rowMeans 计算行平均值。

mzsu5hc0

mzsu5hc02#

tidyverse解决方案:

library(tidyverse)

# Collapse the replicate columns into one column per time point (the "(t.)"
# capture names the new column via ".value"), then average every column
# within each gene.
df |>
  pivot_longer(
    cols = -geneName,
    names_to = ".value",
    names_pattern = "(t.)"
  ) |>
  summarise(across(everything(), \(x) mean(x)), .by = geneName)

# # A tibble: 2 × 7
#   geneName    t1    t2    t3    t4    t5    t6
#   <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 gene1    3965  2008  1622. 1781   509   104.
# 2 gene2    4629. 2845. 2197. 2040. 1688.  251.
czq61nw1

czq61nw13#

另一种 base R 方法:

# Matrix with one row per row of df; each cell holds the time-point label
# (the first two characters of the measurement column name, e.g. "t1").
# drop = FALSE keeps the matrix shape when df has a single row; without it the
# subset collapses to a vector and row() below fails.
g1 <- t(substr(names(df[-1]), 1, 2))[rep(1, nrow(df)), , drop = FALSE]
# Average the values within each (row, time point) cell and re-attach the
# first column.
cbind(df[1], tapply(unlist(df[-1]), list(row(g1), g1), mean))

  geneName       t1       t2       t3       t4       t5       t6
1    gene1 3965.000 2008.000 1621.667 1781.000  509.000 103.6667
2    gene2 4629.333 2845.333 2196.667 2040.333 1688.333 251.3333
rqenqsqc

rqenqsqc4#

# Example data: one row per gene, followed by the replicate measurement
# columns (t11, t12, t13, t21, ...).
df <- read.table(
  header = TRUE,
  text = "geneName  t11  t12  t13  t21  t22  t23  t31  t32  t33  t41  t42  t43  t51 t52  t53  t61  t62  t63
gene1 3296 5133 3466 2166 1759 2099 1916 1379 1570 2533 1794 1016  800  79  648   99   60  152
gene2 4210 5505 4173 2736 2748 3052 2409 1944 2237 1158 3475 1488 4023 102  940  265  365  124"
)

library(tidyverse)

# Long format: one row per gene and measurement column
df |>
  pivot_longer(cols = -geneName) |>
  # The first two characters of the column name identify the time point;
  # group on gene and that label, then average the values in each group.
  # The result stays grouped by geneName.
  group_by(geneName, name = substr(name, 1, 2)) |>
  summarise(mean = mean(value))

# A tibble: 12 × 3
# Groups:   geneName [2]
   geneName name   mean
   <chr>    <chr> <dbl>
 1 gene1    t1    3965 
 2 gene1    t2    2008 
 3 gene1    t3    1622.
 4 gene1    t4    1781 
 5 gene1    t5     509 
 6 gene1    t6     104.
 7 gene2    t1    4629.
 8 gene2    t2    2845.
 9 gene2    t3    2197.
10 gene2    t4    2040.
11 gene2    t5    1688.
12 gene2    t6     251.
cmssoen2

cmssoen25#

这里是另一个选项:

library(tidyverse)

test_data |>
  # One row per gene and column; the digits after "t" become a numeric `time`
  pivot_longer(
    cols = -geneName,
    names_to = "time",
    names_pattern = "t(\\d+)",
    names_transform = as.numeric
  ) |>
  arrange(geneName, time) |>
  # Start a new group whenever `time` is not exactly one more than in the
  # previous row, so each run of consecutive times shares one group id
  mutate(
    group = cumsum((time - lag(time, default = first(time) - 1)) != 1)
  ) |>
  nest(data = -c(geneName, group)) |>
  # Label each run by its smallest and largest time and average its values
  transmute(
    geneName = geneName,
    time_range = map_chr(
      data,
      \(run) glue::glue(
        "t{mn}-t{mx}",
        mn = min(run$time),
        mx = max(run$time)
      )
    ),
    mean = map_dbl(data, \(run) mean(run$value))
  ) |>
  pivot_wider(names_from = time_range, values_from = mean)
#> # A tibble: 2 x 7
#>   geneName `t11-t13` `t21-t23` `t31-t33` `t41-t43` `t51-t53` `t61-t63`
#>   <chr>        <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
#> 1 gene1        3965      2008      1622.     1781       509       104.
#> 2 gene2        4629.     2845.     2197.     2040.     1688.      251.
okxuctiv

okxuctiv6#

虽然我很欣赏 Ronak 那个简洁巧妙的答案,但我还是更喜欢用 tidyverse 语法来做这类计算,我觉得它更干净、更清晰。
不过还是谢谢你的帮助。

# Mean of the three replicate columns at each time point, computed row by
# row, returned as a plain data.frame with the gene names as row names.
# rowwise() makes mean(c(...)) see only the current row's values, so no
# separate group_by() step is needed; the identifier column is `geneName`,
# as used by select() and column_to_rownames() below.
test.mean <- test_data |>
  dplyr::rowwise() |>
  mutate(
    meant1 = mean(c(t11, t12, t13)),
    meant2 = mean(c(t21, t22, t23)),
    meant3 = mean(c(t31, t32, t33)),
    meant4 = mean(c(t41, t42, t43)),
    meant5 = mean(c(t51, t52, t53)),
    meant6 = mean(c(t61, t62, t63))
  ) |>
  # Drop the row-wise grouping before reshaping the result
  ungroup() |>
  # Keep only the identifier and the newly computed mean columns
  select(matches("geneName|mean")) |>
  column_to_rownames("geneName") |>
  as.data.frame()

相关问题