如何在R中创建条件和?

fhity93d  于 2023-06-19  发布在  其他
关注(0)|答案(3)|浏览(125)

我尝试根据不同的条件计算数据框中不同列的总和。
这是我的试验数据:

Year <- c(2001, 2001, 2001, 2002, 2002, 2003, 2003, 2004, 2004)
COMP1 <- c(NA, 1, 2, 6, 9, NA, 2, 1, NA)  
COMP2 <- c(2, 3, 3, 3, 6, 4, > 1, 0, 1)   
COMP3 <- c(NA, 1, 2, 3, 4, 0, 0, 1, 0)   
COMP4 <- c(25, 29, 16, 17, NA, 20, NA, 21, 12)

DF <- data.frame(Year, COMP1, COMP2, COMP3, COMP4)

这就是我想做的:
对于每一年和每一家公司(COMP),我想计算值的总和,但仅当单元格中的单个值高于该年内值的平均值时)。
我已经实现了计算总和和平均值,但我不知道如何将它们结合起来创造一个总和与所提到的条件。
以下是我到目前为止尝试过的公式(请注意,我的原始样本有6121列不同公司的列):

SUM <- aggregate(DF[, 2:6121], list(DF$Year), sum)

MEAN <- aggregate(DF[, 2:6121], list(DF$Year), mean)
brgchamk

brgchamk1#

library(tidyverse)

df %>%
  summarize(across(contains("COMP"), 
                   ~ sum(.x[.x > mean(.x, na.rm = TRUE)], na.rm = TRUE)), 
            .by = Year)

# A tibble: 4 × 5
   Year COMP1 COMP2 COMP3 COMP4
  <dbl> <dbl> <dbl> <dbl> <dbl>
1  2001     2     6     2    54
2  2002     9     6     4     0
3  2003     0     4     0     0
4  2004     0     1     1    21

包括等于和高于组平均值的值:

df %>%
  summarize(across(1:4, 
                   ~ sum(.x[.x >= mean(.x, na.rm = TRUE)], na.rm = TRUE)), 
            .by = Year)

# A tibble: 4 × 5
   Year COMP1 COMP2 COMP3 COMP4
  <dbl> <dbl> <dbl> <dbl> <dbl>
1  2001     2     6     2    54
2  2002     9     6     4    17
3  2003     2     4     0    20
4  2004     1     1     1    21
tmb3ates

tmb3ates2#

dplyr/tidyr

我认为Chamkrai的答案对dplyr来说是最简单的,但我倾向于发现处理数据更像是“长”格式,所以我将保留这个格式。看看中间值(紧接着pivot_longer之后,然后是summarize之后和pivot_wider之前),就明白我的意思了。

library(dplyr) # dplyr_1.1 or newer for `.by=`
library(tidyr)
quux %>%
  pivot_longer(-Year, names_to = "Comp") %>%
  summarize(
    res = sum(value[value > mean(value, na.rm=TRUE)], na.rm=TRUE),
    .by = c(Year, Comp)) %>%
  pivot_wider(id_cols = Year, names_from = "Comp", values_from = "res")
# # A tibble: 4 × 5
#    Year COMP1 COMP2 COMP3 COMP4
#   <dbl> <dbl> <dbl> <dbl> <dbl>
# 1  2001     2     6     2    54
# 2  2002     9     6     4     0
# 3  2003     0     4     0     0
# 4  2004     0     1     1    21

base R

use <- lapply(quux[-1], function(y) {
  res <- y > ave(y, quux$Year, FUN = function(z) mean(z, na.rm = TRUE))
  !is.na(res) & res
})
use
# $COMP1
# [1] FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
# $COMP2
# [1] FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE
# $COMP3
# [1] FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE
# $COMP4
# [1]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE

使用use,我们可以调整quux中的值,以便将不高于平均值的值归零:

quux[-1] * use
#   COMP1 COMP2 COMP3 COMP4
# 1    NA     0    NA    25
# 2     0     3     0    29
# 3     2     3     2     0
# 4     0     0     0     0
# 5     9     6     4    NA
# 6    NA     4     0     0
# 7     0     0     0    NA
# 8     0     0     1    21
# 9    NA     1     0     0

现在我们可以使用aggregate

aggregate(quux[-1] * use, quux["Year"], FUN = function(z) sum(z, na.rm = TRUE))
#   Year COMP1 COMP2 COMP3 COMP4
# 1 2001     2     6     2    54
# 2 2002     9     6     4     0
# 3 2003     0     4     0     0
# 4 2004     0     1     1    21

data.table

library(data.table)
as.data.table(quux)[, lapply(.SD, function(z) sum(z[z > mean(z, na.rm = TRUE)], na.rm = TRUE)),
                      by = Year]
#     Year COMP1 COMP2 COMP3 COMP4
#    <num> <num> <num> <num> <num>
# 1:  2001     2     6     2    54
# 2:  2002     9     6     4     0
# 3:  2003     0     4     0     0
# 4:  2004     0     1     1    21

数据

quux <- structure(list(Year = c(2001, 2001, 2001, 2002, 2002, 2003, 2003, 2004, 2004), COMP1 = c(NA, 1, 2, 6, 9, NA, 2, 1, NA), COMP2 = c(2, 3, 3, 3, 6, 4, 1, 0, 1), COMP3 = c(NA, 1, 2, 3, 4, 0, 0, 1, 0), COMP4 = c(25, 29, 16, 17, NA, 20, NA, 21, 12)), class = "data.frame", row.names = c(NA, -9L))
bwntbbo3

bwntbbo33#

使用data.table

library(data.table)

DF <- data.table(Year, COMP1, COMP2, COMP3, COMP4)

conditinalSum <- function(x){
  x <- x[!is.na(x)]
  sum(x[x > mean(x)])
}

DF[, lapply(.SD, conditinalSum), by = Year]

相关问题