使用R统计 Dataframe 组中的类型数

o7jaxewo  于 2023-05-26  发布在  其他
关注(0)|答案(2)|浏览(112)

我有这样一个数据:

# Example input: 12 rows, each tagged with a `block` id and the
# `interval_block` it falls into. `is.on` is stored as text, not logical.
data <- data.frame(
  is.on = c(
    "FALSE", "FALSE", "FALSE",
    "TRUE", "FALSE", "TRUE",
    "FALSE", "FALSE", "TRUE",
    "TRUE", "TRUE", "TRUE"
  ),
  dur = c(10, 20, 30, 10, 10, 10, 10, 20, 10, 20, 30, 40),
  dt = rep(10, 12),
  block = c(2, 2, 2, 3, 4, 5, 6, 6, 7, 7, 7, 7),
  interval_block = rep(c(1, 2, 3, 4), each = 3)
)

现在我想在block的基础上创建summary_data。summary_data的行数是interval_block的类型数。第一步:

# Step 1: for each interval_block, count how many distinct `block` values it
# contains; the largest of these counts is the number of dur_* columns needed.
# vapply() is used instead of sapply() so the result is always an integer
# vector, whatever the input looks like.
max_types <- vapply(
  unique(data$interval_block),
  function(interval) {
    length(unique(data$block[data$interval_block == interval]))
  },
  integer(1)
)
max_num_types <- max(max_types)

对于interval_block =1,存在一种类型的块(2)。对于interval_block =2,有三种类型的块(3、4和5)。对于interval_block =3,有两种类型的块(6和7)。对于interval_block =4,有一种类型的块(7)。因此,每个interval_block中block列的最大类型数为3。上面是计算这个数字的代码。根据这个数字,我想创建dur_列。因此,在这种情况下,应该有dur_1、dur_2和dur_3。
步骤2:确定dur_列的值。对于interval_block =1,有一种类型的block。我想填充dur_1,并将dur_2和dur_3保留为0。#(block =2,interval_block =1)=3。因此,我想将dur_1填充为3乘以10=30。
对于interval_block =2,有三种类型的block。我想填充dur_1、dur_2和dur_3。#(block =3,interval_block =2)=1,#(block =4,interval_block =2)=1,#(block =5,interval_block =2)=1。因此,我想将dur_1填充为1乘以10=10,dur_2填充为1乘以10=10,dur_3填充为1乘以10=10。
对于interval_block =3,有两种类型的block。我想填充dur_1和dur_2,并将dur_3保留为0。#(block =6,interval_block =3)=2,#(block =7,interval_block =3)=1。因此,我想将dur_1填充为2乘以10=20,将dur_2填充为1乘以10=10,将dur_3填充为0。
对于interval_block =4,有一种类型的block。我想填充dur_1,并将dur_2和dur_3保留为0。#(block =7,interval_block =4)=3。因此,我想将dur_1填充为3乘以10=30,将dur_2和dur_3填充为0。
我描述的规则相当长,但基本上就是:在每个interval_block中,统计每种block类型出现的行数并乘以10。我的预期输出应该是这样的:

# Expected result: one row per interval_block; dur_k is 10 times the number
# of rows of the k-th distinct block inside that interval_block, and 0 when
# the interval_block has fewer than k distinct blocks.
# interval_block 3 holds only two distinct blocks (6 and 7), so its dur_3
# is 0.
summary_data <- data.frame(
  dur_1 = c(30, 10, 20, 30),
  dur_2 = c(0, 10, 10, 0),
  dur_3 = c(0, 10, 0, 0),
  interval_block = c(1, 2, 3, 4)
)

我不知道怎么用R语言写代码。
澄清一下。第一行:有3个block =2(一种类型)。由于只有一种类型,我们只用3乘以10填充dur_1。第二行:有1个block =3、1个block =4和1个block =5(三种类型)。由于有三种类型,我们分别用1 × 10、1 × 10、1 × 10填充dur_1、dur_2和dur_3。
第三行:有2个block =6和1个block =7(两种类型)。由于有两种类型,我们分别用2 × 10和1 × 10填充dur_1和dur_2。

ukqbszuj

ukqbszuj1#

利用{dplyr}和{tidyr},您可以执行以下操作:

library(dplyr)
library(tidyr)

# Build one row per interval_block with columns dur_1, dur_2, ...:
# dur_k is 10 times the number of rows belonging to the k-th distinct block
# (in ascending block order) of that interval_block; absent positions are 0.
data |>
  group_by(interval_block) |>
  # Number the distinct blocks 1..k within each interval_block.
  mutate(dur = as.integer(factor(block))) |>
  group_by(interval_block, dur) |>
  # The value is driven by the row count of each block, not by its position.
  summarise(dur_values = 10 * n(), .groups = "drop") |>
  pivot_wider(
    names_from = dur,
    names_prefix = "dur_",
    # Sort on the integer position so dur_10 would come after dur_9.
    names_sort = TRUE,
    values_from = dur_values,
    # Interval blocks with fewer distinct blocks get 0 instead of NA.
    values_fill = 0
  ) |>
  select(starts_with("dur"), interval_block)
#> # A tibble: 4 × 4
#>   dur_1 dur_2 dur_3 interval_block
#>   <dbl> <dbl> <dbl>          <dbl>
#> 1    30     0     0              1
#> 2    10    10    10              2
#> 3    20    10     0              3
#> 4    30     0     0              4

Edit:一个带有base R的稍微深奥的替代方案:

# Number of dur_* columns needed: the largest number of distinct blocks found
# in any single interval_block. `table(...) > 0` marks which blocks occur in
# each interval_block, and rowSums() counts them. Computed once, outside the
# per-group function.
max_blocks <- max(rowSums(table(data$interval_block, data$block) > 0))

data |>
  split(data$interval_block) |>
  lapply(\(x) {
    # Row count of each distinct block in this interval_block.
    dur <- as.vector(table(x$block))
    # Zero-padded row: write 10 * count into the leading positions.
    `[<-`(numeric(max_blocks), seq_along(dur), 10 * dur)
  }) |>
  # Drop the list names so rbind() does not create row names.
  unname() |>
  do.call(what = rbind) |>
  # split() orders its groups by sorted value, so sort the ids the same way.
  cbind(sort(unique(data$interval_block))) |>
  as.data.frame() |>
  setNames(nm = c(paste0("dur_", seq_len(max_blocks)), "interval_block"))

`[<-` 在这里用于零填充:先创建一个长度为max_blocks的全零向量,再把各block的计数值写入前几个位置(这个技巧取自原答案中“here”所链接的帖子)。

lf5gs5x2

lf5gs5x22#

利用base R,首先计算唯一的组块计数,然后聚合数据,并通过清理将其整形为最终格式:

# Add `dur_num`: the position (1, 2, ...) of each row's block among the
# distinct blocks of its interval_block, in ascending block order.
data$dur_num <- ave(
  data$block,
  data$interval_block,
  FUN = function(blocks) as.integer(factor(blocks))
)

# Sum `dt` for every (dur_num, interval_block) combination, i.e. once per
# distinct block inside each interval_block.
agg_df <- aggregate(dt ~ dur_num + interval_block, data = data, FUN = sum)

# Pivot to wide form: one row per interval_block and one dt_<n> column for
# each dur_num value; combinations that do not occur become NA.
wide_df <- reshape(
  agg_df,
  direction = "wide",
  idvar = "interval_block",
  timevar = "dur_num",
  v.names = "dt",
  sep = "_"
)

# Clean up the reshaped table.
# Interval blocks with fewer distinct blocks have NA in the trailing columns;
# the expected output uses 0 there.
wide_df[is.na(wide_df)] <- 0

# Reset to automatic 1..n row names; unlike 1:nrow(), this also works when
# the table has no rows.
row.names(wide_df) <- NULL

# Rename the dt_<n> columns to dur_<n>. The pattern is anchored so only the
# prefix is rewritten.
colnames(wide_df) <- sub("^dt_", "dur_", colnames(wide_df))

wide_df
  interval_block dur_1 dur_2 dur_3
1              1    30     0     0
2              2    10    10    10
3              3    20    10     0
4              4    30     0     0

Online Demo

相关问题