R语言 基于作为截止值的值序列生成新的数据子集

qrjkbowd  于 2023-02-27  发布在  其他
关注(0)|答案(4)|浏览(133)

我有一个包含压力数据的大型数据集。我希望能够按不同的截止值(例如 > 3500、> 3600 等)过滤出多个数据子集,然后对每个在指定值处截断的子集运行若干分析。
例如,这可能类似于我现在做的:

# Making a reproducible example.
# dplyr supplies %>%, group_by() and summarize() used in the analysis below.
library(dplyr)

# Fix the RNG seed so every run draws the same pressures and values.
set.seed(42)
pressure <- runif(30, min = 3750, max = 4500)
value <- runif(30, min = 0, max = 50)
stage <- rep(c(1, 2), each = 15)

raw.data <- data.frame(pressure, value, stage)

# Set a cutoff point.
cutoff.press <- 3750

# Make a new dataset holding only the rows above the cutoff.
cutoff <- raw.data[raw.data$pressure > cutoff.press, ]

# Run an analysis: smallest and largest `value` within each stage.
analysis <- cutoff %>%
  group_by(stage) %>%
  summarize(
    MinValue = min(value),
    MaxValue = max(value)
  )

是否有一种方法可以做到这一点,而不必为每个感兴趣的临界值创建多个单独的数据集,然后单独运行每个分析?
例如,如果我想测试多个压力截止值(如seq(3750, 4000, 50)),我不想为序列中生成的每个值重复上述过程。
我曾考虑过使用 dplyr 的 filter() 函数并手动设置一系列截止值,但这不仅耗时,而且我不确定这样是否能让我对多个数据集分别运行分析。

7uzetpgm

7uzetpgm1#

如果您希望运行许多不同的迭代,那么使用purrr也是一个不错的选择,因为您可以在一个管道中完成所有工作。

library(tidyverse)

# Define the cutoffs once so the filters and the list names cannot drift apart.
cutoffs <- seq(3750, 4000, by = 50)

# For each cutoff: keep the rows above it, then report the smallest and
# largest `value` per stage. The result is a list with one tibble per cutoff.
cutoffs %>%
  purrr::map(function(threshold) {
    raw.data %>%
      dplyr::filter(pressure > threshold) %>%
      group_by(stage) %>%
      summarize(
        MinValue = min(value),
        MaxValue = max(value)
      )
  }) %>%
  # Name each list element after the cutoff that produced it.
  setNames(cutoffs)

产出

$`3750`
# A tibble: 2 × 3
  stage MinValue MaxValue
  <dbl>    <dbl>    <dbl>
1     1    3.52      46.6
2     2    0.575     49.3

$`3800`
# A tibble: 2 × 3
  stage MinValue MaxValue
  <dbl>    <dbl>    <dbl>
1     1    3.52      46.6
2     2    0.575     47.5

$`3850`
# A tibble: 2 × 3
  stage MinValue MaxValue
  <dbl>    <dbl>    <dbl>
1     1    3.52      46.6
2     2    0.575     47.5

$`3900`
# A tibble: 2 × 3
  stage MinValue MaxValue
  <dbl>    <dbl>    <dbl>
1     1    3.52      46.6
2     2    0.575     47.5

$`3950`
# A tibble: 2 × 3
  stage MinValue MaxValue
  <dbl>    <dbl>    <dbl>
1     1    3.52      46.6
2     2    0.575     47.5

$`4000`
# A tibble: 2 × 3
  stage MinValue MaxValue
  <dbl>    <dbl>    <dbl>
1     1    6.65      46.6
2     2    0.575     47.5

数据

# Fixed copy of the example data, written as a literal structure() call:
# a 30-row data.frame with numeric columns `pressure`, `value` and `stage`
# (the first 15 rows are stage 1, the last 15 are stage 2).
raw.data <- structure(list(pressure = c(4160.41269886773, 4044.58961030468, 
                            4336.48418885423, 3762.11064029485, 4235.55055609904, 3926.50744639104, 
                            4086.0048676841, 4360.64667999744, 3850.74476944283, 3950.07681293646, 
                            4347.61320002144, 3996.32209626725, 4262.53829378402, 3869.30528597441, 
                            4252.7681372012, 4013.94325762521, 4275.64664371312, 4197.37908616662, 
                            4231.71574808657, 4028.1643497292, 4407.9091984313, 4481.91399103962, 
                            4353.40271308087, 4013.09538848, 4109.39885408152, 4195.05179609405, 
                            4222.33691916335, 4316.15335500101, 3860.02388742054, 3772.72424055263
), value = c(46.6360261081718, 19.0778955002315, 9.46381011744961, 
             17.4791521392763, 6.64818733930588, 3.79822270479053, 17.0007253182121, 
             45.9705576649867, 39.6164933103137, 3.52405618177727, 29.9587145447731, 
             10.8624027809128, 45.8421137067489, 34.4845326268114, 17.0537169324234, 
             47.0035993610509, 29.5542735257186, 12.992845242843, 32.0275551988743, 
             21.112488291692, 12.7272683312185, 23.9938693121076, 18.5264392290264, 
             42.9235454765148, 0.575024168938398, 10.7687710318714, 0.992469629272819, 
             47.4592371145263, 40.4172958689742, 49.3020136258565), 
stage = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 
          2, 2, 2, 2, 2, 2, 2, 2)), class = "data.frame", row.names = c(NA, -30L))
ngynwnxp

ngynwnxp2#

1. 我们可以通过 case_when 语句创建分组(截断组)。
2. 按这些组拆分数据(您将得到一个列表)。
3. 对该列表使用 map,这一做法也是由安德鲁·吉里斯·布朗最先给出的!
library(purrr)
library(dplyr)

# Tag every row with the side of the 4000 pressure threshold it falls on,
# split the data into one data frame per tag, then summarise each by stage.
raw.data %>%
  mutate(
    cut_off = case_when(
      pressure <= 4000 ~ "<= 4000",
      TRUE ~ "> 4000"
    )
  ) %>%
  group_split(cut_off) %>%
  map(function(piece) {
    by_stage <- group_by(piece, stage)
    summarise(by_stage, MinValue = min(value), MaxValue = max(value))
  })
[[1]]
# A tibble: 2 x 3
  stage MinValue MaxValue
  <dbl>    <dbl>    <dbl>
1     1     4.72     48.7
2     2    33.9      47.3

[[2]]
# A tibble: 2 x 3
  stage MinValue MaxValue
  <dbl>    <dbl>    <dbl>
1     1     9.64     41.5
2     2    14.5      45.5
vxf3dgd4

vxf3dgd43#

尝试sapply函数,如下所示:

# Cutoff pressures to evaluate; each one yields its own per-stage summary.
cutoff <- seq(3750, 4000, by = 50)

# lapply() always returns a list, so there is no need for sapply() with
# `simplify = FALSE`. Naming the elements by cutoff keeps each summary
# identifiable in the printed result.
setNames(
  lapply(cutoff, function(y) {
    raw.data %>%
      filter(pressure > y) %>%
      group_by(stage) %>%
      summarize(
        MinValue = min(value),
        MaxValue = max(value)
      )
  }),
  cutoff
)
h79rfbju

h79rfbju4#

写一个函数:

#' Summarise `value` per stage for rows above a pressure cutoff
#'
#' @param data A data frame with `pressure`, `value` and `stage` columns.
#' @param cutoff Pressure threshold; only rows with `pressure > cutoff`
#'   are kept.
#' @return The per-stage summary with columns `stage`, `MinValue` and
#'   `MaxValue`.
do_analysis <- function(data, cutoff) {
  # Base-R row subsetting (not dplyr) here, to avoid quoting hassles.
  above_cutoff <- data$pressure > cutoff
  summarize(
    group_by(data[above_cutoff, ], stage),
    MinValue = min(value),
    MaxValue = max(value)
  )
}

# Single cutoff. The dataset is named `raw.data` (with a dot).
result <- do_analysis(raw.data, 3750)
# etc.

# Or run every cutoff in one go; lapply() passes each cutoff as the
# `cutoff` argument because `data` is supplied by name.
cutoffs <- c(3750, 4000, 4250)
results <- lapply(cutoffs, do_analysis, data = raw.data)

相关问题