R语言 如何计算首末分差

whhtz7ly  于 2023-02-06  发布在  其他
关注(0)|答案(2)|浏览(122)

我想根据日期计算每个人的第一个得分和最后一个得分之间的差值。原始数据如下所示:

ID <- c(1,1,1,2,2,3,3,3,3,4)
Score <- c(3,2,1,1,2,0,0,3,4,0)
Date <- c("2020/01/01","2020/01/02","2020/01/03","2020/02/05","2020/02/06","2021/10/01","2021/10/02","2021/10/03","2021/10/04","2022/03/01")

a <- data.frame(ID,Score,Date)

所需数据集:

diff_first_last <- c(-2,-2,-2,1,1,4,4,4,4,0)
b <- data.frame(ID,Score,Date,diff_first_last)

有没有一种方法可以很容易地做到这一点,因为我想把它应用到一个更大的数据集。我会感谢所有的帮助有!谢谢!!!

yebdmbv4

yebdmbv41#

base 中,可以使用ave,假设它是按日期排序的。

ave(a$Score, a$ID, FUN = \(x) x[length(x)] - x[1])
# [1] -2 -2 -2  1  1  4  4  4  4  0

以防它没有排序。

a <- a[order(as.Date(a$Date)),]

基准

set.seed(42)
a <- data.frame(ID = sample(0:9999, 1e5, TRUE), Score = sample(0:5, 1e5, TRUE),
      Date = c("2020/01/01","2020/01/02","2020/01/03","2020/02/05","2020/02/06",
           "2021/10/01","2021/10/02","2021/10/03","2021/10/04","2022/03/01"))

library(dplyr)
bench::mark(check=FALSE,
DPH = {dplyr::group_by(a, ID) %>%
    dplyr::mutate(Date = lubridate::ymd(Date)) %>% 
    dplyr::arrange(Date) %>%
    mutate(diff_first_last = dplyr::last(Score) - dplyr::first(Score)) %>%
    dplyr::ungroup()},
DPH2 = {dplyr::group_by(a, ID) %>%
    dplyr::mutate(Date = as.Date(Date)) %>%
    dplyr::arrange(Date) %>%
   dplyr::mutate(diff_first_last = dplyr::last(Score) - dplyr::first(Score)) %>%
     dplyr::ungroup()},
GKi = {. <- a[order(as.Date(a$Date)),]
  cbind(., diff_first_last = ave(.$Score, .$ID, FUN = \(x) x[length(x)] - x[1]))
  }
)
#  expres…¹      min   median itr/s…² mem_a…³ gc/se…⁴ n_itr  n_gc total_…⁵ result
#  <bch:ex> <bch:tm> <bch:tm>   <dbl> <bch:b>   <dbl> <int> <dbl> <bch:tm> <list>
#1 DPH        18.91s   18.91s  0.0529 137.6MB    2.33     1    44   18.91s <NULL>
#2 DPH2        1.37s    1.37s  0.732   11.5MB    3.66     1     5    1.37s <NULL>
#3 GKi      325.96ms 326.36ms  3.06    25.8MB    0        2     0 652.73ms <NULL>

在这种情况下,base 解决方案比DPH2快大约4倍,但使用的内存是DPH2的两倍。

zpf6vheq

zpf6vheq2#

这是一种可能的DPLYR解决方案:

library(dplyr)
# build groupings by ID
dplyr::group_by(a, ID) %>%
    # convert date from text to date to be able to order by it
    dplyr::mutate(Date = lubridate::ymd(Date)) %>% 
    # order by date just to be sure
    dplyr::arrange(Date) %>%
    # calculate first last diference 
    mutate(diff_first_last = dplyr::last(Score) - dplyr::first(Score)) %>%
    # ungroup to prevent unwanted behaviour downstream
    dplyr::ungroup()

 # A tibble: 10 x 4
      ID Score Date       diff_first_last
   <dbl> <dbl> <date>               <dbl>
 1     1     3 2020-01-01              -2
 2     1     2 2020-01-02              -2
 3     1     1 2020-01-03              -2
 4     2     1 2020-02-05               1
 5     2     2 2020-02-06               1
 6     3     0 2021-10-01               4
 7     3     0 2021-10-02               4
 8     3     3 2021-10-03               4
 9     3     4 2021-10-04               4
10     4     0 2022-03-01               0

编辑:正如@GKi指出的代码很慢-关键的优化是将lubridate::ymd()更改为as.Date()...通过改变组大小但保持唯一观察的数量或多或少相同来测试更大的数据集,我们观察到以下性能(dplyr解决方案在大组上表现更好):

myfun <- function(dys, rps, edt) {
    # set up dummy data
    ID <- sort(rep(1:rps, dys))
    Score <- sample(0:99, dys, replace = TRUE)
    Date <- seq.Date(from = as.Date("2021-01-01"), to = as.Date(edt), by = "days")
    a <- data.frame(ID, Score, Date)
    # shuffle the dummy data
    a <- a[sample(1:nrow(a)), ]
    # show number of unique values
    print(nrow(unique(a)))
    # use supplied benchmark function with the critical twist 
    bench::mark(check=FALSE,
    DPH = {dplyr::group_by(a, ID) %>%
        dplyr::mutate(Date = as.Date(Date)) %>% # critical change of lubridate::ymd) to as.Date()
        dplyr::arrange(Date) %>%
        dplyr::mutate(diff_first_last = dplyr::last(Score) - dplyr::first(Score)) %>%
        dplyr::ungroup()},
    GKi = {. <- a[order(as.Date(a$Date)),]
      cbind(., diff_first_last = ave(.$Score, .$ID, FUN = \(x) x[length(x)] - x[1]))
      }
    )
}

 myfun(15, 6400, "2021-01-15")
[1] 96000
# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory               time           gc              
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>               <list>         <list>          
1 DPH           229ms    229ms      4.37    9.42MB     4.37     1     1      229ms <NULL> <Rprofmem [856 x 3]> <bench_tm [2]> <tibble [2 x 3]>
2 GKi           143ms    153ms      6.58   13.99MB     0        4     0      608ms <NULL> <Rprofmem [218 x 3]> <bench_tm [4]> <tibble [4 x 3]>

 myfun(46, 2400, "2021-02-15")
[1] 110400
# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                  time           gc              
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                  <list>         <list>          
1 DPH           108ms    111ms      9.09    12.8MB     3.03     3     1      330ms <NULL> <Rprofmem [12,127 x 3]> <bench_tm [4]> <tibble [4 x 3]>
2 GKi           175ms    181ms      5.48      16MB     0        3     0      548ms <NULL> <Rprofmem [4,838 x 3]>  <bench_tm [3]> <tibble [3 x 3]>

 myfun(90, 1200, "2021-03-31")
[1] 108000
# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                 time           gc              
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                 <list>         <list>          
1 DPH          67.8ms   69.9ms     14.2     11.8MB     2.36     6     1      423ms <NULL> <Rprofmem [6,127 x 3]> <bench_tm [7]> <tibble [7 x 3]>
2 GKi         166.9ms  171.1ms      5.82    15.5MB     0        3     0      516ms <NULL> <Rprofmem [2,438 x 3]> <bench_tm [3]> <tibble [3 x 3]>

 myfun(181, 600, "2021-06-30")
[1] 108600
# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                 time           gc              
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                 <list>         <list>          
1 DPH          50.2ms   52.9ms     16.9     11.5MB     2.42     7     1      413ms <NULL> <Rprofmem [3,131 x 3]> <bench_tm [8]> <tibble [8 x 3]>
2 GKi         172.8ms  175.2ms      5.66    15.5MB     0        3     0      530ms <NULL> <Rprofmem [1,238 x 3]> <bench_tm [3]> <tibble [3 x 3]>

 myfun(365, 300, "2021-12-31")
[1] 109500
# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                 time            gc               
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                 <list>          <list>           
1 DPH            39ms   47.1ms     19.9     11.4MB     2.21     9     1      452ms <NULL> <Rprofmem [1,634 x 3]> <bench_tm [10]> <tibble [10 x 3]>
2 GKi           171ms    183ms      5.48    15.5MB     0        3     0      547ms <NULL> <Rprofmem [649 x 3]>   <bench_tm [3]>  <tibble [3 x 3]> 

 myfun(730, 150, "2022-12-31")
[1] 109500
# A tibble: 2 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory               time           gc              
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>               <list>         <list>          
1 DPH            33ms     35ms     26.5     11.3MB     4.41     6     1      227ms <NULL> <Rprofmem [884 x 3]> <bench_tm [7]> <tibble [7 x 3]>
2 GKi           164ms    165ms      5.94    15.5MB     0        3     0      505ms <NULL> <Rprofmem [345 x 3]> <bench_tm [3]> <tibble [3 x 3]>

相关问题