如果时间发生在R中给定的时间间隔内,如何用特定的值(包括MS)标记行

oxf4rvwz  于 11个月前  发布在  其他
关注(0)|答案(2)|浏览(87)

我有两个时间帧。DF 1的时间间隔包括毫秒,有一个对应的值(行为)。DF 2有一列时间(包括毫秒)。
DF1:

START                   STOP behavior
1 2023-07-20 14:07:39.00 2023-07-20 12:07:39.14      dig
2 2023-07-20 14:07:39.27 2023-07-20 12:07:39.90      pig

字符串
DF2:

x                time label
133202 20/07/2023 12:07:39.020 2023-07-20 12:07:39    NA
133203 20/07/2023 12:07:39.040 2023-07-20 12:07:39    NA
133204 20/07/2023 12:07:39.060 2023-07-20 12:07:39    NA
133205 20/07/2023 12:07:39.080 2023-07-20 12:07:39    NA
133206 20/07/2023 12:07:39.100 2023-07-20 12:07:39    NA
133207 20/07/2023 12:07:39.120 2023-07-20 12:07:39    NA
133208 20/07/2023 12:07:39.140 2023-07-20 12:07:39    NA
133209 20/07/2023 12:07:39.160 2023-07-20 12:07:39    NA
133210 20/07/2023 12:07:39.180 2023-07-20 12:07:39    NA
133211 20/07/2023 12:07:39.200 2023-07-20 12:07:39    NA
133212 20/07/2023 12:07:39.220 2023-07-20 12:07:39    NA
133213 20/07/2023 12:07:39.240 2023-07-20 12:07:39    NA
133214 20/07/2023 12:07:39.260 2023-07-20 12:07:39    NA
133215 20/07/2023 12:07:39.280 2023-07-20 12:07:39    NA
133216 20/07/2023 12:07:39.300 2023-07-20 12:07:39    NA
133217 20/07/2023 12:07:39.320 2023-07-20 12:07:39    NA
133218 20/07/2023 12:07:39.340 2023-07-20 12:07:39    NA
133219 20/07/2023 12:07:39.360 2023-07-20 12:07:39    NA
133220 20/07/2023 12:07:39.380 2023-07-20 12:07:39    NA
133221 20/07/2023 12:07:39.400 2023-07-20 12:07:39    NA


我想生成一个DF 2的嵌套框架,但如果时间发生在DF 1的时间间隔(包括毫秒)内,则有一列对应于DF 1中的行为。
所需输出:

x                time label
133202 20/07/2023 12:07:39.020 2023-07-20 12:07:39   dig
133203 20/07/2023 12:07:39.040 2023-07-20 12:07:39   dig
133204 20/07/2023 12:07:39.060 2023-07-20 12:07:39   dig
133205 20/07/2023 12:07:39.080 2023-07-20 12:07:39   dig
133206 20/07/2023 12:07:39.100 2023-07-20 12:07:39   dig
133207 20/07/2023 12:07:39.120 2023-07-20 12:07:39   dig
133208 20/07/2023 12:07:39.140 2023-07-20 12:07:39   dig
133209 20/07/2023 12:07:39.160 2023-07-20 12:07:39  <NA>
133210 20/07/2023 12:07:39.180 2023-07-20 12:07:39  <NA>
133211 20/07/2023 12:07:39.200 2023-07-20 12:07:39  <NA>
133212 20/07/2023 12:07:39.220 2023-07-20 12:07:39  <NA>
133213 20/07/2023 12:07:39.240 2023-07-20 12:07:39  <NA>
133214 20/07/2023 12:07:39.260 2023-07-20 12:07:39  <NA>
133215 20/07/2023 12:07:39.280 2023-07-20 12:07:39   pig
133216 20/07/2023 12:07:39.300 2023-07-20 12:07:39   pig
133217 20/07/2023 12:07:39.320 2023-07-20 12:07:39   pig
133218 20/07/2023 12:07:39.340 2023-07-20 12:07:39   pig
133219 20/07/2023 12:07:39.360 2023-07-20 12:07:39   pig
133220 20/07/2023 12:07:39.380 2023-07-20 12:07:39   pig
133221 20/07/2023 12:07:39.400 2023-07-20 12:07:39   pig


以下是数据输入:
DF1

用于测试的新框架#

START<-c(“2023-07-20 14:07:39.01”,“2023-07-20 14:07:39.29”)
START<- as.POSIXct(START,“%Y-%m-%d %H:%M:%OS”,tz =“UTC”)
START<-format(START,“%Y-%m-%d %H:%M:%OS3”)
STOP<-c(“2023-07-20 14:07:39.14”,“2023-07-20 14:07:39.90”)
STOP<- as.POSIXct(STOP,“%Y-%m-%d %H:%M:%OS”,tz =“UTC”)
STOP<-format(STOP,“%Y-%m-%d %H:%M:%OS2”)
behavior<-c(“dig”,“pig”)
DF1<-data.frame(START,STOP,behavior)
DF1$START<- as.POSIXct(DF1$START,“%Y-%m-%d %H:%M:%OS”,tz =“UTC”)
DF1$STOP<-as.POSIXct(DF1$STOP,“%Y-%m-%d %H:%M:%OS”,tz =“UTC”)
DF1$START<-format(DF1$START,“%Y-%m-%d %H:%M:%OS2”)
DF1$STOP<-format(DF1$STOP,“%Y-%m-%d %H:%M:%OS2”)
DF 2(x是以ms表示的时间,使用format(df2$x, "%Y-%m-%d %H:%M:%OS2")

structure(list(x = c("20/07/2023 12:07:39.020", "20/07/2023 12:07:39.040", 
"20/07/2023 12:07:39.060", "20/07/2023 12:07:39.080", "20/07/2023 12:07:39.100", 
"20/07/2023 12:07:39.120", "20/07/2023 12:07:39.140", "20/07/2023 12:07:39.160", 
"20/07/2023 12:07:39.180", "20/07/2023 12:07:39.200", "20/07/2023 12:07:39.220", 
"20/07/2023 12:07:39.240", "20/07/2023 12:07:39.260", "20/07/2023 12:07:39.280", 
"20/07/2023 12:07:39.300", "20/07/2023 12:07:39.320", "20/07/2023 12:07:39.340", 
"20/07/2023 12:07:39.360", "20/07/2023 12:07:39.380", "20/07/2023 12:07:39.400"
), time = structure(c(1689854859.02, 1689854859.04, 1689854859.06, 
1689854859.08, 1689854859.1, 1689854859.12, 1689854859.14, 1689854859.16, 
1689854859.18, 1689854859.2, 1689854859.22, 1689854859.24, 1689854859.26, 
1689854859.28, 1689854859.3, 1689854859.32, 1689854859.34, 1689854859.36, 
1689854859.38, 1689854859.4), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    label = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA)), row.names = 133202:133221, class = "data.frame")


我想制作这个:

dput(df2)
structure(list(x = c("20/07/2023 12:07:39.020", "20/07/2023 12:07:39.040", 
"20/07/2023 12:07:39.060", "20/07/2023 12:07:39.080", "20/07/2023 12:07:39.100", 
"20/07/2023 12:07:39.120", "20/07/2023 12:07:39.140", "20/07/2023 12:07:39.160", 
"20/07/2023 12:07:39.180", "20/07/2023 12:07:39.200", "20/07/2023 12:07:39.220", 
"20/07/2023 12:07:39.240", "20/07/2023 12:07:39.260", "20/07/2023 12:07:39.280", 
"20/07/2023 12:07:39.300", "20/07/2023 12:07:39.320", "20/07/2023 12:07:39.340", 
"20/07/2023 12:07:39.360", "20/07/2023 12:07:39.380", "20/07/2023 12:07:39.400"
), time = structure(c(1689854859.02, 1689854859.04, 1689854859.06, 
1689854859.08, 1689854859.1, 1689854859.12, 1689854859.14, 1689854859.16, 
1689854859.18, 1689854859.2, 1689854859.22, 1689854859.24, 1689854859.26, 
1689854859.28, 1689854859.3, 1689854859.32, 1689854859.34, 1689854859.36, 
1689854859.38, 1689854859.4), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    label = c("dig", "dig", "dig", "dig", "dig", "dig", "dig", 
    NA, NA, NA, NA, NA, NA, "pig", "pig", "pig", "pig", "pig", 
    "pig", "pig")), row.names = 133202:133221, class = "data.frame")


我可以使用以下代码生成所需的结果:

for (i in 1:nrow(df2)) {
  time_val <- df2$time[i]
  # Find the row where time_val falls within the interval
  interval_row <- which(time_val >= df1$START & time_val <= df1$STOP)
  
  # If there's a match, replace NA in label with the corresponding behavior
  if (length(interval_row) > 0) {
    behavior_val <- df1$behavior[interval_row]
    df2$label[i] <- behavior_val
  }
}

与DF 1的虚拟数据。
当我尝试使用完整的DF 1运行时,它似乎找不到时间间隔(抛出错误“Error in df2$label[i] <- behavior_瓦尔:替换长度为零”,即使str()输出是相同的。我不知道为什么。我还注意到,这似乎只在我使用DF 1的列时才有效,当它们是format(DF1$STOP, "%Y-%m-%d %H:%M:%OS2")时,当它们被格式化为POSIXct,这让我觉得这是一个非常制作的方式来做我认为是一个简单的问题!
我也认为这是一个混乱的解决方案,for循环需要一段时间才能运行。有没有更好的方法来获得我想要的结果,同时仍然考虑时间间隔和数据中的毫秒数?
我开始研究data.table解决方案,但它听起来并不适合用于毫秒级的时间间隔?
救命啊!
感谢您发送编修。

5gfr0r5j

5gfr0r5j1#

如果我理解正确的话,您希望将行为合并到DF2,这取决于x福尔斯是否落入STARTSTOP定义的区间?
下面,我假设DF1中的时间不是14 h而是12 h:

library(tidyverse)

DF1 <- as_tibble(DF1) |> 
  mutate(across(c(START, STOP), ymd_hms))

DF2 <- as_tibble(DF2) |> 
  mutate(x = dmy_hms(x))

DF2 |> 
  left_join(DF1, join_by(between(x, START, STOP))) |> 
  select(x, time, behavior)
#> # A tibble: 20 × 3
#>    x                   time                behavior
#>    <dttm>              <dttm>              <chr>   
#>  1 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
#>  2 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
#>  3 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
#>  4 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
#>  5 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
#>  6 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
#>  7 2023-07-20 12:07:39 2023-07-20 12:07:39 dig     
#>  8 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>    
#>  9 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>    
#> 10 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>    
#> 11 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>    
#> 12 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>    
#> 13 2023-07-20 12:07:39 2023-07-20 12:07:39 <NA>    
#> 14 2023-07-20 12:07:39 2023-07-20 12:07:39 pig     
#> 15 2023-07-20 12:07:39 2023-07-20 12:07:39 pig     
#> 16 2023-07-20 12:07:39 2023-07-20 12:07:39 pig     
#> 17 2023-07-20 12:07:39 2023-07-20 12:07:39 pig     
#> 18 2023-07-20 12:07:39 2023-07-20 12:07:39 pig     
#> 19 2023-07-20 12:07:39 2023-07-20 12:07:39 pig     
#> 20 2023-07-20 12:07:39 2023-07-20 12:07:39 pig

字符串
创建于2023-12-08带有reprex v2.0.2

xj3cbfub

xj3cbfub2#

在你的for循环中,你刚刚错过了提供一个else df2$label[i] <- NA_character_。处理日期和时间可能很棘手,你可能在时区上失败了,它们应该彼此一致。

df2[1:2] <- lapply(df2[1:2], as.POSIXct, format='%d/%m/%Y %H:%M:%OS', tz='GMT')

字符串

您可以使用data.table,因为它快得多。

我们在 * df 1 * 的行上执行sapply,并查看df2$time是否为betweenSTARTSTOP。注意,我们使用data.table::between;它抛出一个布尔矩阵,我们在其中确定max.col,如果两者都是FALSE,则我们用NA执行replace

library(data.table)
> setDT(df2)[, label := df1$behavior[
+   max.col(r <- sapply(seq_len(nrow(df1)), \(i) 
+                       data.table::between(df2$x, 
+                                           df1[i, ]$START, 
+                                           df1[i, ]$STOP))) |>
+     replace(rowSums(r) == 0, NA)]]
> df2
                         x                   time label
 1: 2023-07-20 12:07:39.01 2023-07-20 12:07:39.01   dig
 2: 2023-07-20 12:07:39.03 2023-07-20 12:07:39.03   dig
 3: 2023-07-20 12:07:39.05 2023-07-20 12:07:39.05   dig
 4: 2023-07-20 12:07:39.07 2023-07-20 12:07:39.07   dig
 5: 2023-07-20 12:07:39.09 2023-07-20 12:07:39.09   dig
 6: 2023-07-20 12:07:39.11 2023-07-20 12:07:39.11   dig
 7: 2023-07-20 12:07:39.14 2023-07-20 12:07:39.14   dig
 8: 2023-07-20 12:07:39.16 2023-07-20 12:07:39.16  <NA>
 9: 2023-07-20 12:07:39.18 2023-07-20 12:07:39.18  <NA>
10: 2023-07-20 12:07:39.20 2023-07-20 12:07:39.20  <NA>
11: 2023-07-20 12:07:39.22 2023-07-20 12:07:39.22  <NA>
12: 2023-07-20 12:07:39.24 2023-07-20 12:07:39.24  <NA>
13: 2023-07-20 12:07:39.25 2023-07-20 12:07:39.25  <NA>
14: 2023-07-20 12:07:39.27 2023-07-20 12:07:39.27   pig
15: 2023-07-20 12:07:39.29 2023-07-20 12:07:39.29   pig
16: 2023-07-20 12:07:39.31 2023-07-20 12:07:39.31   pig
17: 2023-07-20 12:07:39.33 2023-07-20 12:07:39.33   pig
18: 2023-07-20 12:07:39.35 2023-07-20 12:07:39.35   pig
19: 2023-07-20 12:07:39.38 2023-07-20 12:07:39.38   pig
20: 2023-07-20 12:07:39.40 2023-07-20 12:07:39.40   pig

基准测试

> f_for <- \(df2) {
+   for (i in 1:nrow(df2)) {
+     time_val <- df2$time[i]
+     interval_row <- which(time_val >= df1$START & time_val <= df1$STOP)
+     if (length(interval_row) > 0) {
+       df2$label[i] <- df1$behavior[interval_row]
+     } else {
+       df2$label[i] <- NA_character_
+     }
+   }
+   df2
+ }
> f_tidy <- \(df2) {
+   df2 |> 
+     dplyr::left_join(df1, dplyr::join_by(dplyr::between(x, START, STOP))) |> 
+     dplyr::select(x, time, behavior)
+ }
> f_dt <- \(dt) setDT(dt)[, label := df1$behavior[
+   max.col(r <- sapply(seq_len(nrow(df1)), \(i) 
+                       data.table::between(dt$x, 
+                                           df1[i, ]$START, 
+                                           df1[i, ]$STOP))) |>
+     replace(rowSums(r) == 0, NA)]]
> dt3 <- df3 <- df2[sample.int(nrow(df2), 5e3, replace=TRUE), ] |> as.data.frame()
> mb <- microbenchmark::microbenchmark(`for`=f_for(df3), 
+                                      `tidy`=f_tidy(df3),
+                                      `data.table`=f_dt(dt3))

$ Rscript --vanilla foo.R
Unit: milliseconds
       expr        min          lq        mean      median          uq        max neval cld
        for 651.363713 1178.234978 1197.410804 1200.984304 1224.139109 1324.71263   100  a 
       tidy   6.018196    6.531816   12.450828    6.837905    7.122139  557.42749   100   b
 data.table   1.664823    1.852909    2.199552    2.009043    2.112688   14.60731   100   b

> plot(mb, log='y')


x1c 0d1x的数据

  • 数据类型:*
> dput(df1)
structure(list(START = structure(c(1689854859, 1689854859.27), class = c("POSIXct", 
"POSIXt"), tzone = "GMT"), STOP = structure(c(1689854859.14, 
1689854859.9), class = c("POSIXct", "POSIXt"), tzone = "GMT"), 
    behavior = c("dig", "pig")), row.names = c("1", "2"), class = "data.frame")
> dput(df2)
structure(list(x = c("20/07/2023 12:07:39.020", "20/07/2023 12:07:39.040", 
"20/07/2023 12:07:39.060", "20/07/2023 12:07:39.080", "20/07/2023 12:07:39.100", 
"20/07/2023 12:07:39.120", "20/07/2023 12:07:39.140", "20/07/2023 12:07:39.160", 
"20/07/2023 12:07:39.180", "20/07/2023 12:07:39.200", "20/07/2023 12:07:39.220", 
"20/07/2023 12:07:39.240", "20/07/2023 12:07:39.260", "20/07/2023 12:07:39.280", 
"20/07/2023 12:07:39.300", "20/07/2023 12:07:39.320", "20/07/2023 12:07:39.340", 
"20/07/2023 12:07:39.360", "20/07/2023 12:07:39.380", "20/07/2023 12:07:39.400"
), time = structure(c(1689854859.02, 1689854859.04, 1689854859.06, 
1689854859.08, 1689854859.1, 1689854859.12, 1689854859.14, 1689854859.16, 
1689854859.18, 1689854859.2, 1689854859.22, 1689854859.24, 1689854859.26, 
1689854859.28, 1689854859.3, 1689854859.32, 1689854859.34, 1689854859.36, 
1689854859.38, 1689854859.4), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    label = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA)), row.names = 133202:133221, class = "data.frame")

相关问题