根据ID和R中的最大差异合并时间戳最近的两个 Dataframe

mklgxw1f  于 2023-03-15  发布在  其他
关注(0)|答案(2)|浏览(94)

我有两个 Dataframe 要合并,它们必须基于某个ID的最接近的timestamp进行连接,如果时间差低于特定值因此,对于给定ID,df1中的每个timestamp需要与df2中最接近的timestamp匹配。如果时间差超过某个值,应该是NA。我尝试了多种方法将它们结合使用sqldfnearestTimeandID,但似乎不成功。任何帮助都将不胜感激。
数据如下所示:

# dput(head(df1))

structure(list(timestamp = structure(c(1660048051, 1660049817, 
1660051642, 1660053416, 1660055228, 1660057017), class = c("POSIXct", 
"POSIXt"), tzone = "UTC"), location_long = c(2.7305546, 2.730441, 
2.7301144, 2.732173, 2.7319352, 2.7319432), location_lat = c(51.1519968, 
51.1520704, 51.1519392, 51.1509984, 51.1511456, 51.1511168), 
    ID = c("16000002ba", "16000002ba", "16000002ba", "16000002ba", 
    "16000002ba", "16000002ba")), row.names = c("1.1", "1.2", 
"1.3", "1.4", "1.5", "1.6"), class = "data.frame")

# dput(head(df2))

structure(list(ID = c("16000002ba", "16000002ba", "16000002ba", 
"16000002ba", "16000002ba", "16000002ba"), Collecting_time = c("2022-08-07 23:59:05 UTC", 
"2022-08-08 00:00:03 UTC", "2022-08-08 00:30:39 UTC", "2022-08-08 01:00:16 UTC", 
"2022-08-08 01:30:51 UTC", "2022-08-08 02:00:25 UTC"), ODBA = c(110L, 
290L, 260L, 220L, 210L, 410L)), row.names = 290345:290350, class = "data.frame")
gev0vcfq

gev0vcfq1#

我们可以使用{powerjoin}包,并使用公式指定by参数。我也编辑了df1的第一个条目,以获得至少一个连接。我建议使用df2,这样我们实际上就有了一个日期时间对象。
我不确定时间间隔是+/- 10分钟还是只小于10分钟,下面我应用了后者,但你可以玩周围的设置。

library(dplyr)
library(powerjoin)

df2 <- df2 |>
  mutate(timestamp = as.POSIXct(Collecting_time, tz = "UTC"))

power_left_join(
  df1,
  df2,
  by = c("ID",
         ~ difftime(.y$timestamp, .x$timestamp, units = "mins") < 10,
         ~ difftime(.y$timestamp, .x$timestamp, units = "mins") > 0)
)
#>           timestamp.x location_long location_lat         ID
#> 1 2022-08-09 12:27:31      2.730555     51.15200 16000002ba
#> 2 2022-08-09 12:56:57      2.730441     51.15207 16000002ba
#> 3 2022-08-09 13:27:22      2.730114     51.15194 16000002ba
#> 4 2022-08-09 13:56:56      2.732173     51.15100 16000002ba
#> 5 2022-08-09 14:27:08      2.731935     51.15115 16000002ba
#> 6 2022-08-09 14:56:57      2.731943     51.15112 16000002ba
#>           Collecting_time ODBA         timestamp.y
#> 1 2022-08-09 12:36:00 UTC  110 2022-08-09 12:36:00
#> 2                    <NA>   NA                <NA>
#> 3                    <NA>   NA                <NA>
#> 4                    <NA>   NA                <NA>
#> 5                    <NA>   NA                <NA>
#> 6                    <NA>   NA                <NA>

数据来自OP

df1 <- structure(list(timestamp = structure(c(1660048051, 1660049817, 
                                       1660051642, 1660053416, 1660055228, 1660057017), class = c("POSIXct", 
                                                                                                  "POSIXt"), tzone = "UTC"), location_long = c(2.7305546, 2.730441, 
                                                                                                                                               2.7301144, 2.732173, 2.7319352, 2.7319432), location_lat = c(51.1519968, 
                                                                                                                                                                                                            51.1520704, 51.1519392, 51.1509984, 51.1511456, 51.1511168), 
               ID = c("16000002ba", "16000002ba", "16000002ba", "16000002ba", 
                      "16000002ba", "16000002ba")), row.names = c("1.1", "1.2", 
                                                                  "1.3", "1.4", "1.5", "1.6"), class = "data.frame")

df2 <- structure(list(ID = c("16000002ba", "16000002ba", "16000002ba", 
                      "16000002ba", "16000002ba", "16000002ba"), Collecting_time = c("2022-08-09 12:36:00 UTC", 
                                                                                     "2022-08-08 00:00:03 UTC", "2022-08-08 00:30:39 UTC", "2022-08-08 01:00:16 UTC", 
                                                                                     "2022-08-08 01:30:51 UTC", "2022-08-08 02:00:25 UTC"), ODBA = c(110L, 
                                                                                                                                                     290L, 260L, 220L, 210L, 410L)), row.names = 290345:290350, class = "data.frame")

创建于2023年3月13日,使用reprex v2.0.2

ujv3wf0j

ujv3wf0j2#

我不确定我是否正确地理解了你的意思。但也许这会有所帮助。注意我编辑了df2的第一个条目以获得实际的匹配。

library(fuzzyjoin)
library(dplyr)

df1 <-
structure(list(timestamp = structure(c(1660048051, 1660049817, 
                                       1660051642, 1660053416, 1660055228, 1660057017), class = c("POSIXct", 
                                                                                                  "POSIXt"), tzone = "UTC"), location_long = c(2.7305546, 2.730441, 
                                                                                                                                               2.7301144, 2.732173, 2.7319352, 2.7319432), location_lat = c(51.1519968, 
                                                                                                                                                                                                            51.1520704, 51.1519392, 51.1509984, 51.1511456, 51.1511168), 
               ID = c("16000002ba", "16000002ba", "16000002ba", "16000002ba", 
                      "16000002ba", "16000002ba")), row.names = c("1.1", "1.2", 
                                                                  "1.3", "1.4", "1.5", "1.6"), class = "data.frame")

df2 <-

structure(list(ID = c("16000002ba", "16000002ba", "16000002ba", 
                      "16000002ba", "16000002ba", "16000002ba"), Collecting_time = c("2022-08-09 12:36:05 UTC", 
                                                                                     "2022-08-08 00:00:03 UTC", "2022-08-08 00:30:39 UTC", "2022-08-08 01:00:16 UTC", 
                                                                                     "2022-08-08 01:30:51 UTC", "2022-08-08 02:00:25 UTC"), ODBA = c(110L, 
timing <- 10                                                                                                                                                 290L, 260L, 220L, 210L, 410L)), row.names = 290345:290350, class = "data.frame")

# Convert Collecting_time to POSIXct format
df2$Collecting_time <- as.POSIXct(df2$Collecting_time, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")

# Join df1 and df2 based on the closest timestamp within x minutes defined in "timing" for each ID
df_join <- fuzzy_left_join(df1, df2, 
                           by = c("ID" = "ID", "timestamp" = "Collecting_time"),
                           match_fun = list(`==`, function(x, y) abs(difftime(x, y, units = "mins")) <= timing))

# Select the columns you need
df_join <- select(df_join, timestamp, location_long, location_lat, ID.x, Collecting_time, ODBA)

# Rename the columns
names(df_join)[5] <- "closest_Collecting_time"
names(df_join)[6] <- "ODBA_of_closest_timestamp"

df_join

相关问题