R语言 根据开始和结束日期填写缺失日期

4xy9mtcn  于 2023-04-18  发布在  其他
关注(0)|答案(3)|浏览(105)

我有以下dataframe df(下面的dput):

> df
  group      date1      date2 value
1     A 2023-01-04 2023-01-06     1
2     A 2023-01-06 2023-01-07     2
3     A 2023-01-08 2023-01-09     3
4     B 2023-01-05 2023-01-06     3
5     B 2023-01-06 2023-01-08     2
6     B 2023-01-08 2023-01-10     1

我想complete之间的开始日期2023-01-01和结束日期2023-01-10缺失的日期.这意味着对于组A的时间间隔2023-01-01的date 1到2023-01-04 date 2;缺少2023-01-072023-01-082023-01-092023-01-10。所需的输出应如下所示:

group      date1      date2 value
1      A 2023-01-01 2023-01-04    NA
2      A 2023-01-04 2023-01-06     1
3      A 2023-01-06 2023-01-07     2
4      A 2023-01-07 2023-01-08    NA
5      A 2023-01-08 2023-01-09     3
6      A 2023-01-09 2023-01-10    NA
7      B 2023-01-01 2023-01-05    NA
8      B 2023-01-05 2023-01-06     3
9      B 2023-01-06 2023-01-08     2
10     B 2023-01-08 2023-01-10     1

正如你所看到的,缺失的日期现在用NA值填充,以使序列完整。所以我想知道是否有人知道如何根据每组的开始和结束日期来完成这些日期?
dput df:

structure(list(group = c("A", "A", "A", "B", "B", "B"), date1 = c("2023-01-04", 
"2023-01-06", "2023-01-08", "2023-01-05", "2023-01-06", "2023-01-08"
), date2 = c("2023-01-06", "2023-01-07", "2023-01-09", "2023-01-06", 
"2023-01-08", "2023-01-10"), value = c(1, 2, 3, 3, 2, 1)), class = "data.frame", row.names = c(NA, 
-6L))
jm81lzqq

jm81lzqq1#

试试这个:

library(dplyr)
start <- "2023-01-01"; end <- "2023-01-10"
df %>%
  group_by(group) %>%
  summarize(newdate1 = c(start, date2), newdate2 = c(date1, end)) %>%
  rename(date1 = newdate1, date2 = newdate2) %>%
  filter(date2 > date1) %>%
  ungroup() %>%
  bind_rows(df) %>%
  arrange(group, date1)
# # A tibble: 10 × 4
#    group date1      date2      value
#    <chr> <chr>      <chr>      <dbl>
#  1 A     2023-01-01 2023-01-04    NA
#  2 A     2023-01-04 2023-01-06     1
#  3 A     2023-01-06 2023-01-07     2
#  4 A     2023-01-07 2023-01-08    NA
#  5 A     2023-01-08 2023-01-09     3
#  6 A     2023-01-09 2023-01-10    NA
#  7 B     2023-01-01 2023-01-05    NA
#  8 B     2023-01-05 2023-01-06     3
#  9 B     2023-01-06 2023-01-08     2
# 10 B     2023-01-08 2023-01-10     1

(In dplyr_1.1.0或更高版本,它可能更喜欢reframe而不是summarize。)

cs7cruho

cs7cruho2#

另一种选择,虽然更长:

# Get start as numeric date
start <- as.numeric(as.Date("2023-01-01"))

# Get end as numeric date
end <- as.numeric(as.Date("2023-01-10"))

# Change character dates to numeric dates
df <- df %>%
    # Change dates
    mutate(date1 = as.numeric(as.Date(date1)),
           date2 = as.numeric(as.Date(date2)))

# For each group, fill in missing dates and bind together
df <- do.call("rbind", lapply(unique(df[["group"]]), \(x){
    # Get temporary data with group of interest
    df_tmp <- dplyr::filter(df, group == x)
    
    # First add start date if this is not present
    if(!(start %in% df_tmp[["date1"]])){
        # Add start date
        df_tmp <- 
            # Bind data together
            rbind(# Start date
                  data.frame(group = x,                   # Current group
                             date1 = start,               # Start date
                             date2 = df_tmp[1, "date1"],  # First date1 in data
                             value = NA),                 # No value
                  # Current data
                  df_tmp)
    }
    
    # Also add last date if this is not present
    if(!(end %in% df_tmp[["date2"]])){
        # Add end date
        df_tmp <- 
            # Bind data together
            rbind(# Start date
                data.frame(group = x,                               # Current group
                           date1 = df_tmp[nrow(df_tmp), "date2"],   # Last date2 in data
                           date2 = end,                             # First date1 in data
                           value = NA),                             # No value
                # Current data
                df_tmp) %>%
            # Rearrange data
            dplyr::arrange(date1, date2)
    }
    
    # Check each row of data to add all missing dates and bind together
    new_rows <- do.call("rbind", lapply(1:(nrow(df_tmp) - 1), \(i){
        # Check if current row date2 does not continue on next row date2
        if(df_tmp[i, "date2"] != df_tmp[i + 1, "date1"]){
            # Create new row
            new_row <- data.frame(group = x,                       # Current group
                                  date1 = df_tmp[i, "date2"],      # Start date
                                  date2 = df_tmp[i + 1, "date1"],  # First date1 in data
                                  value = NA)                      # No value
            
            # Return new row
            return(new_row)
        }}))
    
    # Add new rows to data
    df_tmp <- 
        # Bind data
        rbind(# Current data
              df_tmp,
              # New rows
              new_rows) %>%
        # Rearrange data
        dplyr::arrange(date1, date2) %>%
        # Change numeric dates back to character dates
        mutate(date1 = as.character(as.Date(date1, origin = "1970-01-01")),
               date2 = as.character(as.Date(date2, origin = "1970-01-01")))
}))

给出:

group      date1      date2 value
1      A 2023-01-01 2023-01-04    NA
2      A 2023-01-04 2023-01-06     1
3      A 2023-01-06 2023-01-07     2
4      A 2023-01-07 2023-01-08    NA
5      A 2023-01-08 2023-01-09     3
6      A 2023-01-09 2023-01-10    NA
7      B 2023-01-01 2023-01-05    NA
8      B 2023-01-05 2023-01-06     3
9      B 2023-01-06 2023-01-08     2
10     B 2023-01-08 2023-01-10     1
jhiyze9q

jhiyze9q3#

使用来自ivs的dplyr和iv_set_complement(),一个专用于处理间隔的软件包:

df <- df %>%
  as_tibble() %>%
  mutate(date1 = as.Date(date1), date2 = as.Date(date2)) %>%
  mutate(range = iv(date1, date2), .keep = "unused", .after = group)

df
#> # A tibble: 6 × 3
#>   group                    range value
#>   <chr>               <iv<date>> <dbl>
#> 1 A     [2023-01-04, 2023-01-06)     1
#> 2 A     [2023-01-06, 2023-01-07)     2
#> 3 A     [2023-01-08, 2023-01-09)     3
#> 4 B     [2023-01-05, 2023-01-06)     3
#> 5 B     [2023-01-06, 2023-01-08)     2
#> 6 B     [2023-01-08, 2023-01-10)     1

lower <- as.Date("2023-01-01")
upper <- as.Date("2023-01-10")

# Using `iv_set_complement()` with extended lower/upper bounds
missing <- df %>%
  reframe(
    range = iv_set_complement(range, lower = lower, upper = upper),
    .by = group
  )

missing
#> # A tibble: 4 × 2
#>   group                    range
#>   <chr>               <iv<date>>
#> 1 A     [2023-01-01, 2023-01-04)
#> 2 A     [2023-01-07, 2023-01-08)
#> 3 A     [2023-01-09, 2023-01-10)
#> 4 B     [2023-01-01, 2023-01-05)

# Let `rows_append()` generate `NA`s for unspecified columns
df %>%
  rows_append(missing) %>%
  arrange(group, range)
#> # A tibble: 10 × 3
#>    group                    range value
#>    <chr>               <iv<date>> <dbl>
#>  1 A     [2023-01-01, 2023-01-04)    NA
#>  2 A     [2023-01-04, 2023-01-06)     1
#>  3 A     [2023-01-06, 2023-01-07)     2
#>  4 A     [2023-01-07, 2023-01-08)    NA
#>  5 A     [2023-01-08, 2023-01-09)     3
#>  6 A     [2023-01-09, 2023-01-10)    NA
#>  7 B     [2023-01-01, 2023-01-05)    NA
#>  8 B     [2023-01-05, 2023-01-06)     3
#>  9 B     [2023-01-06, 2023-01-08)     2
#> 10 B     [2023-01-08, 2023-01-10)     1

相关问题