R语言 按开闭日期统计

iovurdzv  于 2023-05-04  发布在  其他
关注(0)|答案(3)|浏览(180)

我在R中有一个数据框架,它给我每个城镇和地区的每家商店的开业和关门日期,像这样:

town <- c("A", "A", "A", "B")
opening <- as.Date(c("1900-01-01", "1905-02-05", "1906-01-01", "1910-01-01"))
closing <- as.Date(c(NA, NA, NA, "1913-03-03"))

df <- data.frame(town, opening, closing)

我想计算每个城镇每年(年底)的商店数量,将每个观察结果对应于一个城镇和一年,这样我就得到了这个:

town_final <- c("A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B")
year <- c(1900, 1901, 1902, 1903, 1904, 1905, 1900, 1901, 1902, 1903, 1904, 1905)
stores <- c(1, 1, 2, 3, 3, 3, 0, 0, 0, 1, 1, 0)

df_final <- data.frame(town_final, year, stores)

我有大量的观察结果,所以我必须用循环来做这件事,但我不确定如何正确地做到这一点。我非常感谢任何帮助。

erhoui1w

erhoui1w1#

这里有一个tidyverse解决方案,使用了一些值得了解的工具:

library(tidyverse)

# sample data
df <- tibble(
    town = c("A", "A", "A", "B"),
    opening = as.Date(c("1900-01-01", "1905-02-05", "1906-01-01", "1910-01-01")),
    closing = as.Date(c(NA, NA, NA, "1913-03-03"))
)

首先,我们为每个商店创建一个唯一的ID,清理开始和结束年份,并创建从year_openedyear_closed的整数序列:

df_intermediate <- df %>% 
    mutate(
        # make a unique ID for to track each store
        store_id = uuid::UUIDgenerate(n = n()),
        # ceiling to New Years after opening and extract year
        year_opened = year(ceiling_date(opening, unit = 'year', change_on_boundary = FALSE)),
        # use current year for still-open stores, and extract year
        year_closed = year(coalesce(closing, Sys.Date())),
        # iterate in parallel over open and close years and make a sequence of open years for 
        # each row, in a list column
        open_years = map2(year_opened, year_closed, seq)
    )

df_intermediate
#> # A tibble: 4 × 7
#>   town  opening    closing    store_id        year_opened year_closed open_years
#>   <chr> <date>     <date>     <chr>                 <dbl>       <dbl> <list>    
#> 1 A     1900-01-01 NA         55151ece-6295-…        1900        2023 <int>     
#> 2 A     1905-02-05 NA         7d34b1a6-86fe-…        1906        2023 <int>     
#> 3 A     1906-01-01 NA         9db13517-ed09-…        1906        2023 <int>     
#> 4 B     1910-01-01 1913-03-03 3c7af4e2-3ca2-…        1910        1913 <int [4]>

然后,我们将列子集化,将列表列解嵌套为长格式,对每个城镇和年份的不同商店进行分组和计数,并进行排序:

df_out <- df_intermediate %>% 
    # subset to cols we care about and rename
    select(town, store_id, year = open_years) %>%
    # expand year list col, duplicating town and store_id for each year
    unnest(c(year)) %>% 
    group_by(town, year) %>%
    # count distinct stores open per town and year
    summarise(n_stores_open = n_distinct(store_id), .groups = 'drop') %>%
    # sort
    arrange(town, year)

df_out
#> # A tibble: 128 × 3
#>    town   year n_stores_open
#>    <chr> <int>         <int>
#>  1 A      1900             1
#>  2 A      1901             1
#>  3 A      1902             1
#>  4 A      1903             1
#>  5 A      1904             1
#>  6 A      1905             1
#>  7 A      1906             3
#>  8 A      1907             3
#>  9 A      1908             3
#> 10 A      1909             3
#> # ℹ 118 more rows
ijnw1ujt

ijnw1ujt2#

一个潜在的{tidyverse}解决方案:

library(dplyr)
library(tidyr)
library(lubridate)

town <- c("A", "A", "A", "B")
opening <- as.Date(c("1900-01-01", "1905-02-05", "1906-01-01", "1910-01-01"))
closing <- as.Date(c(NA, NA, NA, "1913-03-03"))

df <- data.frame(town, opening, closing)
stores_open <- expand.grid(town_final = unique(town),
                           years = 1900:1905)

stores_open |> 
  as_tibble() |> 
  left_join(df, by = c("town_final" = "town"), multiple = "all") |> 
  mutate(end_year = ymd(paste0(years, "1231")),
         closing = replace_na(closing, as.Date(Inf)),
         is_open = end_year >= opening & end_year <= closing) |> 
  group_by(town_final, years) |> 
  summarise(stores = sum(is_open))

这个想法本质上是检查每个商店是否在给定年份的最后一天开放,并存储为TRUEFALSE。然后按城市和年份分组,并将它们相加。这给出:

# A tibble: 12 × 3
# Groups:   town_final [2]
   town_final years stores
   <chr>      <int>  <int>
 1 A           1900      1
 2 A           1901      1
 3 A           1902      1
 4 A           1903      1
 5 A           1904      1
 6 A           1905      2
 7 B           1900      0
 8 B           1901      0
 9 B           1902      0
10 B           1903      0
11 B           1904      0
12 B           1905      0

这些值与您要求的输出不太匹配,但我不确定您是如何得到e的。g.根据您提供的输入数据,1903年A镇为3。

dwbf0jvd

dwbf0jvd3#

这里是一个“非魔法”的解决方案(因此相对较多的代码),它可以在将来补充选择一个年的范围,以及可能的错误处理。
其想法是创建零数据框并添加“真实的”值。

if (!require("pacman"))
  install.packages("pacman")

# install/load nedded library/ies
pacman::p_load(
  tidyr,
  dplyr
)


# create data
town <- c("A", "A", "A", "B")
opening <- as.Date(c("1900-01-01", "1905-02-05", "1906-01-01", "1910-01-01"))
closing <- as.Date(c(NA, NA, NA, "1913-03-03"))

df <- data.frame(town, opening, closing)

# remove NA without opening, and closing cols
df %>%
  drop_na(!c(opening, closing)) -> df


get_result <- function(df)
{
  # get year from date
  get_year <- function(date)
    as.numeric(format(date, "%Y"))
  
  
  # create range of date by years
  get_range_years <- function(df)
  {
    years <- c(df$opening, df$closing)
    
    # get left year of range
    tryCatch(
      # if not all is NA
      {
        l_year <- get_year(min(years, na.rm = T))
      },
      error = function(e)
      {
        message("Error! It is impossible!")
        message(e)
        message("Hence return NA")
        
        return (NA)
      }
    )
    
    # get right year of range
    tryCatch(
      # if not all is NA
      {
        r_year <- get_year(max(years, na.rm = T))
      },
      error = function(e)
      {
        message("Error! Most of all NA")
        message(e)
        message("Hence right year from range is left year")
        
        r_year <- l_year
      }
    )
    
    l_year:r_year
  }
  
  
  # create helpfull variable
  range_years <- get_range_years(df)
  range_town <- sort(unique(df$town))
  
  len_years <- length(range_years)
  len_town <- length(range_town)
  
  
  # res df
  right_join(
    
    # base df
    df %>%
      
      # create variable as "year" = year of opening
      mutate(year = get_year(opening)) %>%
      
      # rename as in you
      rename(town_final = town) %>%
      
      # create groups by "town_final" and "year"
      group_by(town_final, year) %>%
      
      # for each groups calculate length
      summarise(stores = n()),
    
    # add "zero" df
    data.frame(
      year = rep(range_years, each = len_town),
      town_final = rep(range_town, len_years),
      stores = rep(0, len_years * len_town)
    ),
    by = c("town_final", "year")
  ) %>%
    
    mutate(stores = pmax(stores.x, stores.y, na.rm = T)) %>%
    select(!starts_with("stores.")) %>%
    arrange(town_final, year) %>%
    ungroup()
}

df_final <- get_result(df)

输出

print(df_final, n = nrow(df_final))
# A tibble: 28 × 3
   town_final  year stores
   <chr>      <dbl>  <dbl>
 1 A           1900      1
 2 A           1901      0
 3 A           1902      0
 4 A           1903      0
 5 A           1904      0
 6 A           1905      1
 7 A           1906      1
 8 A           1907      0
 9 A           1908      0
10 A           1909      0
11 A           1910      0
12 A           1911      0
13 A           1912      0
14 A           1913      0
15 B           1900      0
16 B           1901      0
17 B           1902      0
18 B           1903      0
19 B           1904      0
20 B           1905      0
21 B           1906      0
22 B           1907      0
23 B           1908      0
24 B           1909      0
25 B           1910      1
26 B           1911      0
27 B           1912      0
28 B           1913      0

相关问题