R语言 基于DATE间隔和数据中的其他变量有条件地改变新列,

kmb7vmvb  于 2023-04-03  发布在  其他
关注(0)|答案(1)|浏览(111)

bounty将在5天后过期。回答此问题可获得+50的声誉奖励。hklovs正在寻找来自声誉良好来源的答案

我需要一些帮助来解决一个相当复杂的问题,我已经尽可能地说清楚了。

输入数据:

A<-(c("AA10", "AA20", "AA30"))
B<-(c("BB10","BB20","BB30"))
C<-(c("CC", "DD"))

library(data.table)

DF<-data.table::data.table(
              ID = c(1L,1L,1L,1L,1L,1L,2L,2L,2L,
                     2L,3L,3L,3L,3L,4L,4L,4L,5L,5L,5L),
            DIAG = c("AA10","AA100","AA20","BB10",
                     "BB1000","D05","AA10","AA105","BB10","BB101","AA10",
                     "BB2020","BB2020","BB20","BB10","AA20","AA200","AA10",
                     "AA205","AA305"),
             DAT = c("2001-01-01","2002-02-02",
                     "2003-03-03","2004-04-04","2010-10-10","2011-11-11",
                     "2002-02-02","2003-03-03","2004-04-04","2005-05-05","2001-01-01",
                     "2002-02-02","2003-03-03","2004-04-04","2002-02-02",
                     "2003-03-03","2009-09-09","2001-01-01","2002-02-02",
                     "2003-03-03"),
         OP_DIAG = c(NA,NA,NA,NA,NA,NA,NA,NA,NA,
                     NA,NA,NA,NA,"CC10",NA,NA,NA,NA,NA,NA)
    )

预期产出:

ID  DIAG DAT          OP_DIAG ABAB_MIN_DAT ABAB_MAX_DAT A_DIAG AB_DIAG B_DIAG
 1:  1   AA10 2001-01-01    <NA>   2001-01-01   2010-10-10   <NA>    <NA>    YES
 2:  1  AA100 2002-02-02    <NA>   2001-01-01   2010-10-10   <NA>    <NA>    YES
 3:  1   AA20 2003-03-03    <NA>   2001-01-01   2010-10-10   <NA>    <NA>    YES
 4:  1   BB10 2004-04-04    <NA>   2001-01-01   2010-10-10   <NA>    <NA>    YES
 5:  1 BB1000 2010-10-10    <NA>   2001-01-01   2010-10-10   <NA>    <NA>    YES
 6:  1    D05 2011-11-11    <NA>   2001-01-01   2010-10-10   <NA>    <NA>    YES
 7:  2   AA10 2002-02-02    <NA>   2002-02-02   2005-05-05   <NA>     YES   <NA>
 8:  2  AA105 2003-03-03    <NA>   2002-02-02   2005-05-05   <NA>     YES   <NA>
 9:  2   BB10 2004-04-04    <NA>   2002-02-02   2005-05-05   <NA>     YES   <NA>
10:  2  BB101 2005-05-05    <NA>   2002-02-02   2005-05-05   <NA>     YES   <NA>
11:  3   AA10 2001-01-01    <NA>   2001-01-01   2004-04-04    YES    <NA>   <NA>
12:  3 BB2020 2002-02-02    <NA>   2001-01-01   2004-04-04    YES    <NA>   <NA>
13:  3 BB2020 2003-03-03    <NA>   2001-01-01   2004-04-04    YES    <NA>   <NA>
14:  3   BB20 2004-04-04    CC10   2001-01-01   2004-04-04    YES    <NA>   <NA>
15:  4   BB10 2002-02-02    <NA>   2002-02-02   2009-09-09    YES    <NA>   <NA>
16:  4   AA20 2003-03-03    <NA>   2003-03-03   2009-09-09    YES    <NA>   <NA>
17:  4  AA200 2009-09-09    <NA>   2009-09-09   2009-09-09    YES    <NA>   <NA>
18:  5   AA10 2001-01-01    <NA>   2001-01-01   2003-03-03    YES    <NA>   <NA>
19:  5  AA205 2002-02-02    <NA>   2001-01-01   2003-03-03    YES    <NA>   <NA>
20:  5  AA305 2003-03-03    <NA>   2001-01-01   2003-03-03    YES    <NA>   <NA>

逻辑:

所有数据按**ID分组。
A_DIAGB_DIAGAB_DIAG
全部DIAG(在相同的ID中)被搜索以包含在AB中定义的值。如果DIAG(在同一个ID中)仅包含A值(而不是B)这应该生成A_DIAG=YES。请注意,例如AA205是一个命中,因为它
以**“AA20“开头。
因此,如果DIAG仅包含B值,则B_DIAG=YES
如果DIAG(在同一个ID中)同时包含AB值,则生成AB_DIAG=YES.
但是,DIAG同时包含AB值的情况除外。(从最高DAT开始计数)仅包含AB值,这应该算作A_DIAGB_DIAG,而不是AB_DIAG。这是我在ID=1中的示例数据的情况,其中2010-10-102004-04-04之间的时间跨度〉5年,其中两个日期都包含B值,然后是B_DIAG=YES
另一个例外是,当OP_DIAGCC开始时(如C中定义的),并且当DIAG中有AB值时(在相同的ID中),这应该总是生成A_DIAG=YES
ABAB_MIN_DAT, ABAB_MAX_DAT
这分别表示A_DIAGB_DIAG的最低和最高日期。
有没有人愿意帮忙?:)
最好是data.table解决方案,但dplyr也是受欢迎的,以及其他智能解决方案。
贝斯特,H

yqlxgs2m

yqlxgs2m1#

我认为这实现了所需的逻辑。请记住,根据示例,MIN/MAX日期的计算并不关心您提到的异常。

library(lubridate)

# redefine rules as regexp patterns
A <- paste0("^[", paste0(A, collapse = "|"), "]")
B <- paste0("^[", paste0(B, collapse = "|"), "]")
C <- paste0("^[", paste0(C, collapse = "|"), "]")

# base answers that can be adapted for each ID-based group
ans_A <- data.frame(A_DIAG = "YES", AB_DIAG = NA_character_, B_DIAG = NA_character_)
ans_B <- data.frame(A_DIAG = NA_character_, AB_DIAG = NA_character_, B_DIAG = "YES")
ans_AB <- data.frame(A_DIAG = NA_character_, AB_DIAG = "YES", B_DIAG = NA_character_)
ans_NA <- data.frame(A_DIAG = NA_character_, AB_DIAG = NA_character_, B_DIAG = NA_character_)

# helper functions - SD means subset of data

with_min_max <- function(ans, SD, indices) {
    ans$ABAB_MIN_DAT <- min(SD$DAT[indices])
    ans$ABAB_MAX_DAT <- max(SD$DAT[indices])
    ans
}

group_fun <- function(SD) {
    matches_A <- grepl(A, SD$DIAG)
    matches_B <- grepl(B, SD$DIAG)
    
    if (any(grepl(C, SD$OP_DIAG), na.rm = TRUE) && (any(matches_A) || any(matches_B)) ) {
        # OP_DIAG exception
        matches_either <- matches_A | matches_B
        return(with_min_max(ans_A, SD, matches_either))
    }
    
    matches_neither <- !(matches_A | matches_B)
    
    if (all(matches_neither)) {
        # min/max dates also set to NA if no matches for A or B are found
        return(cbind(ans_NA, as.data.frame(rep(list(as.Date(NA)), 2L))))
    }
    else if (all(matches_A | matches_neither)) {
        return(with_min_max(ans_A, SD, matches_A))
    }
    else if (all(matches_B | matches_neither)) {
        return(with_min_max(ans_B, SD, matches_B))
    }
    
    matches_either <- !matches_neither
    max_date <- max(SD$DAT[matches_either])
    years_to_max_date <- as.numeric(as.duration(max_date - SD$DAT), "years")
    is_recent_enough <- years_to_max_date <= 5
    
    if (all(is_recent_enough | matches_neither)) {
        return(with_min_max(ans_AB, SD, matches_either))
    }
    
    matches_neither_recent <- matches_neither[is_recent_enough]
    
    if (all(matches_A[is_recent_enough] | matches_neither_recent)) {
        return(with_min_max(ans_A, SD, matches_either))
    }
    else if (all(matches_B[is_recent_enough] | matches_neither_recent)) {
        return(with_min_max(ans_B, SD, matches_either))
    }
    else {
        return(with_min_max(ans_AB, SD, matches_either))
    }
}

# apply
DF[, DAT := ymd(DAT)][
    , c("A_DIAG", "AB_DIAG", "B_DIAG", "ABAB_MIN_DAT", "ABAB_MAX_DAT") := group_fun(.SD), by = "ID"]

我从你的描述中得知,既不匹配A规则也不匹配B规则的行(如示例中的第6行)基本上可以被过滤掉。对于“最近5年”的情况,当从组中获取最新的DAT时,这些不匹配行的日期也会被忽略。

相关问题