如何检查R中包含混合字母-数字索引的列中的重叠

pgpifvop  于 2023-10-13  发布在  其他
关注(0)|答案(3)|浏览(92)

我有一个数据框(data frame),其中一列包含由一个字母后跟一个3位数字组成的代码字符串。我想检查不同行之间是否存在重叠的代码。
下面是一个例子(注意,第1行和第2行之间有一个重叠:B258 落在 B200-B299 范围内):

# Example data: every row holds a comma-separated list of ICD codes and code
# ranges, together with a category id.
df <- data.frame(
  ICD_code = c(
    "A581-A589, B200-B299, B354-B355",
    "B258, I230-I233, J201-J300, K353",
    "C001-C005, C020-C023, C781"
  ),
  category = c(1, 2, 3)
)

df

                              ICD_code category
    1  A581-A589, B200-B299, B354-B355        1
    2 B258, I230-I233, J201-J300, K353        2
    3       C001-C005, C020-C023, C781        3

为此,我尝试创建一个函数,以1为增量把“A581-A589”展开成一个序列,但这不起作用,因为seq()只对数值有效。我想知道是否有人知道如何用字母数字值生成序列?
我想要的结果:

ICD_code category overlap
1  A581-A589, B200-B299, B354-B355        1    T
2 B258, I230-I233, J201-J300, K353        2    T
3       C001-C005, C020-C023, C781        3    F
ahy6op9u

ahy6op9u1#

library(tidyverse)

df <- data.frame(
  ICD_code = c(
    "A581-A589, B200-B299, B354-B355",
    "B258, I230-I233, J201-J300, K353",
    "C001-C005, C020-C023, C781"
  ),
  category = c(1, 2, 3)
)

# Long format: one row per code or code range. `original_row` records which
# row of `df` every entry came from so overlaps can be reported per row.
(df2 <- df |>
  rowid_to_column(var = "original_row") |>
  separate_longer_delim(cols = ICD_code, delim = ",") |>
  # "A581-A589" -> start / end columns; a single code gets an NA end.
  separate_wider_delim(
    cols = ICD_code, delim = "-",
    names_sep = "x",
    too_few = "align_start"
  ) |>
  mutate(
    # Select the split columns by name rather than by position.
    across(c(ICD_codex1, ICD_codex2), trimws),
    across(c(ICD_codex1, ICD_codex2), parse_number, .names = "num_{col}"),
    first_letter = substr(ICD_codex1, 1, 1)
  ) |>
  rowwise() |>
  # Numeric part of every code covered by the entry; for a single code the
  # NA end is ignored by pmax(), so the sequence has length one.
  mutate(numseq = list(seq(
    from = num_ICD_codex1,
    to = pmax(num_ICD_codex1, num_ICD_codex2, na.rm = TRUE),
    by = 1
  ))) |>
  ungroup())

# Entries grouped by their leading letter.
(df3 <- split(df2, ~first_letter))

# Per letter: does any numeric code occur more than once?
(df4 <- map_lgl(
  df3,
  \(x) anyDuplicated(unlist(x$numseq)) > 0
))

# Per original row: does the row share at least one code with another row?
# distinct() collapses codes repeated inside one row, so only codes present in
# two or more different rows count as an overlap.
row_overlap <- df2 |>
  unnest(numseq) |>
  distinct(original_row, first_letter, numseq) |>
  mutate(n_rows = n(), .by = c(first_letter, numseq)) |>
  summarise(overlap = any(n_rows > 1), .by = original_row)

(df5 <- df |>
  rowid_to_column(var = "original_row") |>
  left_join(row_overlap, by = join_by(original_row)) |>
  select(-original_row))
8fsztsew

8fsztsew2#

data.table版本:

library(data.table)

DF <- data.frame(
  ICD_code = c(
    "A581-A589, B200-B299, B354-B355",
    "B258, I230-I233, J201-J300, K353",
    "C001-C005, C020-C023, C781"
  ),
  category = c(1, 2, 3)
)

setDT(DF)

# Long format: one row per comma-separated entry.
# NOTE(review): `category` is used as the row key throughout - this assumes it
# is unique per row of DF.
DT <- DF[, .(ICD_code = unlist(strsplit(ICD_code, ", "))), by = category]
DT[, prefix := substring(ICD_code, 1, 1)]

# Numeric part of every code covered by the entry: a full integer sequence for
# a range such as "B200-B299", a single integer for a plain code.
DT[, suffix := lapply(strsplit(ICD_code, "-"), function(x) {
  bounds <- as.integer(substring(x, 2))
  if (length(bounds) > 1) seq(bounds[1], bounds[2]) else bounds
})]

# One row per individual code.
DT <- DT[,
  .(code = paste0(prefix, unlist(suffix))),
  by = .(category, ICD_code, prefix)
]

# A code overlaps only when it occurs in more than one category; a code that
# is merely repeated inside a single category is not an overlap.
DT[, overlap := uniqueN(category) > 1L, by = code]

# Collapse to exactly one flag per category so the merge below cannot
# duplicate rows of DF when a category has several overlapping codes.
flagged <- DT[, .(overlap = any(overlap)), by = category]

DF <- merge(DF, flagged, by = "category", all.x = TRUE)
DF[is.na(overlap), overlap := FALSE]
DF[]

测试结果:

category                         ICD_code overlap
1:        1  A581-A589, B200-B299, B354-B355    TRUE
2:        2 B258, I230-I233, J201-J300, K353    TRUE
3:        3       C001-C005, C020-C023, C781   FALSE
cbeh67ev

cbeh67ev3#

下面是我的代码:

# Sample input: comma-separated ICD codes / code ranges per row, plus a
# category id.
df <- data.frame(
  ICD_code = c(
    "A581-A589, B200-B299, B354-B355",
    "B258, I230-I233, J201-J300, K353",
    "C001-C005, C020-C023, C781"
  ),
  category = c(1, 2, 3)
)

df

首先,找出需要展开的各个范围

library(tidyverse)

# One row per comma-separated entry. For entries that are ranges
# (`expand_check` is TRUE) the leading letter and the numeric bounds are
# extracted; plain codes receive the placeholders "A" / 0 / 0.
exp_df <- df |>
  rowid_to_column(var = "original_row") |>
  separate_longer_delim(cols = ICD_code, delim = ",") |>
  mutate(
    ICD_code = str_trim(ICD_code),
    expand_check = str_detect(ICD_code, "-")
  ) |>
  mutate(
    # Letter of the range.
    ICD_code_exp_code = ifelse(
      expand_check,
      str_extract(ICD_code, "[A-Z]"),
      "A"
    ),
    # Digits immediately before the dash.
    ICD_code_exp_min = as.integer(ifelse(
      expand_check,
      str_extract(ICD_code, "\\d*(?=-)"),
      0
    )),
    # Digits following the dash and the letter after it.
    ICD_code_exp_max = as.integer(ifelse(
      expand_check,
      str_extract(ICD_code, "(?<=-[A-Z])\\d*"),
      0
    ))
  )

这将扩展范围并将其存储在列中

# Expand every range into a comma-separated list of its individual codes.
# The numeric part is zero-padded back to the digit width used in the source
# entry, so "C001-C005" becomes "C001,C002,...,C005" (not "C1,...,C5") and can
# match a literal code such as "C003" found in another row.
exp_df$ICD_code_exp <- vapply(
  seq_len(nrow(exp_df)),
  FUN = function(i) {
    # Width of the digit run after the leading letter, e.g. 3 for "C001-C005".
    digit_width <- nchar(sub("^[A-Z]([0-9]+).*$", "\\1", exp_df$ICD_code[i]))
    numbers <- seq(exp_df$ICD_code_exp_min[i], exp_df$ICD_code_exp_max[i], 1)
    paste0(
      exp_df$ICD_code_exp_code[i],
      formatC(numbers, width = digit_width, flag = "0", format = "d"),
      collapse = ","
    )
  },
  FUN.VALUE = character(1)
)

# Entries that are not ranges keep their original code unchanged.
exp_df$ICD_code_exp[!exp_df$expand_check] <-
  exp_df$ICD_code[!exp_df$expand_check]

通过统计每个展开后的 ICD_code 出现在多少个不同的原始行中来检测重叠。

# One row per individual code, then count in how many distinct original rows
# each code occurs; an original row overlaps as soon as one of its codes is
# shared with another row.
overlaps <- exp_df |>
  separate_longer_delim(ICD_code_exp, delim = ",") |>
  mutate(
    n_dist_rows = n_distinct(original_row),
    .by = ICD_code_exp
  ) |>
  summarise(
    overlap = any(n_dist_rows > 1),
    .by = original_row
  )

输出将是

# Attach the per-row overlap flag to the original data. The row id exists only
# to join on and is dropped again afterwards.
df |>
  rowid_to_column(var = "original_row") |>
  left_join(overlaps, by = join_by(original_row)) |>
  select(-original_row)

      #                           ICD_code category overlap
      # 1  A581-A589, B200-B299, B354-B355        1    TRUE
      # 2 B258, I230-I233, J201-J300, K353        2    TRUE
      # 3       C001-C005, C020-C023, C781        3   FALSE

相关问题