R语言 标识小于给定日期的日期并对数据应用条件

9wbgstp7  于 2022-12-06  发布在  其他
关注(0)|答案(1)|浏览(100)

我有四个数据表,其中三个有三个不同程序的日期,一个表有感染的日期和诊断感染的体征。首先,我想确定哪个手术与感染迹象相关,然后应用预定标准来确定是否是实际感染。感染发作时间是感染日期和手术日期之间的差异(感染前最后一次手术)。为了确定哪种手术与感染相关,我们考虑感染发作前最后一次手术日期与感染日期之间的时间差小于1年。感染的标准是手术后感染发作时间小于1年,发热=是,出院=是,且培养=阳性。
我确实使用了merge函数先合并各种表,然后使用as.duration来识别手术日期和感染日期之间的差异,但这看起来很奇怪。
有人能帮我吗?我的数据示例如下。

dt1 = data.table(
  participant.id = c("1","2","3", "3"),
  date.procedure1 = c("2000-11-19", "2003-08-29", "2000-01-08", "2002-03-08"),
repeat.instance.procedure1 = c("1", "1", "1", "2")
)
dt2 = data.table (participant.id = c("1","2","3"),
                  date.procedure2 = c("2000-10-19", "2003-07-02", "1999-12-12"),
                  repeat.instance.procedure2 = c("1", "1", "1")
)
dt3 = data.table (participant.id = c("1","1", "2","2" ,"3"),
                  date.procedure3 = c("2002-10-19","2004-10-10", "2006-10-02", "2010-10-10",       "2009-01-12"),
                  repeat.instance.procedure3 = c("1", "2", "1", "2", "1")
)
dt4 = data.table (
  participant.id = c("1", "2", "3"),
  date.infection = c("2001-05-10", "2007-02-10", "2002-03-25"),
  repeat.instance.infection = c("1", "1", "1"),
  fever = c("yes", "no", "yes"),
  discharge = c("yes", "no", "yes"),
  culture = c("positive", "positive","negative"),
  pain = c("yes", "yes", "yes")
)
wbgh16ku

wbgh16ku1#

如果我按照您的要求进行操作,我认为您希望将前三个表堆叠成一个表,并为该过程创建一个字段“type(1、2或3)。然后由于每个患者仅感染一次,可以使用左连接合并感染日期。然后,您可以为每一行构建一个感染日期和治疗日期之间差异的变量,并确定它是否为阳性和我对sql_df和dplyr包都这样做了,以检查结果。

rm(list = ls())
library(data.table)
library(dplyr)
library(sqldf)

dt1 <- data.table(
  participant.id = c("1", "2", "3", "3"),
  date.procedure1 = c("2000-11-19", "2003-08-29", "2000-01-08", "2002-03-08"),
  repeat.instance.procedure1 = c("1", "1", "1", "2")
)
dt2 <- data.table(
  participant.id = c("1", "2", "3"),
  date.procedure2 = c("2000-10-19", "2003-07-02", "1999-12-12"),
  repeat.instance.procedure2 = c("1", "1", "1")
)
dt3 <- data.table(
  participant.id = c("1", "1", "2", "2", "3"),
  date.procedure3 = c("2002-10-19", "2004-10-10", "2006-10-02", "2010-10-10", "2009-01-12"),
  repeat.instance.procedure3 = c("1", "2", "1", "2", "1")
)
dt4 <- data.table(
  participant.id = c("1", "2", "3"),
  date.infection = c("2001-05-10", "2007-02-10", "2002-03-25"),
  repeat.instance.infection = c("1", "1", "1"),
  fever = c("yes", "no", "yes"),
  discharge = c("yes", "no", "yes"),
  culture = c("positive", "positive", "negative"),
  pain = c("yes", "yes", "yes")
)
tables123 <- list(dt1, dt2, dt3)
for (i in 1:length(tables123)) {
  toadd <- data.frame(tables123[[i]])
  toadd$date <- as.Date(toadd[, 2])
  toadd$type <- i
  toadd <- toadd[, c(1, 3:5)]
  names(toadd) <- c("id", "instance", "date", "type")
  if (i == 1) {
    dsn <- toadd
  } else {
    dsn <- rbind(dsn, toadd)
  }
}

dsn

返回:

> dsn
   id instance       date type
1   1        1 2000-11-19    1
2   2        1 2003-08-29    1
3   3        1 2000-01-08    1
4   3        2 2002-03-08    1
5   1        1 2000-10-19    2
6   2        1 2003-07-02    2
7   3        1 1999-12-12    2
8   1        1 2002-10-19    3
9   1        2 2004-10-10    3
10  2        1 2006-10-02    3
11  2        2 2010-10-10    3
12  3        1 2009-01-12    3

然后合并感染日期并创建变量差异和关联

toJoin <- dt4[, 1:2]
toJoin$infect_dt <- as.Date(toJoin$date.infection)
toJoin[, date.infection := NULL]
names(toJoin) <- c("id", "infect_dt")

dsn2 <- left_join(dsn, toJoin, by = NULL)
setorder(dsn2, id, date)
dsn2 <- mutate(dsn2, diff = as.numeric(infect_dt - date))
dsn2 <- mutate(dsn2, associate = (diff > 0 & diff < 635))
dsn2

# or

dsn2_sql <- sqldf("select a.*, b.infect_dt from dsn as a left join toJoin as b 
      where a.id=b.id order by id, date ")

dsn2_sql$diff <- as.numeric(dsn2_sql$infect_dt - dsn2_sql$date)
dsn2_sql$associate <- (dsn2_sql$diff < 365 & dsn2_sql$diff > 0)
dsn2_sql

dsn2_sql

返回:

> dsn2_sql
   id instance       date type  infect_dt  diff associate
1   1        1 2000-10-19    2 2001-05-10   203      TRUE
2   1        1 2000-11-19    1 2001-05-10   172      TRUE
3   1        1 2002-10-19    3 2001-05-10  -527     FALSE
4   1        2 2004-10-10    3 2001-05-10 -1249     FALSE
5   2        1 2003-07-02    2 2007-02-10  1319     FALSE
6   2        1 2003-08-29    1 2007-02-10  1261     FALSE
7   2        1 2006-10-02    3 2007-02-10   131      TRUE
8   2        2 2010-10-10    3 2007-02-10 -1338     FALSE
9   3        1 1999-12-12    2 2002-03-25   834     FALSE
10  3        1 2000-01-08    1 2002-03-25   807     FALSE
11  3        2 2002-03-08    1 2002-03-25    17      TRUE
12  3        1 2009-01-12    3 2002-03-25 -2485     FALSE

然后过滤掉错误的关联

dsn2_sql <- sqldf("select a.*, b.infect_dt from dsn as a left join toJoin as b 
      where a.id=b.id order by id, date ")

dsn2_sql$diff <- as.numeric(dsn2_sql$infect_dt - dsn2_sql$date)
dsn2_sql$associate <- (dsn2_sql$diff < 365 & dsn2_sql$diff > 0)
dsn2_sql

dsn2_sql

dsn3 <- filter(dsn2, associate == T)
dsn3
dsn3_sql <- dsn2_sql[dsn2$associate == T, ]
dsn3_sql

其产生

id instance       date type  infect_dt diff associate
1   1        1 2000-10-19    2 2001-05-10  203      TRUE
2   1        1 2000-11-19    1 2001-05-10  172      TRUE
7   2        1 2006-10-02    3 2007-02-10  131      TRUE
11  3        2 2002-03-08    1 2002-03-25   17      TRUE

你可以用

dsn3 == dsn3_sql

患者1接受了两次治疗,间隔一个月,然后在大约6个月后感染。您可以通过以下方式选择每个患者的最后一行:

dsn3$keep = c(dsn3$id[1:(nrow(dsn3)-1)] != dsn3$id[2:(nrow(dsn3))],1)

然后筛选其中的keep == 1。希望这回答了您的问题。

相关问题