根据优先级列表选择值- R数据,表

pkwftd7m  于 2023-03-15  发布在  其他
关注(0)|答案(6)|浏览(100)

我有这样的数据集其中变量很多,但我只选择显示几个:

# Example data: one row per patient (ID) and one 0/1 indicator column per
# diagnosis type.
dt <- data.table(
  ID         = c(1, 2, 3, 4, 5),
  diagnosis1 = c(0, 0, 1, 0, 1),
  diagnosis2 = c(1, 0, 0, 1, 0),
  diagnosis3 = c(0, 1, 1, 0, 1),
  diagnosis4 = c(1, 0, 1, 0, 0)
)

共有5名患者和4种诊断类型。一名患者可以同时有诊断1和诊断3(例如患者5),但在我的最终数据集中,每位患者只允许有一种诊断。优先级列表如下:诊断1、诊断2、诊断3、诊断4。因此,在这种情况下,患者5应仅得到诊断1。
我有一个很大的数据集,其中的变量远不止上面显示的这5个。输出的结构应与原数据相同,只是未被选中的诊断中的1要替换为0。
希望你能帮忙!

h79rfbju

h79rfbju1#

创建一个列名向量,然后按 ID 分组,用 which() 找出第一个值为 1 的列:

更新:

明确指定dx优先级:

# Explicit priority order: a name listed earlier wins when a patient has
# several diagnoses flagged.
dx_priority <- c("diagnosis3", "diagnosis1", "diagnosis4", "diagnosis2")

# Per ID, which() returns the positions of the flagged columns in ascending
# order, so its first element is the highest-priority diagnosis. Taking
# `[1L]` instead of min() gives NA, without a warning, when no diagnosis is
# flagged for that ID.
# NOTE(review): assumes one row per ID, as in the example data.
dt[, f_dx := dx_priority[which(.SD == 1)[1L]], by = ID, .SDcols = dx_priority]

上一个答案

让diagnosis列的顺序决定优先级

# Let the column order of the diagnosis* columns define the priority.
dx_cols <- grep("^diagnosis", names(dt), value = TRUE)

# Per ID, pick the first flagged column. which() is already sorted, so
# `[1L]` is the smallest position; unlike min(), it returns NA without a
# warning when no diagnosis is flagged for that ID.
# NOTE(review): assumes one row per ID, as in the example data.
dt[, f_dx := dx_cols[which(.SD == 1)[1L]], by = ID, .SDcols = dx_cols]

输出:

ID diagnosis1 diagnosis2 diagnosis3 diagnosis4       f_dx
1:  1          0          1          0          1 diagnosis2
2:  2          0          0          1          0 diagnosis3
3:  3          1          0          1          1 diagnosis1
4:  4          0          1          0          0 diagnosis2
5:  5          1          0          1          0 diagnosis1
dkqlctbz

dkqlctbz2#

# Reshape to long format (one row per ID/diagnosis pair), keep only the
# flagged rows, and take the first remaining diagnosis for each ID.
melt(dt, id.vars = "ID", variable.name = "Diagnosis")[
  value == 1,
  .(Diagnosis = Diagnosis[1L]),
  keyby = ID
]

   ID  Diagnosis
1:  1 diagnosis2
2:  2 diagnosis3
3:  3 diagnosis1
4:  4 diagnosis2
5:  5 diagnosis1
7cwmlq89

7cwmlq893#

如果需要输出变量名,另一个有用的函数是 max.col,并将参数设为 ties.method = "first":

# Column order of the diagnosis* columns defines the priority.
cols <- grep("^diagnosis", names(dt), value = TRUE)

dt[, indx := {
  # With ties.method = "first", max.col() returns the left-most column
  # holding the row maximum, i.e. the first flagged diagnosis.
  first_hit <- max.col(.SD, ties.method = "first")
  # An all-zero row is a tie across every column and would be reported as
  # column 1; mark such rows as NA instead of assigning a diagnosis.
  first_hit[rowSums(.SD == 1, na.rm = TRUE) == 0] <- NA
  cols[first_hit]
}, .SDcols = cols]

   ID diagnosis1 diagnosis2 diagnosis3 diagnosis4       indx
1:  1          0          1          0          1 diagnosis2
2:  2          0          0          1          0 diagnosis3
3:  3          1          0          1          1 diagnosis1
4:  4          0          1          0          0 diagnosis2
5:  5          1          0          1          0 diagnosis1

如果需要选定的优先级顺序,可以先使用setcolorder重新排列数据:

# Custom priority order: earlier names win.
prio <- c("diagnosis2", "diagnosis1", "diagnosis4", "diagnosis3")

# Reorder the columns by reference so that column position matches priority.
setcolorder(dt, c("ID", prio))

# `.SDcols` must reference `prio`; the original snippet used the undefined
# name `prior`.
dt[, indx := {
  first_hit <- max.col(.SD, ties.method = "first")
  # Rows with no flagged diagnosis would otherwise be reported as the first
  # priority column; mark them as NA.
  first_hit[rowSums(.SD == 1, na.rm = TRUE) == 0] <- NA
  prio[first_hit]
}, .SDcols = prio]

   ID diagnosis2 diagnosis1 diagnosis4 diagnosis3       indx
1:  1          1          0          0          0 diagnosis2
2:  2          0          0          0          1 diagnosis3
3:  3          0          1          0          0 diagnosis1
4:  4          1          0          0          0 diagnosis2
5:  5          0          1          0          0 diagnosis1

在 base R 中,可以使用 apply:

# Keep only the first 1 in an indicator vector.
#
# x: vector of 0/1 indicators, ordered from highest to lowest priority.
# Returns x with every element except the first one equal to 1 set to 0.
# When x contains no 1 it is returned unchanged, without the warning that
# min(which()) raises on an empty set.
f <- function(x) {
  first_hit <- match(1, x)
  if (!is.na(first_hit)) {
    x[-first_hit] <- 0
  }
  x
}

# Convert to a plain data.frame, then apply f() to each row of every column
# except the first (ID). apply() returns one column per input row, hence the
# transpose before assigning back.
dt <- as.data.frame(dt)
dt[-1] <- t(apply(X = dt[-1], MARGIN = 1, FUN = f))
dt

  ID diagnosis1 diagnosis2 diagnosis3 diagnosis4
1  1          0          1          0          0
2  2          0          0          1          0
3  3          1          0          0          0
4  4          0          1          0          0
5  5          1          0          0          0
j13ufse2

j13ufse24#

使用this answer中的优先级向量在行上进行矢量化。

# Priority order: earlier names win.
dx_priority <- c("diagnosis1", "diagnosis2", "diagnosis3", "diagnosis4")

# unclaimed[i] is TRUE while row i has not yet been given a diagnosis.
unclaimed <- rep(TRUE, nrow(dt))

# Walk the columns in priority order, vectorised over rows. An explicit loop
# makes the running state visible instead of mutating a global with `<<-`
# from inside lapply().
for (dx in dx_priority) {
  # Keep a 1 only on rows that are still unclaimed.
  kept <- dt[[dx]] * unclaimed
  set(dt, j = dx, value = kept)
  # Rows that just received a diagnosis are claimed from here on.
  unclaimed <- unclaimed & !kept
}
dt[]
#>    ID diagnosis1 diagnosis2 diagnosis3 diagnosis4
#> 1:  1          0          1          0          0
#> 2:  2          0          0          1          0
#> 3:  3          1          0          0          0
#> 4:  4          0          1          0          0
#> 5:  5          1          0          0          0
rqdpfwrv

rqdpfwrv5#

使用dplyr

library(dplyr)
# purrr is no longer used by this snippet (invoke() is deprecated); the
# library() call is kept so any surrounding code that relies on it still runs.
library(purrr)

# For every diagnosis* column, replace 1 by the column's own name and any
# other value by NA; coalesce() then picks, per row, the first non-NA name
# in column order. do.call() passes the columns produced by across() as
# separate arguments to coalesce().
dt %>%
  transmute(
    ID,
    Diagnosis = do.call(
      coalesce,
      across(starts_with("diagnosis"), ~ case_match(.x, 1 ~ cur_column()))
    )
  )
  • 输出
ID  Diagnosis
1:  1 diagnosis2
2:  2 diagnosis3
3:  3 diagnosis1
4:  4 diagnosis2
5:  5 diagnosis1
chhqkbe1

chhqkbe16#

使用set()

# Define column priority (earlier names win).
pr <- paste0("diagnosis", 1:4)

# claimed[i] is 1 once row i has a flagged diagnosis in a higher-priority
# column.
claimed <- dt[[pr[1]]]

# Iterate over pr[-1] rather than 2:length(pr): with a single priority
# column the latter would count down through c(2, 1) instead of being empty.
for (dx in pr[-1]) {
  # Clear this column, by reference, on rows that are already claimed.
  set(dt, i = which(claimed == 1), j = dx, value = 0)
  # Rows still flagged in this column become claimed for the next columns.
  claimed <- pmax(claimed, dt[[dx]])
}

#       ID diagnosis1 diagnosis2 diagnosis3 diagnosis4
#    <num>      <num>      <num>      <num>      <num>
# 1:     1          0          1          0          0
# 2:     2          0          0          1          0
# 3:     3          1          0          0          0
# 4:     4          0          1          0          0
# 5:     5          1          0          0          0

相关问题