对按grep分组的行按字母顺序排序

lmvvr0a8  于 2023-01-06  发布在  其他
关注(0)|答案(6)|浏览(158)

我有一个 Dataframe ,其中有一行不良事件,还有这些不良事件与手术的关系,如下所示:

df <- data.frame(
  adverse_event = c(
    "Haemorrhage", "related", "likely related",
    "Other", "related", "likely related", "Pain", "related", "likely related",
    "Subcapsular hematoma", "related", "likely related", "Ascites",
    "related", "likely related", "Hyperbilirubinemia", "related",
    "likely related", "Liver abscess", "related", "likely related",
    "Pleural effusion with drainage", "related", "likely related",
    "Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
    "related", "likely related", "Portal vein thrombosis", "related",
    "likely related", "Sepsis", "related", "likely related"
  ),
  grade_1 = c(
    4L, 4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA
  ),
  grade_2 = c(
    2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L, 2L, 1L, 2L, 0L, 1L, 1L, 0L,
    1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L, 1L, 1L, 1L, 0L, NA, NA, NA, NA,
    NA, NA, NA, NA, NA
  ),
  grade_3 = c(
    1L, 4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L,
    1L, 0L, 1L, 1L, 0L
  ),
  grade_4 = c(
    2L, 4L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA
  )
)

现在,我想按字母顺序对不良事件进行排序,但当然要将“相关”、“可能相关”行与单个不良事件行放在一起,所以我想先对它们进行分组。在本例中,通常为3行,但假设有时为2行,也有4或5行(字符串/名称中包含“相关”的不良事件行除外,但例如“不太可能相关”)。
我知道,我可以通过grep('related', df$adverse_event, invert = T)获得不良事件行的索引,但我不确定如何在排序之前使用它将行分组在一起。
编辑:所需输出的左列开始:

expected_output_left_column <- data.frame(adverse_event = c( 
"Ascites", "related", "likely related", 
"Biliary leakage / occlusion / fistula", "related", "likely related" ) )

谢谢大家!

jaxagkaj

jaxagkaj1#

使用dplyr中的基r和lead函数的另一种解决方案

# where start each group
id <- grep('related', df$adverse_event, invert = T)

# size of each group
size <- lead(id) - id
size_of_last_group <- nrow(df) - id[length(id)] + 1 
size[length(size)] <- size_of_last_group

# add col with id
df$id <- paste0(rep(df$adverse_event[id], times = size),
                df$adverse_event)

# order
df <- df[order(df$id), ]

# remove id
df$id <- NULL
of1yzvn4

of1yzvn42#

您可以执行以下操作:

library(dplyr)

left_join(
  df, 
  df %>% 
    filter(!grepl('related',adverse_event)) %>% 
    select(adverse_event) %>% 
    arrange(adverse_event) %>% 
    mutate(o = row_number())
  ) %>% 
  mutate(o = data.table::nafill(o, "locf")) %>% 
  arrange(o) %>% 
  select(-o)

输出:

adverse_event grade_1 grade_2 grade_3 grade_4
1                                Ascites      NA       1      NA      NA
2                                related      NA       1      NA      NA
3                         likely related      NA       0      NA      NA
4  Biliary leakage / occlusion / fistula      NA      NA       1      NA
5                                related      NA      NA       2      NA
6                         likely related      NA      NA       0      NA
7                            Haemorrhage       4       2       1       2
8                                related       4       3       4       4
9                         likely related       0       0       1       1
10                    Hyperbilirubinemia      NA       1      NA      NA
11                               related      NA       0      NA      NA
12                        likely related      NA       2      NA      NA
13                         Liver abscess      NA       1       4      NA
14                               related      NA       1       5      NA
15                        likely related      NA       0       1      NA
16                                 Other       3      11       5      NA
17                               related       6       3       3      NA
18                        likely related       1       7       2      NA
19                                  Pain       8       2       2      NA
20                               related       4       4       5      NA
21                        likely related       5       2       1      NA
22        Pleural effusion with drainage      NA       1      NA      NA
23                               related      NA       2      NA      NA
24                        likely related      NA       1      NA      NA
25                          Pneumothorax      NA       1       1      NA
26                               related      NA       1       1      NA
27                        likely related      NA       0       0      NA
28                Portal vein thrombosis      NA      NA       1      NA
29                               related      NA      NA       1      NA
30                        likely related      NA      NA       0      NA
31                                Sepsis      NA      NA       1      NA
32                               related      NA      NA       1      NA
33                        likely related      NA      NA       0      NA
34                  Subcapsular hematoma       3       1      NA      NA
35                               related       1       2      NA      NA
36                        likely related       3       0      NA      NA

请注意,这里使用的是data.table::nafill()。完整的data.table解决方案如下所示:

library(data.table)
setDT(df)

data.table(adverse_event =  sort(df[!grepl('related',adverse_event), adverse_event]))[, o:=.I][
  df, on="adverse_event"][, o:=nafill(o, "locf")][order(o), !c("o")]
mwngjboj

mwngjboj3#

添加“group”变量并排序

tmp=!grepl("related",df$adverse_event)
df$grp=cumsum(tmp)
df[order(match(df$grp,order(df$adverse_event[tmp]))),]

                           adverse_event grade_1 grade_2 grade_3 grade_4 grp
13                               Ascites      NA       1      NA      NA   5
14                               related      NA       1      NA      NA   5
15                        likely related      NA       0      NA      NA   5
28 Biliary leakage / occlusion / fistula      NA      NA       1      NA  10
29                               related      NA      NA       2      NA  10
30                        likely related      NA      NA       0      NA  10
1                            Haemorrhage       4       2       1       2   1
2                                related       4       3       4       4   1
3                         likely related       0       0       1       1   1
16                    Hyperbilirubinemia      NA       1      NA      NA   6
17                               related      NA       0      NA      NA   6
18                        likely related      NA       2      NA      NA   6
19                         Liver abscess      NA       1       4      NA   7
20                               related      NA       1       5      NA   7
21                        likely related      NA       0       1      NA   7
4                                  Other       3      11       5      NA   2
5                                related       6       3       3      NA   2
6                         likely related       1       7       2      NA   2
7                                   Pain       8       2       2      NA   3
8                                related       4       4       5      NA   3
9                         likely related       5       2       1      NA   3
22        Pleural effusion with drainage      NA       1      NA      NA   8
23                               related      NA       2      NA      NA   8
24                        likely related      NA       1      NA      NA   8
25                          Pneumothorax      NA       1       1      NA   9
26                               related      NA       1       1      NA   9
27                        likely related      NA       0       0      NA   9
31                Portal vein thrombosis      NA      NA       1      NA  11
32                               related      NA      NA       1      NA  11
33                        likely related      NA      NA       0      NA  11
34                                Sepsis      NA      NA       1      NA  12
35                               related      NA      NA       1      NA  12
36                        likely related      NA      NA       0      NA  12
10                  Subcapsular hematoma       3       1      NA      NA   4
11                               related       1       2      NA      NA   4
12                        likely related       3       0      NA      NA   4
fhity93d

fhity93d4#

再加上另一个tidyverse解决方案:

library(tidyr)
library(dplyr)

df %>% 
  mutate(grp = if_else(grepl("related", adverse_event), 
                       NA_character_,
                       adverse_event)) %>% 
  fill(grp) %>% 
  nest(data = -grp) %>% 
  arrange(grp) %>% 
  unnest(cols = data) %>% 
  select(-grp)

# # A tibble: 36 × 5
#    adverse_event                         grade_1 grade_2 grade_3 grade_4
#    <chr>                                   <int>   <int>   <int>   <int>
#  1 Ascites                                    NA       1      NA      NA
#  2 related                                    NA       1      NA      NA
#  3 likely related                             NA       0      NA      NA
#  4 Biliary leakage / occlusion / fistula      NA      NA       1      NA
#  5 related                                    NA      NA       2      NA
#  6 likely related                             NA      NA       0      NA
#  7 Haemorrhage                                 4       2       1       2
#  8 related                                     4       3       4       4
#  9 likely related                              0       0       1       1
# 10 Hyperbilirubinemia                         NA       1      NA      NA
# 11 related                                    NA       0      NA      NA
# 12 likely related                             NA       2      NA      NA
# 13 Liver abscess                              NA       1       4      NA
# 14 related                                    NA       1       5      NA
# 15 likely related                             NA       0       1      NA
# 16 Other                                       3      11       5      NA
# 17 related                                     6       3       3      NA
# 18 likely related                              1       7       2      NA
# 19 Pain                                        8       2       2      NA
# 20 related                                     4       4       5      NA
# 21 likely related                              5       2       1      NA
# 22 Pleural effusion with drainage             NA       1      NA      NA
# 23 related                                    NA       2      NA      NA
# 24 likely related                             NA       1      NA      NA
# 25 Pneumothorax                               NA       1       1      NA
# 26 related                                    NA       1       1      NA
# 27 likely related                             NA       0       0      NA
# 28 Portal vein thrombosis                     NA      NA       1      NA
# 29 related                                    NA      NA       1      NA
# 30 likely related                             NA      NA       0      NA
# 31 Sepsis                                     NA      NA       1      NA
# 32 related                                    NA      NA       1      NA
# 33 likely related                             NA      NA       0      NA
# 34 Subcapsular hematoma                        3       1      NA      NA
# 35 related                                     1       2      NA      NA
# 36 likely related                              3       0      NA      NA

解释

  1. mutate + fill:用词干标记每个adverse_event,即用上述相应事件重新标记所有related记录。
    1.嵌套所有列,但保留新创建的grp列,该列具有股骨柄不良事件的名称。
    1.对不良事件股骨柄进行分类。
    1.再次取消嵌套行。
    1.删除grp列。
btxsgosb

btxsgosb5#

使用rank的方法。使用具有4个“腹水”条目的扩展数据集。

library(dplyr)

df %>% 
  mutate(ord = !grepl("related", adverse_event), 
         grp = cumsum(ord), 
         Rank = rank(adverse_event[ord])[grp]) %>%   
  arrange(Rank) %>% 
  select(-c(ord, grp, Rank))
                           adverse_event grade_1 grade_2 grade_3 grade_4
1                                Ascites      NA       1      NA      NA
2                                related      NA       1      NA      NA
3                                related      NA       1      NA      NA
4                         likely related      NA       0      NA      NA
5  Biliary leakage / occlusion / fistula      NA      NA       1      NA
6                                related      NA      NA       2      NA
7                         likely related      NA      NA       0      NA
8                            Haemorrhage       4       2       1       2
9                                related       4       3       4       4
10                        likely related       0       0       1       1
11                    Hyperbilirubinemia      NA       1      NA      NA
12                               related      NA       0      NA      NA
13                        likely related      NA       2      NA      NA
14                         Liver abscess      NA       1       4      NA
15                               related      NA       1       5      NA
16                        likely related      NA       0       1      NA
17                                 Other       3      11       5      NA
18                               related       6       3       3      NA
19                        likely related       1       7       2      NA
20                                  Pain       8       2       2      NA
21                               related       4       4       5      NA
22                        likely related       5       2       1      NA
23        Pleural effusion with drainage      NA       1      NA      NA
24                               related      NA       2      NA      NA
25                        likely related      NA       1      NA      NA
26                          Pneumothorax      NA       1       1      NA
27                               related      NA       1       1      NA
28                        likely related      NA       0       0      NA
29                Portal vein thrombosis      NA      NA       1      NA
30                               related      NA      NA       1      NA
31                        likely related      NA      NA       0      NA
32                                Sepsis      NA      NA       1      NA
33                               related      NA      NA       1      NA
34                        likely related      NA      NA       0      NA
35                  Subcapsular hematoma       3       1      NA      NA
36                               related       1       2      NA      NA
37                        likely related       3       0      NA      NA
扩展数据
df <- structure(list(adverse_event = c("Haemorrhage", "related", "likely related", 
"Other", "related", "likely related", "Pain", "related", "likely related", 
"Subcapsular hematoma", "related", "likely related", "Ascites", 
"related", "related", "likely related", "Hyperbilirubinemia", 
"related", "likely related", "Liver abscess", "related", "likely related", 
"Pleural effusion with drainage", "related", "likely related", 
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula", 
"related", "likely related", "Portal vein thrombosis", "related", 
"likely related", "Sepsis", "related", "likely related"), grade_1 = c(4L, 
4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA), grade_2 = c(2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L, 
2L, 1L, 2L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L, 
1L, 1L, 1L, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA), grade_3 = c(1L, 
4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L, 1L, 
0L, 1L, 1L, 0L), grade_4 = c(2L, 4L, 1L, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 
37L), class = "data.frame")
eqoofvh9

eqoofvh96#

以下是不同建议的基准(如果需要):

library(bench)
library(dplyr)
library(data.table)
library(tidyr)

df <- data.frame(
   adverse_event = c(
      "Haemorrhage", "related", "likely related",
      "Other", "related", "likely related", "Pain", "related", "likely related",
      "Subcapsular hematoma", "related", "likely related", "Ascites",
      "related", "likely related", "Hyperbilirubinemia", "related",
      "likely related", "Liver abscess", "related", "likely related",
      "Pleural effusion with drainage", "related", "likely related",
      "Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
      "related", "likely related", "Portal vein thrombosis", "related",
      "likely related", "Sepsis", "related", "likely related"
   ),
   grade_1 = c(
      4L, 4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA,
      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
      NA, NA, NA, NA, NA
   ),
   grade_2 = c(
      2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L, 2L, 1L, 2L, 0L, 1L, 1L, 0L,
      1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L, 1L, 1L, 1L, 0L, NA, NA, NA, NA,
      NA, NA, NA, NA, NA
   ),
   grade_3 = c(
      1L, 4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA,
      NA, NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L,
      1L, 0L, 1L, 1L, 0L
   ),
   grade_4 = c(
      2L, 4L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
      NA, NA, NA, NA, NA
   )
)

paul_carteron <- function(df){
   
   # where start each group
   id <- grep('related', df$adverse_event, invert = T)
   
   # size of each group
   size <- lead(id) - id
   size_of_last_group <- nrow(df) - id[length(id)] + 1 
   size[length(size)] <- size_of_last_group
   
   # add col with id
   df$id <- paste0(rep(df$adverse_event[id], times = size),
                   df$adverse_event)
   
   # order
   df <- df[order(df$id), ]
   
   # remove id
   df$id <- NULL
   
}

lang_tang_dplyr <- function(df){
   left_join(
      df,
      df %>%
         filter(!grepl('related', adverse_event)) %>%
         select(adverse_event) %>%
         arrange(adverse_event) %>%
         mutate(o = row_number())
   ) %>%
      mutate(o = data.table::nafill(o, "locf")) %>%
      arrange(o) %>%
      select(-o)
}

lang_tang_databable <- function(df) {
   setDT(df)
   
   data.table(adverse_event =  sort(df[!grepl('related',adverse_event), adverse_event]))[, o:=.I][
      df, on="adverse_event"][, o:=nafill(o, "locf")][order(o), !c("o")]
}

andre_wilberg <- function(df){
   df %>% 
      mutate(ord = !grepl("related", adverse_event), 
             grp = cumsum(ord), 
             Rank = rank(adverse_event[ord])[grp]) %>%   
      arrange(Rank) %>% 
      select(-c(ord, grp, Rank))
}

thotal <- function(df){
   df %>% 
      mutate(grp = if_else(grepl("related", adverse_event), 
                           NA_character_,
                           adverse_event)) %>% 
      fill(grp) %>% 
      nest(data = -grp) %>% 
      arrange(grp) %>% 
      unnest(cols = data) %>% 
      select(-grp)
}

results = bench::mark(
   iterations = 1000, check = FALSE, time_unit = "s", filter_gc = FALSE,
   paul_carteron = paul_carteron(df),
   lang_tang_dplyr = lang_tang_dplyr(df),
   lang_tang_databable = lang_tang_databable(df),
   andre_wilberg = andre_wilberg(df),
   thotal = thotal(df)
)

plot(results)

相关问题