将字符串的不同列模式组合成R中的新列

ftf50wuq  于 2023-09-27  发布在  其他
关注(0)|答案(4)|浏览(88)

我有一个如下所示的数据框,我想创建一个新列,只从另外两列中提取符号(如果该列中存在符号的话)。

V1                 V2
   -------------------------------------
   SYMBOL=GABRD       SYMBOL_SOURCE=HGNC
   FLAGS=cds_end_NF   SYMBOL=GABRD
   SYMBOL=MASP2       SYMBOL_SOURCE=HGNC
   FLAGS=cds_start_NF SYMBOL=GABRD
   SYMBOL=GABRD       SYMBOL_SOURCE=HGNC
   FLAGS=cds_start_NF SYMBOL=GABRD

因此结果将包含在一个新列中,只有基于模式“SYMBOL=”的符号

V3
-----
GABRD   
GABRD   
MASP2   
GABRD   
GABRD   
GABRD
zu0ti5jz

zu0ti5jz1#

下面是一个使用dplyr的方法。我们用mutate()配合across()处理V1和V2两列,找出以SYMBOL=开头的值(其余置为NA),然后用coalesce()把它们合并到一个新列中。

# For V1 and V2, keep only the values that start with "SYMBOL=" (anything else
# becomes NA) in temporary *_symbol columns, take the first non-missing one per
# row, strip the prefix into V3, and drop the temporary columns again.
dat |>
    mutate(
        across(
            V1:V2,
            \(col) if_else(grepl("^SYMBOL=", col), col, NA_character_),
            .names = "{.col}_symbol"
        ),
        V3 = sub("^SYMBOL=", "", coalesce(V1_symbol, V2_symbol))
    ) |>
    select(-c(V1_symbol, V2_symbol))
  

#                   V1                 V2    V3
# 1       SYMBOL=GABRD SYMBOL_SOURCE=HGNC GABRD
# 2   FLAGS=cds_end_NF       SYMBOL=GABRD GABRD
# 3       SYMBOL=MASP2 SYMBOL_SOURCE=HGNC MASP2
# 4 FLAGS=cds_start_NF       SYMBOL=GABRD GABRD
# 5       SYMBOL=GABRD SYMBOL_SOURCE=HGNC GABRD
# 6 FLAGS=cds_start_NF       SYMBOL=GABRD GABRD

注意:如果同一行的两列都以"SYMBOL="开头,那么coalesce()将返回第一个不是NA的值(即V1中的值)。

zqdjd7g9

zqdjd7g92#

测试数据:

# Test data: six rows of "KEY=value" strings. V1 holds either a SYMBOL or a
# FLAGS entry; V2 alternates between a SYMBOL_SOURCE and a SYMBOL entry.
df <- data.frame(
  V1 = c(
    "SYMBOL=GABRD", "FLAGS=cds_end_NF",
    "SYMBOL=MASP2", "FLAGS=cds_start_NF",
    "SYMBOL=GABRD", "FLAGS=cds_start_NF"
  ),
  V2 = rep(c("SYMBOL_SOURCE=HGNC", "SYMBOL=GABRD"), times = 3L)
)

Base R 解决方案:

# Build V3 from the first two columns of df: use the symbol from column 1 when
# it starts with "SYMBOL=", otherwise the symbol from column 2, otherwise NA.
# Vectorised over whole columns instead of apply() over rows, which would
# coerce the data frame to a matrix and call ifelse() once per row.
# The pattern is anchored so only a leading "SYMBOL=" key counts, and rows
# without any symbol yield NA instead of leaking the raw column-2 text.
df$V3 <- ifelse(
  grepl("^SYMBOL=", df[[1]]),
  sub("^SYMBOL=", "", df[[1]]),
  ifelse(
    grepl("^SYMBOL=", df[[2]]),
    sub("^SYMBOL=", "", df[[2]]),
    NA_character_
  )
)

这里:

> df
                  V1                 V2    V3
1       SYMBOL=GABRD SYMBOL_SOURCE=HGNC GABRD
2   FLAGS=cds_end_NF       SYMBOL=GABRD GABRD
3       SYMBOL=MASP2 SYMBOL_SOURCE=HGNC MASP2
4 FLAGS=cds_start_NF       SYMBOL=GABRD GABRD
5       SYMBOL=GABRD SYMBOL_SOURCE=HGNC GABRD
6 FLAGS=cds_start_NF       SYMBOL=GABRD GABRD

编辑:此方法假设V1和V2是df中的前两列。

6ojccjat

6ojccjat3#

你可以试试这个:

library(tidyverse)
# Add V3: the symbol from V1 when V1 starts with "SYMBOL=", otherwise the
# symbol from V2; NA when neither column carries one.
# Columns are referenced by bare name (data masking) rather than `.$V1`, so
# the expression also works on grouped data. Only the leading "SYMBOL="
# prefix is removed, and the fallback is a typed character NA.
df %>%
  mutate(
    V3 = coalesce(
      if_else(
        str_detect(V1, "^SYMBOL="),
        str_remove(V1, "^SYMBOL="),
        NA_character_
      ),
      if_else(
        str_detect(V2, "^SYMBOL="),
        str_remove(V2, "^SYMBOL="),
        NA_character_
      )
    )
  )
  
                  V1                 V2    V3
1       SYMBOL=GABRD SYMBOL_SOURCE=HGNC GABRD
2   FLAGS=cds_end_NF       SYMBOL=GABRD GABRD
3       SYMBOL=MASP2 SYMBOL_SOURCE=HGNC MASP2
4 FLAGS=cds_start_NF       SYMBOL=GABRD GABRD
5       SYMBOL=GABRD SYMBOL_SOURCE=HGNC GABRD
6 FLAGS=cds_start_NF       SYMBOL=GABRD GABRD

使用的数据:

# Example input: six rows of "KEY=value" strings in two columns, V1 and V2.
data.frame(
  V1 = c(
    "SYMBOL=GABRD", "FLAGS=cds_end_NF", "SYMBOL=MASP2",
    "FLAGS=cds_start_NF", "SYMBOL=GABRD", "FLAGS=cds_start_NF"
  ),
  V2 = rep(c("SYMBOL_SOURCE=HGNC", "SYMBOL=GABRD"), times = 3)
)
vvppvyoh

vvppvyoh4#

另一个简单而简短的基础R解决方案:

> # Join V1 and V2 per row, then replace each joined string with the word
> # captured after "SYMBOL=" (strings without a match are left unchanged).
> df$V3 <- gsub(".*SYMBOL\\s?=\\s*(\\w+).*", "\\1", paste(df$V1, df$V2))
> df
                  V1                 V2    V3
1       SYMBOL=GABRD SYMBOL_SOURCE=HGNC GABRD
2   FLAGS=cds_end_NF       SYMBOL=GABRD GABRD
3       SYMBOL=MASP2 SYMBOL_SOURCE=HGNC MASP2
4 FLAGS=cds_start_NF       SYMBOL=GABRD GABRD
5       SYMBOL=GABRD SYMBOL_SOURCE=HGNC GABRD
6 FLAGS=cds_start_NF       SYMBOL=GABRD GABRD

相关问题