根据R中另一列的字符串子集创建新列?

o2gm4chl  于 2023-03-10  发布在  其他
关注(0)|答案(5)|浏览(112)

我正尝试在我的数据框中创建一个新列,其值基于另一列中的字符串子集。这是我的数据框

df =structure(list(Combination = c("BRUV_Acoustic_Satellite", "BRUV_Acoustic_Satellite", 
"BRUV_Acoustic_Satellite", "BRUV_Acoustic_Satellite", "BRUV_Acoustic_Satellite", 
"BRUV_Acoustic_Satellite", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Controlled_Acoustic", "Controlled_Acoustic", 
"Controlled_Acoustic", "Controlled_Acoustic", "Controlled_Acoustic", 
"Controlled_Acoustic", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Stationary_Radio", "Stationary_Radio", 
"Stationary_Radio", "Animalborne_Satellite_Archival", "Animalborne_Satellite_Archival", 
"Animalborne_Satellite_Archival", "Animalborne_Satellite_Archival", 
"Stationary_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"Stationary_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"BRUV_Acoustic_Satellite", "BRUV_Acoustic_Satellite", "BRUV_Acoustic_Satellite", 
"Stationary_Archival", "Stationary_Archival", "Stationary_Archival", 
"Stationary_Archival", "Stationary_Acoustic_Radio_PIT", "Stationary_Acoustic_Radio_PIT", 
"Stationary_Acoustic_Radio_PIT", "Controlled_Acoustic", "Controlled_Acoustic", 
"Stationary_PIT", "Stationary_PIT", "Stationary_Acousitc_PIT", 
"Stationary_Acousitc_PIT", "Stationary_Acousitc_PIT", "BRUV_Acoustic", 
"BRUV_Acoustic", "BRUV_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"Stationary_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"Stationary_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"Stationary_Acoustic", "Stationary_Acoustic", "Stationary_Archival", 
"Stationary_Archival", "Stationary_Archival", "Stationary_Archival", 
"Stationary_Satellite", "Controlled_Acoustic", "Controlled_Acoustic", 
"Controlled_Acoustic", "Controlled_Acoustic", "BRUV_Acoustic", 
"BRUV_Acoustic", "BRUV_Acoustic", "Animalborne_Satellite", "Animalborne_Satellite", 
"Stationary_Archival", "Stationary_Archival", "Stationary_Archival", 
"Stationary_Radio_PIT", "Stationary_Radio_PIT", "Controlled_Acoustic", 
"Controlled_Acoustic", "Controlled_Acoustic", "Controlled_Acoustic", 
"Controlled_Satellite", "Controlled_Satellite", "Controlled_Satellite", 
"Controlled_Satellite", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Stationary_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival_PIT", "Animalborne_Archival_PIT", "Animalborne_Archival_PIT", 
"Animalborne_Acoustic_Archival", "Animalborne_Acoustic_Archival", 
"Animalborne_Acoustic_Archival", "Stationary_Acoustic", "Stationary_Acoustic", 
"Stationary_Acoustic", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Stationary_Acoustic_Archival", 
"Stationary_Acoustic_Archival", "Stationary_Acoustic_Archival", 
"Stationary_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"Stationary_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"Stationary_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"Stationary_Acoustic", "Stationary_Acoustic", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Stationary_Acoustic_Archival", "Stationary_Acoustic_Archival", 
"Stationary_Acoustic_Archival", "Stationary_Acoustic_Archival", 
"Animalborne_Acoustic", "Animalborne_Acoustic", "Animalborne_Acoustic", 
"Animalborne_Archival", "Animalborne_Archival", "Stationary_Acoustic_PIT", 
"Stationary_Acoustic_PIT", "Stationary_Acoustic_PIT", "BRUV_Acoustic", 
"BRUV_Acoustic", "BRUV_Acoustic", "BRUV_Acoustic", "BRUV_Acoustic", 
"BRUV_Acoustic", "Controlled_Archival", "Controlled_Archival", 
"Controlled_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Stationary_Radio", 
"Stationary_Acoustic_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Stationary_Acoustic_Archival", "Stationary_Acoustic_Archival", 
"Stationary_Acoustic_Archival", "Controlled_Acoustic", "Controlled_Acoustic", 
"Animalborne_Archival", "Animalborne_Archival", "Stationary_Acoustic", 
"Stationary_Acoustic", "Animalborne_Satellite_Archival", "Animalborne_Satellite_Archival", 
"Animalborne_Satellite_Archival", "Animalborne_Satellite_Archival", 
"Stationary_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"Stationary_Acoustic", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Stationary_Satellite", "Stationary_Satellite", 
"Stationary_Satellite", "Stationary_Satellite", "Stationary_Satellite", 
"Animalborne_Archival", "Animalborne_Archival", "Stationary_Acoustic_Radio", 
"Stationary_Acoustic_Radio", "Stationary_Acoustic_Radio", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Stationary_Acoustic", "Stationary_Acoustic", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "Animalborne_Archival", "Animalborne_Archival", 
"Animalborne_Archival", "BRUV_Acoustic", "BRUV_Acoustic", "BRUV_Acoustic", 
"BRUV_Acoustic", "BRUV_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"Stationary_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"Stationary_Acoustic", "Stationary_Acoustic", "Stationary_Acoustic", 
"Stationary_Acoustic", "Animalborne_Satellite_Archival", "Animalborne_Satellite_Archival", 
"Animalborne_Satellite_Archival", "Controlled_Acoustic", "Controlled_Acoustic", 
"Controlled_Acoustic")), class = "data.frame", row.names = c(NA, 
-245L))

我需要一个新列,其值取决于Combination列中包含的字符串:凡是包含Acoustic、Radio或PIT的值,新列应为receiver based,其余的应为non receiver based;但对于同时包含Acoustic和Satellite的值,新列应为Both
我已尝试使用以下代码执行ifelse命令

# NOTE: `==` compares each whole string with "Acoustic". None of the
# Combination values is exactly "Acoustic", so the test is FALSE for every
# row and ifelse() returns its "no" value ('Receiver Based') everywhere.
df$Type = ifelse(df$Combination == "Acoustic", 'Non Receiver Based', 'Receiver Based')

但是它把所有行都标成了receiver based,我不知道如何把上面提到的所有条件结合起来。

yh2wf1be

yh2wf1be1#

可以使用grepl()检查字符串中是否出现某种模式,然后使用if-else语句确定属于哪种情况。由于if()没有矢量化,因此需要将其包装在Vectorize()中,以便在mutate()中使用。

library(tidyverse)

# Classify tag-combination strings as "receiver based", "non receiver based"
# or "Both".
#
# `string` is a character vector. Vectorize() applies the scalar function to
# each element, so the result is a character vector with one label per input
# (named by the input strings).
#
# NOTE: this name masks base::match() in the environment where it is defined.
match <- Vectorize(function(string) {
  # grepl() is case-sensitive by default, so the pattern has to spell "Radio"
  # the way the data does (e.g. "Stationary_Radio").
  if (!grepl("Acoustic|Radio|PIT", string)) {
    "non receiver based"
  } else if (grepl("Acoustic", string) && grepl("Satellite", string)) {
    # Scalar condition inside if(): use the short-circuit `&&`.
    "Both"
  } else {
    "receiver based"
  }
})

# Add the classification as column `new_var` (result is printed, not stored).
mutate(df, new_var = match(Combination))
sq1bmfud

sq1bmfud2#

使用grepl检查字符串中是否出现了其中一个单词。模式由以|分隔的单词组成,即检查字符串中是否出现了以下单词之一:

# Small example input covering the three possible outcomes.
min_string <- c(
  "BRUV_Acoustic_Satellite",
  "Stationary_Radio_PIT",
  "Animalborne_Satellite_Archival"
)

# Alternation of the receiver-based tag names. grepl() is case-sensitive by
# default, so "Radio" must be capitalised exactly as in the data.
pattern <- paste0(c("Acoustic", "Radio", "PIT"), collapse = "|")

# No receiver tag -> "Non Receiver Based"; Acoustic together with Satellite
# -> "Both"; every other receiver tag -> "Receiver Based".
ifelse(
  !grepl(pattern, min_string),
  "Non Receiver Based",
  ifelse(
    grepl("Acoustic", min_string) & grepl("Satellite", min_string),
    "Both",
    "Receiver Based"
  )
)
#[1] "Both"               "Receiver Based"     "Non Receiver Based"

dplyr的另一种选择是使用case_when,这可能更容易理解:

library(dplyr) 
# case_when() tries the conditions in order and uses the first one that is
# TRUE. The labels match the nested ifelse() version so both variants return
# the same values.
case_when(
  !grepl(pattern, min_string) ~ "Non Receiver Based",
  grepl("Acoustic", min_string) & grepl("Satellite", min_string) ~ "Both",
  # Everything left matched `pattern`, so a plain fallback is enough.
  TRUE ~ "Receiver Based"
)
zpf6vheq

zpf6vheq3#

您可以使用tidyverse中的一些函数。下面,我创建了一个名为new_col的新列,它是您想要的输出:

library(tidyverse)

# Two-pass labelling of `Combination` into `new_col`:
#   1. any of Acoustic / Radio / PIT -> "receiver based",
#      otherwise "non receiver based";
#   2. Acoustic together with Satellite -> overwritten with "Both".
# str_detect() already returns a logical vector, so no `== T` comparison is
# needed (and `T` can be reassigned, unlike `TRUE`).
df %>%
  mutate(
    new_col = if_else(
      str_detect(Combination, "Acoustic|Radio|PIT"),
      "receiver based",
      "non receiver based"
    ),
    new_col = if_else(
      str_detect(Combination, "Acoustic") &
        str_detect(Combination, "Satellite"),
      "Both",
      new_col
    )
  )
zf9nrax1

zf9nrax14#

像这样的东西对你有帮助吗?

# Label each row by the first matching pattern; rows matching none get NA.
# "Acoustic_Satellite" is tested first, otherwise the plain "Acoustic" rule
# would claim those rows.
df1 <- df %>%
  mutate(
    new_col = case_when(
      str_detect(Combination, "Acoustic_Satellite") ~ "both",
      str_detect(Combination, "Acoustic|Radio|PIT") ~ "Receiver Based"
    )
  )

# Fill the NAs in `new_col` only, leaving missing values in any other column
# untouched.
df2 <- df1 %>%
  mutate(new_col = coalesce(new_col, "non receiver based"))

这是一个相当粗糙的解决方案,我相信有人会找到更好的办法;它只有在数据集中Acoustic_Satellite始终作为一个连续的词出现时才有效,但它能完成任务。

js5cn81o

js5cn81o5#

一个简单的逐步方法可以是:

# Start every row with the default label, then overwrite the rows that match.
df$Type <- "non receiver based"
i <- grepl("Acoustic", df$Combination)
# grepl() is case-sensitive by default and the data spells the tag "Radio".
df$Type[i | grepl("Radio|PIT", df$Combination)] <- "receiver based"
# Assigned last so that "Both" wins over "receiver based" for rows that
# contain Acoustic as well as Satellite.
df$Type[i & grepl("Satellite", df$Combination)] <- "Both"
rm(i)

df
#                       Combination               Type
#1          BRUV_Acoustic_Satellite               Both
#2          BRUV_Acoustic_Satellite               Both
#3          BRUV_Acoustic_Satellite               Both
#4          BRUV_Acoustic_Satellite               Both
#5          BRUV_Acoustic_Satellite               Both
#6          BRUV_Acoustic_Satellite               Both
#7             Animalborne_Archival non receiver based
#8             Animalborne_Archival non receiver based
#9             Animalborne_Archival non receiver based
#10             Controlled_Acoustic     receiver based
#...

仅供娱乐的基准测试:

library(tidyverse)

# Classify tag-combination strings as "receiver based", "non receiver based"
# or "Both". Vectorize() applies the scalar function to each element of
# `string` and returns one label per input (named by the input strings).
match <- Vectorize(function(string) {  #Maybe another name would be better
  # grepl() is case-sensitive by default, so the pattern has to spell "Radio"
  # the way the data does (e.g. "Stationary_Radio").
  if (!grepl("Acoustic|Radio|PIT", string)) {
    "non receiver based"
  } else if (grepl("Acoustic", string) && grepl("Satellite", string)) {
    # Scalar condition inside if(): use the short-circuit `&&`.
    "Both"
  } else {
    "receiver based"
  }
})

# Time every answer on the same `df`. Each variant runs inside local() so its
# temporaries and its modified copy of `df` do not leak between expressions.
# check = FALSE because the answers use different column names and label
# spellings, so their results are not identical.
# grepl() is case-sensitive by default; the patterns spell "Radio" as the
# data does so that every variant matches the same rows.
bench::mark(
  check = FALSE,

  Maël1 = local({
    pattern <- paste0(c("Acoustic", "Radio", "PIT"), collapse = "|")
    cbind(
      df,
      Type = ifelse(
        !grepl(pattern, df$Combination),
        "Non Receiver Based",
        ifelse(
          grepl("Acoustic", df$Combination) &
            grepl("Satellite", df$Combination),
          "Both",
          "Receiver Based"
        )
      )
    )
  }),

  Maël2 = local({
    pattern <- paste0(c("Acoustic", "Radio", "PIT"), collapse = "|")
    cbind(
      df,
      Type = case_when(
        !grepl(pattern, df$Combination) ~ "Non Receiver",
        grepl("Acoustic", df$Combination) &
          grepl("Satellite", df$Combination) ~ "Both",
        grepl(pattern, df$Combination) ~ "Receiver"
      )
    )
  }),

  "Lukas Unterschuetz" = local({
    df %>% mutate(new_var = match(Combination))
  }),

  Leonardo19 = local({
    df %>%
      mutate(
        new_col = if_else(
          str_detect(Combination, "Acoustic") |
            str_detect(Combination, "Radio") |
            str_detect(Combination, "PIT"),
          "receiver based",
          "non receiver based"
        )
      ) %>%
      mutate(
        new_col = if_else(
          str_detect(Combination, "Acoustic") &
            str_detect(Combination, "Satellite"),
          "Both",
          new_col
        )
      )
  }),

  procerus = local({
    df1 <- df %>%
      mutate(
        new_col = case_when(
          str_detect(Combination, "Acoustic_Satellite") ~ "both",
          str_detect(Combination, "Acoustic") ~ "Receiver Based",
          str_detect(Combination, "Radio") ~ "Receiver Based",
          str_detect(Combination, "PIT") ~ "Receiver Based"
        )
      )
    replace(df1, is.na(df1), "non receiver based")
  }),

  GKi = local({
    df$Type <- "non receiver based"
    i <- grepl("Acoustic", df$Combination, fixed = TRUE)
    df$Type[i | grepl("Radio|PIT", df$Combination)] <- "receiver based"
    df$Type[i & grepl("Satellite", df$Combination, fixed = TRUE)] <- "Both"
    rm(i)
    df
  })
)

结果

expression              min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
  <bch:expr>         <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
1 Maël1              531.63µs 562.63µs     1751.    31.1KB     2.02   865     1
2 Maël2              776.25µs 820.19µs     1204.    44.4KB     2.02   596     1
3 Lukas Unterschuetz   5.52ms   5.76ms      172.    11.8KB     4.13    83     2
4 Leonardo19           2.91ms   3.07ms      307.    50.6KB     6.26   147     3
5 procerus             1.89ms   2.02ms      457.    66.9KB     6.24   220     3
6 GKi                 214.9µs 231.51µs     4241.    11.8KB     2.02  2099     1

相关问题