regex:如何统计字符串中属于给定字符向量的字符个数

1yjd4xko  于 2022-11-18  发布在  其他
关注(0)|答案(3)|浏览(139)

我有以下向量:

v1 <- c("R", "H", "K")  # * (asterisk sign)
v2 <- c("D", "E")       # + (plus sign)
v3 <- c("A")            # - (minus sign)

给定另一个字符串,我想分别统计其中有多少个字符属于 v1、v2 和 v3。例如:

x1 <- "GMRRRARRRS"
#        ***-***
# v1_count = 6
# v2_count = 0
# v3_count = 1

x2 <- "KMRDFRHRAE"
#      * *+ ***-+
# v1_count = 5
# v2_count = 2
# v3_count = 1

因此,属于向量的任何字符都将作为单个计数进行计数。
最终输出应为数据框(data frame)或 tibble:

                R,H,K       D,E      A
GMRRRARRRS      6           0        1
KMRDFRHRAE      5           2        1

我如何用R实现这一点?

juzqafwq

juzqafwq1#

我想我们可以简单一点。也许只是:

library(tidyverse)

x1 <- "GMRRRARRRS"
x2 <- "KMRDFRHRAE"

# Count, for every element of `string`, how many characters belong to each of
# three fixed groups: R/H/K, D/E and A.
#
# string: character vector of sequences to inspect.
# Returns a tibble with one row per input string and the columns
# `text` (the input), `RHK`, `DE` and `A` (integer match counts).
count_v <- function(string) {
  # Join a set of characters into a regex alternation, e.g. "R|H|K".
  as_alternation <- function(chars) paste(chars, collapse = "|")

  rhk_pattern <- as_alternation(c("R", "H", "K"))
  de_pattern <- as_alternation(c("D", "E"))

  tibble(
    text = string,
    RHK = str_count(string, rhk_pattern),
    DE = str_count(string, de_pattern),
    A = str_count(string, "A")
  )
}

count_v(c(x1,x2))
#> # A tibble: 2 x 4
#>   text         RHK    DE     A
#>   <chr>      <int> <int> <int>
#> 1 GMRRRARRRS     6     0     1
#> 2 KMRDFRHRAE     5     2     1

或者更灵活一点:

library(tidyverse)

x1 <- "GMRRRARRRS"
x2 <- "KMRDFRHRAE"

# Count, for every element of `string`, how many characters belong to each
# group of characters listed in `checks`.
#
# string: character vector of sequences to inspect.
# checks: list of character vectors; each vector is one group of characters
#         to count. The group's column is named after its members pasted
#         together (c("R", "H", "K") becomes "RHK").
# Returns a tibble with a `text` column holding the input strings followed by
# one integer count column per group.
count_v <- function(string, checks) {
  # Members of a group are meant as literal characters, so regex
  # metacharacters such as "+", "*" or "." are backslash-escaped before the
  # alternation is built; otherwise "+" is a syntax error and "." matches
  # every character.
  escape_regex <- function(chars) {
    str_replace_all(chars, "([.^$\\\\|*+?{}\\[\\]()])", "\\\\\\1")
  }

  counts <- map_dfc(checks, \(x) {
    pattern <- paste(escape_regex(x), collapse = "|")
    tibble("{paste(x, collapse = '')}" := str_count(string, pattern))
  })

  bind_cols(tibble(text = string), counts)
}

count_v(string = c(x1, x2),
        checks = list(c("R", "H", "K"),
                      c("D", "E"),
                      c("A", "S"),
                      c("K", "F", "G")))
#> # A tibble: 2 x 5
#>   text         RHK    DE    AS   KFG
#>   <chr>      <int> <int> <int> <int>
#> 1 GMRRRARRRS     6     0     2     1
#> 2 KMRDFRHRAE     5     2     1     2
rjzwgtxy

rjzwgtxy2#

library(tidyverse)

tibble 或数据框中的示例数据

# A tibble: 2 x 1
  string    
  <chr>     
1 GMRRRARRRS
2 KMRDFRHRAE

创建函数以提取文本信息

# Count how many characters of a single string fall into each of three fixed
# groups: R/H/K, D/E and A.
#
# string: the string to inspect. Only its first element is examined, so call
#         this once per string (e.g. through map()).
# Returns a one-row tibble with the integer columns `RHK`, `DE` and `A`.
get_count <- function(string) {
  rhk_set <- c("R", "H", "K")
  de_set <- c("D", "E")
  a_set <- c("A")

  # Split into individual characters; str_split() returns a list, and only
  # its first entry is used.
  chars <- str_split(string, "")[[1]]

  # Summing the logical membership vector gives the number of matches.
  tibble(
    RHK = sum(chars %in% rhk_set),
    DE = sum(chars %in% de_set),
    A = sum(chars %in% a_set)
  )
}

使用该函数,通过 mutate() 新建列 data,再用 unnest() 将其展开:

# Sample input: one string per row. The pipeline below needs `df` to exist.
df <- tibble(string = c("GMRRRARRRS", "KMRDFRHRAE"))

# Apply get_count() to each string (yielding a one-row tibble per element),
# then unnest so the counts become regular columns next to `string`.
df %>% 
  mutate(data = map(string, get_count)) %>% 
  unnest(everything())

# A tibble: 2 x 4
  string       RHK    DE     A
  <chr>      <int> <int> <int>
1 GMRRRARRRS     6     0     1
2 KMRDFRHRAE     5     2     1

相关问题