将 Dataframe 某一行中的字符串按固定字符数拆分,并将得到的片段依次存储在后续行中

rbpvctlc  于 2023-04-03  发布在  其他
关注(0)|答案(4)|浏览(139)

我有以下数据框:

# Sample FASTA-like input: header lines (starting with ">") interleaved
# with their sequence lines, all stored in the single column V1.
df <- data.frame(
  V1 = c(rbind(
    c(">A1_[Er]", ">B2_[Br]", ">C3_[Gh]"),
    c("aaaabbbcccc", "ddddeeeeeff", "ggggggghhhhhiiiiijjjjjj")
  ))
)

我想按照固定的字符数(本例中为两个)拆分字符串,并将各个片段放在新的行中。以“>”符号开头的字符串所在的行不参与拆分,应原样保留。结果 Dataframe 应该如下所示:

# Expected result: header lines are kept whole and each sequence is cut
# into consecutive two-character pieces. A sequence with an odd number of
# characters ends in a one-character piece (the last sequence has 23
# characters, so its final piece is "j", not "jj").
df1 <- data.frame(
  V1 = c(
    ">A1_[Er]", "aa", "aa", "bb", "bc", "cc", "c",
    ">B2_[Br]", "dd", "dd", "ee", "ee", "ef", "f",
    ">C3_[Gh]", "gg", "gg", "gg", "gh", "hh", "hh",
    "ii", "ii", "ij", "jj", "jj", "j"
  )
)

我试过在一个子集化的df上使用separate_longer_position()函数,如下所示:

# Attempt from the question: subset() removes every row whose V1 matches ">"
# before the split, so those rows are absent from the result; the remaining
# strings are cut into pieces of width 2, one piece per row.
# NOTE(review): `%like%` is not base R (presumably data.table's) and
# separate_longer_position() presumably comes from tidyr — confirm which
# packages are attached.
separate_longer_position(subset(df, !df$V1 %like% ">"), V1, 2)

我的方法确实分割了所需的字符串,但也把以“>”开头的字符串所在的行从结果 Dataframe 中删除了。
顺便说一句,这确实是一个FASTA格式,但出于教育目的,我不想使用像Biostrings这样的专用软件包来解决这个问题。
请指示。

byqmnocz

byqmnocz1#

你可以试试regmatches

# Build the result column: header lines (starting with ">") are kept whole,
# every other line is cut into consecutive pieces of at most two characters.
df1 <- data.frame(
  V1 = unlist(
    lapply(df$V1, function(x) {
      if (startsWith(x, ">")) {
        return(x)
      }
      # `.` rather than `\\w`: the split must be by character count, so
      # non-word characters (e.g. "-" or "*") are kept in their pieces
      # instead of being dropped and shifting the following pieces.
      regmatches(x, gregexpr(".{1,2}", x))[[1]]
    })
  )
)

并获得了

> df1
         V1
1  >A1_[Er]
2        aa
3        aa
4        bb
5        bc
6        cc
7         c
8  >B2_[Br]
9        dd
10       dd
11       ee
12       ee
13       ef
14        f
15 >C3_[Gh]
16       gg
17       gg
18       gg
19       gh
20       hh
21       hh
22       ii
23       ii
24       ij
25       jj
26       jj
27        j
xvw2m8pv

xvw2m8pv2#

虽然我可能会迟到,但这里有一个tidyverse解决方案可能值得考虑:

library(tidyverse)
df %>%
  # A line starting with ">" is matched in full by `^>.*` and therefore stays
  # in one piece; any other line is consumed in pieces of up to two
  # characters by `.{1,2}`. Testing for the leading ">" (instead of looking
  # for two lower-case letters) keeps headers such as ">seq1" intact and
  # also splits upper-case sequences.
  mutate(V1 = str_extract_all(V1, "^>.*|.{1,2}")) %>%
  # cast the listed pieces in long format, one piece per row:
  unnest_longer(V1)
# A tibble: 27 × 1
   V1      
   <chr>   
 1 >A1_[Er]
 2 aa      
 3 aa      
 4 bb      
 5 bc      
 6 cc      
 7 c       
 8 >B2_[Br]
 9 dd      
10 dd      
# … with 17 more rows
qyyhg6bp

qyyhg6bp3#

我们可以使用tidyverse作为

library(dplyr)
library(tidyr)
library(stringr)
df %>%
  # Header lines (starting with ">") are left untouched. In every other line
  # a "," is inserted after each pair of characters that is still followed
  # by at least one character; the look-ahead `(?=.)` prevents a trailing
  # "," (and thus an empty extra row) when the line has an even length.
  # NOTE(review): "," is used as the split marker, so a line that itself
  # contains "," would also be split there.
  mutate(V1 = if_else(str_detect(V1, "^>"),
                      V1,
                      str_replace_all(V1, "(..)(?=.)", "\\1,"))) %>%
  # one row per comma-separated piece:
  separate_longer_delim(V1, delim = ",")
  • 输出
V1
1  >A1_[Er]
2        aa
3        aa
4        bb
5        bc
6        cc
7         c
8  >B2_[Br]
9        dd
10       dd
11       ee
12       ee
13       ef
14        f
15 >C3_[Gh]
16       gg
17       gg
18       gg
19       gh
20       hh
21       hh
22       ii
23       ii
24       ij
25       jj
26       jj
27        j
7ajki6be

7ajki6be4#

您可以使用 gregexpr 配合正则表达式 `^>.*|.{1,2}`:它要么匹配以 > 开头的整行,要么匹配长度为 1 到 2 个字符的片段;然后使用 regmatches 提取匹配项。

# One pass over the whole column: a line starting with ">" is matched in
# full by `^>.*`; any other line is consumed in pieces of up to two
# characters by `.{1,2}`. The per-line matches are then flattened.
with(df, unlist(regmatches(V1, gregexpr("^>.*|.{1,2}", V1))))
# [1] ">A1_[Er]" "aa"       "aa"       "bb"       "bc"       "cc"      
# [7] "c"        ">B2_[Br]" "dd"       "dd"       "ee"       "ee"      
#[13] "ef"       "f"        ">C3_[Gh]" "gg"       "gg"       "gg"      
#[19] "gh"       "hh"       "hh"       "ii"       "ii"       "ij"      
#[25] "jj"       "jj"       "j"

或者使用 strsplit 配合正则表达式 `(?<=..)`(需要 perl = TRUE)将字符串按固定数量的字符拆分,然后使用 `[<-`(或 replace)把以 > 开头的字符串放回原来的位置。

# Positions of the header lines (those starting with ">").
i <- which(startsWith(df$V1, ">"))
# Blank out the headers so strsplit() has nothing to cut there, split every
# line after each pair of characters, then put the original header strings
# back at their positions and flatten the list.
unlist(
  replace(
    strsplit(replace(df$V1, i, ""), split = "(?<=..)", perl = TRUE),
    i,
    df$V1[i]
  )
)
# [1] ">A1_[Er]" "aa"       "aa"       "bb"       "bc"       "cc"      
# [7] "c"        ">B2_[Br]" "dd"       "dd"       "ee"       "ee"      
#[13] "ef"       "f"        ">C3_[Gh]" "gg"       "gg"       "gg"      
#[19] "gh"       "hh"       "hh"       "ii"       "ii"       "ij"      
#[25] "jj"       "jj"       "j"

或者使用lapply

# Handle the lines one by one: anything that is not a header is split after
# each pair of characters, headers (starting with ">") pass through as-is.
unlist(
  lapply(df$V1, function(line) {
    if (!startsWith(line, ">")) {
      return(strsplit(line, "(?<=..)", perl = TRUE))
    }
    line
  })
)
# [1] ">A1_[Er]" "aa"       "aa"       "bb"       "bc"       "cc"      
# [7] "c"        ">B2_[Br]" "dd"       "dd"       "ee"       "ee"      
#[13] "ef"       "f"        ">C3_[Gh]" "gg"       "gg"       "gg"      
#[19] "gh"       "hh"       "hh"       "ii"       "ii"       "ij"      
#[25] "jj"       "jj"       "j"

基准

library(dplyr)  #For akrun and Chris Ruehlemann
library(tidyr)
library(stringr)

# Time every approach from the answers on the same `df`. Each expression
# yields the pieces as a plain vector (the tidyverse pipelines extract the
# column with `.$V1`).
# NOTE(review): bench::mark() presumably compares the results of all
# expressions by default, which would be why they are brought to the same
# shape — confirm against the bench documentation.
bench::mark(
# answer 4, variant 1: single regex over the whole column
gregexpr = unlist(regmatches(df$V1, gregexpr("^>.*|.{1,2}", df$V1))),
# answer 4, variant 2: blank the header lines, split, re-insert the headers
strsplit = {i <- which(startsWith(df$V1, ">"))
  unlist(`[<-`(strsplit(`[<-`(df$V1, i, ""), "(?<=..)", perl=TRUE), i, df$V1[i]))},
# answer 4, variant 3: per-line branch on the leading ">"
lapply = {unlist(lapply(df$V1, \(x)
  if(startsWith(x, ">")) x else strsplit(x, "(?<=..)", perl=TRUE)))},
# answer 1: per-line branch, pieces extracted with `\\w{1,2}`
ThomasIsCoding = {with(df, unlist( lapply(V1, function(x) {
          if (startsWith(x, ">")) {x
          } else { regmatches(x, gregexpr("\\w{1,2}", x)) } }))) },
"Chris Ruehlemann" = {df %>%  # answer 2: splits lines containing two [a-z] in a row rather than skipping lines that start with >
  mutate(V1 = ifelse(str_detect(V1, "[a-z]{2,}"),
                     str_extract_all(V1, "..?"),
                     V1)) %>%
  unnest_longer(V1) %>% .$V1},
akrun = {df %>%  # answer 3: marks pairs of [a-z] rather than skipping lines that start with >
  mutate(V1 = str_replace_all(V1, "([a-z]{2})", "\\1,")) %>% 
  separate_longer_delim(V1, delim = ",")  %>% .$V1}
)

结果

expression            min  median itr/s…¹ mem_al…² gc/se…³ n_itr  n_gc total…⁴
  <bch:expr>       <bch:tm> <bch:t>   <dbl> <bch:by>   <dbl> <int> <dbl> <bch:t>
1 gregexpr          41.68µs  45.3µs  21454.  48.82KB    37.8  9645    17   450ms
2 strsplit          12.26µs 13.18µs  75013.     264B    15.0  9998     2   133ms
3 lapply            20.46µs 21.54µs  46026.   9.44KB    18.4  9996     4   217ms
4 ThomasIsCoding    83.59µs 88.65µs  11107.  24.54KB    25.7  5184    12   467ms
5 Chris Ruehlemann   3.61ms  3.69ms    269.  12.22KB    23.0   117    10   435ms
6 akrun              3.01ms  3.06ms    324.  11.11KB    20.0   146     9   450ms

在这种情况下,使用strsplit是最快的,并且使用最少的内存量。

相关问题