regex 正则表达式模式:在R中匹配以点分隔的字符串或普通字符串的精确长度

kognpnkq  于 2023-06-07  发布在  其他
关注(0)|答案(2)|浏览(151)

我想得到所有的字符串,看起来像这样:

# Strings that must be accepted. Each trailing comment gives the lengths of
# the dot-separated segments.
ph <- c(
  # second segment has 2 characters
  "ioL421.63",                 # 6.2
  "jur421.73.0o4435",          # 6.2.6
  "koL421.2p.9i4675.k23",      # 6.2.6.3
  "6775po.78.678959.p2p.913",  # 6.2.6.3.3
  "193485.k2.l3.34.67",        # 6.2.2.2.2
  # second segment has 1 character
  "ioL421.6",                  # 6.1
  "jur421.3.0o4",              # 6.1.3
  "koL421.2.9i5.k2390",        # 6.1.3.5
  "6775po.8.678.p2p91.674e",   # 6.1.3.5.4
  # undotted strings: only these exact lengths are valid
  "842f45",                    # 6
  "234567890123567hk",         # 17
  "234567890123567hkiq",       # 19
  "234567890123567hkiq5"       # 20
)

以下是无效字符串:

# Strings that must be rejected.
invalid_ph <- c(
  # undotted strings whose length is not 6, 17, 19 or 20
  "23289jh",                      # 7 chars
  "2382h",                        # 5 chars
  "2934567890123567h8",           # 18 chars
  "234567890123q3",
  "234567890123567hkiq57878787",
  # remaining cases: wrong segment lengths, other separators, trailing dot
  "ZX3.235.9845.3843924.39403",
  "sjkfuju2rwrrlnmld828384230403208402834fs",
  "TY5648.235.123456",
  "ABC3.235.9845",
  "361 234 4356",
  "a1.02.b3.00",
  "01.01.01",
  "23289jhd",
  "01",
  "01.02",
  "01.01.01",                     # listed twice in the original data
  "aa.bb",
  "ac.21",
  "aa.01-02",
  "123.2.10.834.18934",
  "a1."
)

# Put the invalid samples after the valid ones so one vector holds every case.
ph <- c(ph, invalid_ph)

我第一次使用正则表达式,写出了下面这些模式,想知道如何把它们合并起来,并修正其中不能产生正确输出的那些。
我使用stringr包来提取字符串:

library(stringr)

# The asker's attempts: one str_extract() call per number of segments. Every
# pattern is anchored with ^...$ and uses [a-zA-Z0-9] for each segment; the
# numbers in the comments below are the segment lengths a pattern accepts.

# 6.2 or 6.1
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1})$")

# 6.2.6 or 6.1.3
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{6}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1}([.])[a-zA-Z0-9]{3})$")

# 6.2.6 or 6.1.3 (identical to the previous call)
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{6}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1}([.])[a-zA-Z0-9]{3})$")

# 6.2.6.3 or 6.1.3.5
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{3}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1}([.])[a-zA-Z0-9]{3}([.])[a-zA-Z0-9]{5})$")

# 6.2.6.3.3 or 6.1.3.5.4
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{3}([.])[a-zA-Z0-9]{3}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1}([.])[a-zA-Z0-9]{3}([.])[a-zA-Z0-9]{5}([.])[a-zA-Z0-9]{4})$")

# 6.2 or 6.1 (repeats the first pattern)
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1})$")

# 6.2 or 6.1 (repeats the first pattern again)
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}|[a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{1})$")

# 6.2.2.2.2
# NOTE(review): none of these patterns covers the undotted strings of length
# 6, 17, 19 or 20 that `ph` also lists as valid.
str_extract(ph, "^([a-zA-Z0-9]{6}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{2}([.])[a-zA-Z0-9]{2})$")
vbopmzt1

vbopmzt11#

使用函数创建所有有效模式:

#' Build an anchored regex for separator-delimited, fixed-length segments
#'
#' @param len Segment lengths, one per segment. Numeric values must be finite,
#'   non-negative whole numbers. Character values are inserted into the
#'   quantifier verbatim (so `"1,2"` yields `{1,2}`).
#' @param pat Regex matching one allowed character.
#' @param sep Regex matching the separator between segments.
#' @return A single string of the form `^(pat){len1}sep(pat){len2}...$`.
f <- function(len = 1L, pat = "[A-Za-z0-9]", sep = "\\.") {
  if (is.numeric(len)) {
    # A fractional, negative or missing count cannot form a valid quantifier,
    # so fail here rather than hand a broken pattern to the regex engine.
    stopifnot(all(is.finite(len)), all(len >= 0), all(len == round(len)))
    # "%s" would print large counts in scientific notation ("1e+05").
    len <- sprintf("%.0f", len)
  }
  p <- paste0(sprintf("(%s){%s}", pat, len), collapse = sep)
  sprintf("^%s$", p)
}

f()
# [1] "^([A-Za-z0-9]){1}$"
f(len = c(1, 2))
# [1] "^([A-Za-z0-9]){1}\\.([A-Za-z0-9]){2}$"
f(len = c(6, 2, 6))
# [1] "^([A-Za-z0-9]){6}\\.([A-Za-z0-9]){2}\\.([A-Za-z0-9]){6}$"

# Accepted layouts. Each element lists the lengths of the dot-separated
# segments; a single number is an undotted string of exactly that length.
len <- list(
  c(6, 2),
  c(6, 2, 6),
  c(6, 2, 6, 3),
  c(6, 2, 6, 3, 3),
  c(6, 2, 2, 2, 2),
  c(6, 1),
  c(6, 1, 3),
  c(6, 1, 3, 5),
  c(6, 1, 3, 5, 4),
  6, 17, 19, 20
)

# One anchored pattern per layout, OR-ed into a single regex. vapply() pins
# the result to exactly one string per layout, where sapply() would silently
# change its return type on unexpected input.
pat <- paste0(vapply(len, f, character(1)), collapse = "|")
data.frame(string = ph, valid = grepl(pat, ph))

#                                      string valid
# 1                                 ioL421.63  TRUE
# 2                          jur421.73.0o4435  TRUE
# 3                      koL421.2p.9i4675.k23  TRUE
# 4                  6775po.78.678959.p2p.913  TRUE
# 5                        193485.k2.l3.34.67  TRUE
# 6                                  ioL421.6  TRUE
# 7                              jur421.3.0o4  TRUE
# 8                        koL421.2.9i5.k2390  TRUE
# 9                   6775po.8.678.p2p91.674e  TRUE
# 10                                   842f45  TRUE
# 11                        234567890123567hk  TRUE
# 12                      234567890123567hkiq  TRUE
# 13                     234567890123567hkiq5  TRUE
# 14                                  23289jh FALSE
# 15                                    2382h FALSE
# 16                       2934567890123567h8 FALSE
# 17                           234567890123q3 FALSE
# 18              234567890123567hkiq57878787 FALSE
# 19               ZX3.235.9845.3843924.39403 FALSE
# 20 sjkfuju2rwrrlnmld828384230403208402834fs FALSE
# 21                        TY5648.235.123456 FALSE
# 22                            ABC3.235.9845 FALSE
# 23                             361 234 4356 FALSE
# 24                              a1.02.b3.00 FALSE
# 25                                 01.01.01 FALSE
# 26                                 23289jhd FALSE
# 27                                       01 FALSE
# 28                                    01.02 FALSE
# 29                                 01.01.01 FALSE
# 30                                    aa.bb FALSE
# 31                                    ac.21 FALSE
# 32                                 aa.01-02 FALSE
# 33                       123.2.10.834.18934 FALSE
# 34                                      a1. FALSE
nkcskrwz

nkcskrwz2#

将其拆分为2个正则表达式模式;不过测试结果的可靠程度取决于不匹配字符串列表是否足够全面:

library(stringr)

# Dotted strings: 6 alphanumerics, ".", then exactly one of the allowed
# layouts. [A-Za-z0-9] is spelled out because \w would also accept "_", and
# the tail is enumerated so that arbitrary segment lengths or runs of dots
# after the second segment no longer match.
regex_01 <- paste0(
  "^[A-Za-z0-9]{6}\\.(?:",
  # 6.2, 6.2.6, 6.2.6.3, 6.2.6.3.3
  "[A-Za-z0-9]{2}(?:\\.[A-Za-z0-9]{6}(?:\\.[A-Za-z0-9]{3}){0,2})?",
  # 6.2.2.2.2
  "|[A-Za-z0-9]{2}(?:\\.[A-Za-z0-9]{2}){3}",
  # 6.1, 6.1.3, 6.1.3.5, 6.1.3.5.4
  "|[A-Za-z0-9](?:\\.[A-Za-z0-9]{3}(?:\\.[A-Za-z0-9]{5}(?:\\.[A-Za-z0-9]{4})?)?)?",
  ")$"
)

# Undotted strings: exactly 6, 17, 19 or 20 alphanumerics.
regex_02 <- "^(?:[A-Za-z0-9]{6}|[A-Za-z0-9]{17}|[A-Za-z0-9]{19,20})$"

# test on matching strings:
# NOTE: str_ok and str_nok are assigned in the test-string listing further
# down; run those assignments before these calls.
# Lines starting with "#>" are the output recorded by reprex; the text inside
# <...> is the matched part of each string.
str_view(str_ok, regex_01)
#> [1] │ <ioL421.63>
#> [2] │ <jur421.73.0o4435>
#> [3] │ <koL421.2p.9i4675.k23>
#> [4] │ <6775po.78.678959.p2p.913>
#> [5] │ <193485.k2.l3.34.67>
#> [6] │ <ioL421.6>
#> [7] │ <jur421.3.0o4>
#> [8] │ <koL421.2.9i5.k2390>
#> [9] │ <6775po.8.678.p2p91.674e>
str_view(str_ok, regex_02)
#> [10] │ <842f45>
#> [11] │ <234567890123567hk>
#> [12] │ <234567890123567hkiq>
#> [13] │ <234567890123567hkiq5>

# test on non-matching strings:
# (the reprex recorded no output for either of these two calls)
str_view(str_nok, regex_01)
str_view(str_nok, regex_02)

测试字符串:

# Strings both patterns together must accept. Trailing comments give the
# lengths of the dot-separated segments.
str_ok <- c(
  # dotted, 2-character second segment
  "ioL421.63",                 # 6.2
  "jur421.73.0o4435",          # 6.2.6
  "koL421.2p.9i4675.k23",      # 6.2.6.3
  "6775po.78.678959.p2p.913",  # 6.2.6.3.3
  "193485.k2.l3.34.67",        # 6.2.2.2.2
  # dotted, 1-character second segment
  "ioL421.6",                  # 6.1
  "jur421.3.0o4",              # 6.1.3
  "koL421.2.9i5.k2390",        # 6.1.3.5
  "6775po.8.678.p2p91.674e",   # 6.1.3.5.4
  # undotted: only these exact lengths
  "842f45",                    # 6
  "234567890123567hk",         # 17
  "234567890123567hkiq",       # 19
  "234567890123567hkiq5"       # 20
)

# Strings neither pattern may accept.
str_nok <- c(
  # undotted strings whose length is not 6, 17, 19 or 20
  "23289jh",                      # 7 chars
  "2382h",                        # 5 chars
  "2934567890123567h8",           # 18 chars
  "234567890123q3",
  "234567890123567hkiq57878787",
  # remaining cases: wrong segment lengths, other separators, trailing dot
  "ZX3.235.9845.3843924.39403",
  "sjkfuju2rwrrlnmld828384230403208402834fs",
  "TY5648.235.123456",
  "ABC3.235.9845",
  "361 234 4356",
  "a1.02.b3.00",
  "01.01.01",
  "23289jhd",
  "01",
  "01.02",
  "01.01.01",                     # listed twice in the original data
  "aa.bb",
  "ac.21",
  "aa.01-02",
  "123.2.10.834.18934",
  "a1."
)

由reprex v2.0.2创建于2023-06-06

相关问题