从R中的混合格式化名称列中提取姓氏

9udxz4iz  于 2023-03-27  发布在  其他
关注(0)|答案(7)|浏览(168)

我需要从混合格式的名字中分离出姓氏。

# Example data: names appear as "First Last", "Last, First", or a single name.
df <- data.frame(
  id = c(1, 2, 3, 4, 5),
  name = c("Jack Smith", "May, Flora", "Jackson", "Baker, Gavin", "Walls")
)

> df
  id         name
1  1   Jack Smith
2  2   May, Flora
3  3      Jackson
4  4 Baker, Gavin
5  5        Walls

我只需要提取姓氏。对于第一行,姓氏是Smith。对于第二行,姓氏是May,对于第三行,姓氏是Jackson。我如何创建姓氏列,如下所示?

> df
      id         name last_name
    1  1   Jack Smith  Smith
    2  2   May, Flora  May
    3  3      Jackson  Jackson
    4  4 Baker, Gavin  Baker
    5  5        Walls  Walls
drnojrws

drnojrws1#

如果你假设当逗号出现时,姓氏是第一个单词,并且这个人只有一个姓氏,并且它出现在最后,那么这样做:

library(stringr)
# Rows containing a comma are "Last, First": drop the comma and everything
# after it. All other rows: keep the trailing run of non-whitespace characters.
df[["last"]] <- ifelse(
  str_detect(df[["name"]], ","),
  str_replace(df[["name"]], ",.*", ""),
  str_extract(df[["name"]], "\\S*$")
)

您将获得:

id         name    last
1  1   Jack Smith   Smith
2  2   May, Flora     May
3  3      Jackson Jackson
4  4 Baker, Gavin   Baker
5  5        Walls   Walls

如果有逗号,则把逗号及其后面的所有内容替换为空字符串(即只保留逗号之前的部分)。如果没有逗号,则取最后一个单词作为姓氏。

mkh04yzy

mkh04yzy2#

一个 base R 方案:先用 gsub 把名称统一成相同的格式,再提取姓氏

> transform(
+   df,
+   last_name = gsub(
+     "\\w+\\s", "",
+     # inner call rewrites "Last, First" as "First Last", so the surname is always the final word
+     gsub("(\\w+), (\\w+)", "\\2 \\1", name)
+   )
+ )
  id         name last_name
1  1   Jack Smith     Smith
2  2   May, Flora       May
3  3      Jackson   Jackson
4  4 Baker, Gavin     Baker
5  5        Walls     Walls
1yjd4xko

1yjd4xko3#

你可以尝试组合使用两个正则表达式。第一个删除逗号及其右边的所有字符。第二个删除开头的字母及其后的空格(前提是该空格后面紧跟一个大写字母)。

library(dplyr)
 
# Build the example data, then derive last_name with two chained substitutions:
#   1. drop the comma and everything after it ("May, Flora" -> "May");
#   2. drop a leading run of letters plus the space after it when that space is
#      followed by an uppercase letter ("Jack Smith" -> "Smith").
# The class is spelled [A-Za-z] because the range A-z also covers the ASCII
# punctuation between "Z" and "a" ([ \ ] ^ _ `).
# The `x = _` placeholder of the native pipe requires R >= 4.2.
data.frame(id = c(1, 2, 3, 4, 5),
           name = c("Jack Smith", "May, Flora", "Jackson", "Baker, Gavin", "Walls")) |>
    mutate(last_name = gsub(pattern = "[,].*$", replacement = "", x = name) |>
                       gsub(pattern = "^[A-Za-z]+ (?=[A-Z])", replacement = "", x = _, perl = TRUE))

  id         name last_name
1  1   Jack Smith     Smith
2  2   May, Flora       May
3  3      Jackson   Jackson
4  4 Baker, Gavin     Baker
5  5        Walls     Walls
czq61nw1

czq61nw14#

Base R 方法:

# Split each name on an optional comma followed by a space. For names that
# contain a comma ("Last, First") the surname is the first piece; otherwise it
# is the last piece.
df$last_name <- mapply(
  function(parts, has_comma) {
    if (has_comma) parts[1] else tail(parts, 1)
  },
  strsplit(df$name, ",? "),
  grepl(",", df$name)
)
df
#   id         name last_name
# 1  1   Jack Smith     Smith
# 2  2   May, Flora       May
# 3  3      Jackson   Jackson
# 4  4 Baker, Gavin     Baker
# 5  5        Walls     Walls
sqserrrh

sqserrrh5#

我们可以使用 stringr::str_extract 和 if_else:

library(dplyr)
library(stringr)

# Rows containing a comma are "Last, First": take the leading run of letters.
# All other rows: take the trailing run of letters.
# str_extract() yields one string per row, so last_name is a plain character
# column; str_extract_all() would yield a list and produce a list column.
df %>%
    mutate(last_name = if_else(str_detect(name, ","),
                               str_extract(name, "^[A-Za-z]+"),
                               str_extract(name, "[A-Za-z]+$")))

  id         name last_name
1  1   Jack Smith     Smith
2  2   May, Flora       May
3  3      Jackson   Jackson
4  4 Baker, Gavin     Baker
5  5        Walls     Walls
lkaoscv7

lkaoscv76#

这里有一个tidyverse解决方案,它做了一些一般性的假设--真实的数据可能比示例数据更混乱:

library(dplyr)
library(stringr)

# Example data; ids are stored as doubles, names as plain strings.
df <- data.frame(
  id = as.numeric(1:5),
  name = c("Jack Smith", "May, Flora", "Jackson", "Baker, Gavin", "Walls")
)

#' Turn "Last, First" into "First Last"
#'
#' Splits `name` on ", ", reverses the resulting pieces and joins them with a
#' single space. The pieces of the whole input are flattened before joining,
#' so the result is always one string; call it with one name at a time.
#'
#' @param name A character value such as "May, Flora".
#' @return A length-one character vector, e.g. "Flora May".
reformat_name <- function(name) {
  pieces <- unlist(stringr::str_split(name, ", "))
  paste(rev(pieces), collapse = " ")
}

# Work row by row: reformat_name() joins everything it receives into a single
# string, so it has to see one name at a time.
df |>
  rowwise() |>
  mutate(
    # "Last, First" -> "First Last"; names without ", " are kept unchanged.
    name = ifelse(str_detect(name, ", "), reformat_name(name), name),
    # If the (reformatted) name contains a space, the surname is the final
    # space-delimited token; otherwise the whole value is the surname.
    last_name = ifelse(
      str_detect(name, " "),
      trimws(str_extract(name, "\\s[^ ]+$")),
      name
    )
  )
         
#> # A tibble: 5 × 3
#> # Rowwise: 
#>      id name        last_name
#>   <dbl> <chr>       <chr>    
#> 1     1 Jack Smith  Smith    
#> 2     2 Flora May   May      
#> 3     3 Jackson     Jackson  
#> 4     4 Gavin Baker Baker    
#> 5     5 Walls       Walls
nxagd54h

nxagd54h7#

试试看

library(dplyr)
library(stringr)
# Take the first run of word characters that sits directly before a comma or
# the end of the string: "May, Flora" -> "May", "Jack Smith" -> "Smith".
df |>
  mutate(last = str_extract(string = name, pattern = "(\\w+)(?=,|$)"))
  • 输出
id         name    last
1  1   Jack Smith   Smith
2  2   May, Flora     May
3  3      Jackson Jackson
4  4 Baker, Gavin   Baker
5  5        Walls   Walls

相关问题