从R中的dataframe列中删除重音符号

d7v8vwbk  于 2023-03-27  发布在  其他
关注(0)|答案(7)|浏览(145)

我有一个名为base的data.table,在这个data.table中有一个term列

class(base$term)
[1] character
length(base$term)
[1] 27486

我可以从一个字符串中去掉重音符号。我可以从一个字符串向量中去掉重音符号。

iconv("Millésime",to="ASCII//TRANSLIT")
[1] "Millesime"
iconv(c("Millésime","boulangère"),to="ASCII//TRANSLIT")
[1] "Millesime" "boulangere"

但是由于某些原因,当我在term列上应用相同的函数时,它不起作用

base$terme[2]
[1] "Millésime"
iconv(base$terme[2],to="ASCII//TRANSLIT")
[1] "MillACsime"

有人知道这是怎么回事吗?

zkure5ic

zkure5ic1#

使用stringi包可能更容易。这样,您就不需要事先检查编码。此外,stringi在操作系统之间是一致的,而iconv则不是。

# data.table() and the := operator come from the data.table package, so it
# has to be attached for this snippet to run on its own.
library(data.table)
library(stringi)

# Sample table with a single column of accented terms.
base <- data.table(terme = c("Millésime", 
                             "boulangère", 
                             "üéâäàåçêëèïîì"))

# Overwrite the column with its "Latin-ASCII" transliteration.
base[, terme := stri_trans_general(str = terme, 
                                   id = "Latin-ASCII")]
 
> base
           terme
1:     Millesime
2:    boulangere
3: ueaaaaceeeiii
tjvv9vkg

tjvv9vkg2#

解决问题的方法:

Encoding(base$terme[2])
[1] "UTF-8"
iconv(base$terme[2],from="UTF-8",to="ASCII//TRANSLIT")
[1] "Millesime"

感谢@nicola

brtdzjyr

brtdzjyr3#

下面是Jeldrik的解决方案针对data.frame修改后的版本。注意,:=是data.table提供的运算符,不能用于基础R的data.frame,因此这里改用普通赋值。

library(stringi)

# Same sample terms, stored in a plain data.frame this time.
base <- data.frame(
  terme = c("Millésime", "boulangère", "üéâäàåçêëèïîì")
)

# Replace the column with its "Latin-ASCII" transliteration.
base[["terme"]] <- stri_trans_general(base[["terme"]], id = "Latin-ASCII")
z8dt9xmd

z8dt9xmd4#

您可以应用此功能

# Remove accent marks from the letters of a character vector.
#
# @param str     Vector to clean. Anything that is not already character is
#                converted with as.character() first.
# @param pattern Accent families to strip: any of "´", "`", "^", "~", "¨",
#                "ç" ("Ç" is treated as "ç"), or one of the "everything"
#                aliases "all", "al", "a", "todos", "t", "to", "tod", "todo".
# @return The input with the selected accented letters replaced by their
#         unaccented counterparts.
rm_accent <- function(str, pattern = "all") {
  if (!is.character(str)) {
    str <- as.character(str)
  }

  pattern <- unique(pattern)
  if (any(pattern == "Ç")) {
    pattern[pattern == "Ç"] <- "ç"
  }

  # The three vectors below are parallel: position k of `marks` selects
  # position k of `accented` and `plain`.
  marks <- c("´", "`", "^", "~", "¨", "ç")
  accented <- c(
    acute = "áéíóúÁÉÍÓÚýÝ",
    grave = "àèìòùÀÈÌÒÙ",
    circunflex = "âêîôûÂÊÎÔÛ",
    tilde = "ãõÃÕñÑ",
    umlaut = "äëïöüÄËÏÖÜÿ",
    cedil = "çÇ"
  )
  plain <- c(
    acute = "aeiouAEIOUyY",
    grave = "aeiouAEIOU",
    circunflex = "aeiouAEIOU",
    tilde = "aoAOnN",
    umlaut = "aeiouAEIOUy",
    cedil = "cC"
  )

  # Any of the aliases means "strip every accent family in a single pass".
  everything_aliases <- c("all", "al", "a", "todos", "t", "to", "tod", "todo")
  if (any(everything_aliases %in% pattern)) {
    return(chartr(paste(accented, collapse = ""), paste(plain, collapse = ""), str))
  }

  # Otherwise translate only the requested families, one after another.
  requested <- which(marks %in% pattern)
  Reduce(
    function(text, k) chartr(accented[k], plain[k], text),
    requested,
    str
  )
}
nkkqxpd9

nkkqxpd95#

三种去除重音符号的方法-如下所示并相互比较。
要使用的数据:

# Download the retired individual-level case file from GitHub. Spell out
# FALSE rather than F, because F is an ordinary variable that can be reassigned.
dtCases <- fread("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/retired_datasets/individual_level/cases_2021_1.csv", stringsAsFactors = FALSE)
dim(dtCases) #  751526     16

基准化:

> system.time(dtCases [, city0 := health_region])
   user  system elapsed 
  0.009   0.001   0.012 
> system.time(dtCases [, city1 := base::iconv (health_region, to="ASCII//TRANSLIT")]) # or ... iconv (health_region, from="UTF-8", to="ASCII//TRANSLIT")
   user  system elapsed 
  0.165   0.001   0.200 
> system.time(dtCases [, city2 := textclean::replace_non_ascii (health_region)])
   user  system elapsed 
  9.108   0.063   9.351 
> system.time(dtCases [, city3 := stringi::stri_trans_general (health_region,id = "Latin-ASCII")])
   user  system elapsed 
   4.34    0.00    4.46

结果:

> dtCases[city0!=city1, city0:city3] %>% unique
                           city0                         city1                         city2                         city3
                          <char>                        <char>                        <char>                        <char>
1:                      Montréal                      Montreal                      Montreal                      Montreal
2:                    Montérégie                    Monteregie                    Monteregie                    Monteregie
3:          Chaudière-Appalaches          Chaudiere-Appalaches          Chaudiere-Appalaches          Chaudiere-Appalaches
4:                    Lanaudière                    Lanaudiere                    Lanaudiere                    Lanaudiere
5:                Nord-du-Québec                Nord-du-Quebec                Nord-du-Quebec                Nord-du-Quebec
6:         Abitibi-Témiscamingue         Abitibi-Temiscamingue         Abitibi-Temiscamingue         Abitibi-Temiscamingue
7: Gaspésie-Îles-de-la-Madeleine Gaspesie-Iles-de-la-Madeleine Gaspesie-Iles-de-la-Madeleine Gaspesie-Iles-de-la-Madeleine
8:                     Côte-Nord                     Cote-Nord                     Cote-Nord                     Cote-Nord

结论:
base::iconv()是最快和首选的方法。在法语单词上测试过。没有在其他语言上测试过。

svdrlsy4

svdrlsy46#

基于jf2017代码,这里是一个tidyverse解决方案:

library(dplyr)
base %>%
  # Replace "term" with its Latin-ASCII transliteration
  # (careful: this overwrites the original "term" column!)
  mutate(term = stringi::stri_trans_general(str = term, id = "Latin-ASCII"))

要应用于数据框中的所有列,请使用

# Run the Latin-ASCII transliteration over every column of the data frame;
# each column is replaced by whatever stri_trans_general() returns for it.
base %>%
  mutate(
    across(
      .cols = everything(),
      .fns = ~ stringi::stri_trans_general(.x, id = "Latin-ASCII")
    )
  )

运行stringi::stri_trans_list()以查看id可以采用的所有可用参数

dced5bon

dced5bon7#

快速,简单,易于修改,无依赖性:

# -----------------------------------------------------------------------------
# Replaces common accented letters with their unaccented equivalents.
#
# The translation is a fixed character-for-character table passed to
# chartr(); characters that are not in the table are left unchanged.
#
# @param s The string (or character vector) to remove diacritics from.
# @return s with every listed accented letter swapped for its plain letter.
# -----------------------------------------------------------------------------
accentless <- function(s) {
  # Position k of `accented` maps to position k of `plain`.
  accented <- "áéóūáéíóúÁÉÍÓÚýÝàèìòùÀÈÌÒÙâêîôûÂÊÎÔÛãõÃÕñÑäëïöüÄËÏÖÜÿçÇ"
  plain <- "aeouaeiouAEIOUyYaeiouAEIOUaeiouAEIOUaoAOnNaeiouAEIOUycC"
  chartr(old = accented, new = plain, x = s)
}

相关问题