在R中使用Regex拆分列中的单词

wlsrxk51  于 2023-03-05  发布在  其他
关注(0)|答案(3)|浏览(137)

我有一个假设的数据框(data frame),其中包含 hashtags 和 username 两列。

# Example data: every `hashtags` entry is a single string that holds a
# Python-style list literal of tags; `username` holds one user name per row.
hashtags <- c(
  "['mancity', 'naomicampell']",
  "['PaTvUpdates']",
  "['SputnikV']",
  "['sputnikvaccineregistration', 'UnlockOurCountry', 'AstraZeneca', 'coronaInaua']",
  "['Africa', 'Sinopharm', 'Sinopham', 'vaccine', 'vaccinedeaths']",
  "['Sinopharm', 'COVID']"
)
username <- c("a", "b", "z", "x", "d", "w")

# Column names are spelled out; they match the variable names used above.
twts <- data.frame(hashtags = hashtags, username = username)

我想把这些标签拆分开,从而得到每个标签在 hashtags 列中出现的频率,我该怎么做呢?
我没有可重现的尝试代码可以提供,因为我是正则表达式(regex)的新手。

cwtwac6a

cwtwac6a1#

借助 reticulate 包的 py_eval,我们可以按 Python 的方式解析这些单词:

# Console transcript: reticulate supplies py_eval(), which is used below.
> library(reticulate)

# Each hashtags string is written as a Python list literal, so py_eval() is
# applied to every row to obtain its tags; unlist() flattens the per-row
# results into one vector and table() counts each distinct tag.
# NOTE(review): py_eval() evaluates its input as Python code - only use this
# on data you trust.
> with(twts, table(unlist(lapply(hashtags, py_eval))))

                    Africa                AstraZeneca
                         1                          1
               coronaInaua                      COVID
                         1                          1
                   mancity               naomicampell
                         1                          1
               PaTvUpdates                   Sinopham
                         1                          1
                 Sinopharm                   SputnikV
                         2                          1
sputnikvaccineregistration           UnlockOurCountry
                         1                          1
                   vaccine              vaccinedeaths
                         1                          1

# Single-call variant: toString() joins all rows with ", " and sprintf() wraps
# the result in "[ ]", producing one Python list-of-lists literal that
# py_eval() evaluates once before unlist()/table() count the tags.
# NOTE(review): py_eval() evaluates its input as Python code - only use this
# on data you trust.
> with(twts, table(unlist(py_eval(sprintf("[%s]", toString(hashtags))))))

                    Africa                AstraZeneca
                         1                          1 
               coronaInaua                      COVID
                         1                          1
                   mancity               naomicampell
                         1                          1
               PaTvUpdates                   Sinopham
                         1                          1
                 Sinopharm                   SputnikV
                         2                          1
sputnikvaccineregistration           UnlockOurCountry
                         1                          1
                   vaccine              vaccinedeaths
                         1                          1
7gcisfzg

7gcisfzg2#

使用stringr

library(stringr)

# Count how often each hashtag occurs across all rows.
# 1. Remove every character that is neither a space nor a word character
#    (brackets, quotes, commas). Inside a character class "+" is a literal
#    plus sign, not a quantifier, so it is deliberately not listed in the
#    class; listing it would let stray "+" characters survive the clean-up.
# 2. Split on the single spaces left between tags, flatten the per-row
#    pieces into one vector, and tabulate.
twts$hashtags |>
  str_remove_all("[^ \\w]") |>
  str_split(" ") |>
  unlist() |>
  table()

                    Africa                AstraZeneca 
                         1                          1 
               coronaInaua                      COVID 
                         1                          1 
                   mancity               naomicampell 
                         1                          1 
               PaTvUpdates                   Sinopham 
                         1                          1 
                 Sinopharm                   SputnikV 
                         2                          1 
sputnikvaccineregistration           UnlockOurCountry 
                         1                          1 
                   vaccine              vaccinedeaths 
                         1                          1

如果需要保留标签内部的空格:

# Variant that keeps spaces inside a tag: split each row on the commas that
# separate tags first, then strip everything that is not a space or a word
# character, and let str_squish() trim and collapse the leftover whitespace
# before tabulating.
# Inside a character class "+" is a literal plus sign, not a quantifier, so
# it is deliberately not listed in the class; listing it would keep "+"
# characters in the output.
twts$hashtags |>
  str_split(",") |>
  unlist() |>
  str_remove_all("[^ \\w]") |>
  str_squish() |>
  table()
pexxcrt2

pexxcrt23#

在 base R 中,可以用 gregexpr/regmatches 提取匹配到的标签,用 unlist 把得到的列表展开成一个向量,再用 table 得到频率计数

# Base R frequency count: gregexpr() locates every run of letters, digits or
# underscores in each row, regmatches() extracts those runs, unlist() flattens
# the per-row list and table() counts each distinct tag.
# [[:alnum:]_] is used rather than [A-Za-z] so that tags containing digits or
# underscores (e.g. "COVID19") stay whole instead of being cut at the first
# non-letter. Which characters count as alphanumeric is locale-dependent.
table(unlist(with(
  twts,
  regmatches(hashtags, gregexpr("[[:alnum:]_]+", hashtags))
)))
  • 输出
Africa                AstraZeneca                coronaInaua                      COVID                    mancity 
                         1                          1                          1                          1                          1 
              naomicampell                PaTvUpdates                   Sinopham                  Sinopharm                   SputnikV 
                         1                          1                          1                          2                          1 
sputnikvaccineregistration           UnlockOurCountry                    vaccine              vaccinedeaths 
                         1                          1                          1                          1

或者使用管道(|>)的写法:

# Pipe form of the base R count: extract every run of letters, digits or
# underscores from each row, flatten the per-row list, then tabulate.
# [[:alnum:]_] is used rather than [A-Za-z] so that tags containing digits or
# underscores (e.g. "COVID19") stay whole instead of being cut at the first
# non-letter. Which characters count as alphanumeric is locale-dependent.
with(twts, regmatches(hashtags, gregexpr("[[:alnum:]_]+", hashtags))) |>
  unlist() |>
  table()

相关问题