如何从csv/dataframe列中的字符串解析数组的数组

bkkx9g8r  于 2023-03-10  发布在  其他
关注(0)|答案(5)|浏览(179)

我有一个csv文件,其中一列包含numpy数组。读取该csv文件时,该列会是字符类型,因为整个数组都被包裹在字符串中。我想将它解析到一个单独的 Dataframe 中来分析数据。

输入数据

csv格式:

first_column,second_column
a,"[[1,2],[3,4]]"
b,"[[5,6],[7,8]]"
c,"[[9,10],[11,12]]"

作为 Dataframe :

# Sample data: second_column stores each nested array as a plain string.
df <- data.frame(
  first_column = c("a", "b", "c"),
  second_column = c("[[1,2],[3,4]]", "[[5,6],[7,8]]", "[[9,10],[11,12]]")
)

我所尝试的

因为我不知道有任何直接解析函数可以从字符串中提取数组,所以我开始做字符串操作。
删除外部[]字符:

> df %>% mutate(second_column = str_replace_all(second_column, c("^\\[" = "","]$" = "")))
  first_column  second_column
1            a    [1,2],[3,4]
2            b    [5,6],[7,8]
3            c [9,10],[11,12]

不过,从现在开始我不知道该怎么办了。

预期输出

最终生成的 Dataframe 应如下所示:

col_1 col_2
1     1     2
2     3     4
3     5     6
4     7     8
5     9    10
6    11    12

请注意,真实的 Dataframe 中有更多的列和更多的行

kjthegm6

kjthegm61#

将出现的 ],[ 替换为换行符,将方括号替换为空格,然后使用read.table读取。

# Same three steps written as nested calls, innermost first:
#   1. gsub()   - turn every "],[" separator into a line break, so each inner
#                 pair ends up on its own line;
#   2. chartr() - overwrite the remaining "[" and "]" with spaces;
#   3. read.table() - parse the comma-separated lines into a data frame.
read.table(
  text = chartr(
    "[]", "  ",
    gsub("\\],\\[", "\n", df$second_column)
  ),
  sep = ","
)

给出:

V1 V2
1  1  2
2  3  4
3  5  6
4  7  8
5  9 10
6 11 12
klr1opcd

klr1opcd2#

  • 一种 base R(基础 R)的变体可以是:
# Blank out every "[", "]" and "," so that only the numbers remain
cleaned <- gsub("[][,]", " ", df$second_column)
# cleaned <- chartr("[],", "   ", df$second_column) # alternative

# Each former "],[" is now a run of three spaces: split there and flatten
pairs <- unlist(strsplit(cleaned, "   ", fixed = TRUE))
# pairs <- sub("   ", "\n", cleaned, fixed = TRUE) # alternative

# Every element holds one whitespace-separated pair; parse them into columns
read.table(text = pairs)
#  V1 V2
#1  1  2
#2  3  4
#3  5  6
#4  7  8
#5  9 10
#6 11 12

或使用trimws

# Strip the outer "[[" / "]]" from each string. The `whitespace` argument is a
# regex character class; "[][]" matches both "]" and "[".
pairs <- trimws(df$second_column, whitespace = "[][]")

# Split on the literal "],[" so that each element is a single "a,b" pair
pairs <- unlist(strsplit(pairs, "],[", fixed = TRUE))
# pairs <- sub("],[", "\n", pairs, fixed = TRUE) # alternative

# header = FALSE is required: read.csv() defaults to header = TRUE and would
# otherwise consume the first pair ("1,2") as column names, losing a data row.
read.csv(text = pairs, header = FALSE)
9gm1akwq

9gm1akwq3#

一种基础 R(base R)方法,可处理给定列中任意数量的行。

# Parse every "[[a,b],[c,d],...]" string in df$second_column into one row per
# inner pair, returning a numeric data frame with columns col_1 and col_2.

# Split each cell at the literal "],[" boundaries and flatten right away.
# Flattening here (instead of relying on sapply() simplification) keeps the
# result correct when cells contain different numbers of inner arrays.
inner <- unlist(strsplit(df$second_column, "],[", fixed = TRUE))

# Remove the leftover square brackets, leaving plain "a,b" strings.
inner <- gsub("[][]", "", inner)

# One row per pair; convert the character matrix to numeric in place.
values <- do.call(rbind, strsplit(inner, ",", fixed = TRUE))
storage.mode(values) <- "double"

setNames(as.data.frame(values), c("col_1", "col_2"))
  col_1 col_2
1     1     2
2     3     4
3     5     6
4     7     8
5     9    10
6    11    12
tnkciper

tnkciper4#

下面是一个使用tidyverse的取巧(hacky)解决方案:

# Rebuild the example data so this snippet can be run on its own.
df <- data.frame(first_column  = c("a","b","c"),
                 second_column = c("[[1,2],[3,4]]","[[5,6],[7,8]]","[[9,10],[11,12]]"))

library(tidyverse)

# Overall idea: flatten every cell to "a,b,c,d", spread it into four columns,
# then fold the second pair (col_3/col_4) back under col_1/col_2 through a
# long -> wide round trip.
# NOTE: the four hard-coded column names mean this expects exactly two pairs
# (four numbers) per cell. No numeric conversion is performed on the values.
df %>% 
  # Drop one leading "[" and one trailing "]", then strip all remaining brackets.
  mutate(second_column = str_replace_all(second_column, c("^\\[" = "","]$" = "")),
         second_column = gsub("\\[|\\]", "", second_column)) %>% 
  # Split the comma-separated string into four positional columns.
  separate(second_column, into = c("col_1", "col_2", "col_3", "col_4"), sep = ",") %>% 
  # Go long: one row per (first_column, name, value).
  pivot_longer(-first_column) %>% 
  # Relabel the second pair so it shares column names with the first pair.
  mutate(name = case_when(name == "col_3" ~ "col_1",
                          name == "col_4" ~ "col_2", 
                          .default = name)) %>% 
  select(-first_column) %>% 
  # Names are no longer unique, so gather the values of each name into a list...
  pivot_wider(names_from = name, values_from = value, values_fn = list) %>% 
  # ...and expand those list columns back into one row per pair.
  unnest(cols = c(col_1, col_2))
  
#> # A tibble: 6 × 2
#>   col_1 col_2
#>   <chr> <chr>
#> 1 1     2    
#> 2 3     4    
#> 3 5     6    
#> 4 7     8    
#> 5 9     10   
#> 6 11    12
ftf50wuq

ftf50wuq5#

使用reticulate中py_eval的技巧

library(reticulate)
# Join all cells into one string, then merge the per-row boundaries
# ("]], [[") so the whole column reads as a single Python list of pairs.
# CAUTION: py_eval() evaluates the string as a Python expression, so only use
# this on data you trust.
joined_cells <- toString(df$second_column)
py_literal <- gsub("]], [[", "],[", joined_cells, fixed = TRUE)

# Stack the evaluated pairs row by row and convert to a data frame.
as.data.frame(do.call(rbind, py_eval(py_literal)))

或者使用jsonlite中fromJSON的另一个技巧

library(jsonlite)
# Join all cells into one string and merge the per-row boundaries ("]], [[")
# so the column becomes a single JSON array of pairs.
joined_cells <- toString(df$second_column)
json_pairs <- gsub("]], [[", "],[", joined_cells, fixed = TRUE)

# Wrap the text in one more pair of brackets, parse it, and take the first
# slice of the parsed array before converting to a data frame.
parsed <- fromJSON(sprintf("[%s]", json_pairs))
as.data.frame(parsed[1, , ])

输出

V1 V2
1  1  2
2  3  4
3  5  6
4  7  8
5  9 10
6 11 12

相关问题