从json文件获取数据到R Dataframe

bwitn5fc  于 2023-06-07  发布在  其他
关注(0)|答案(1)|浏览(122)

什么是将特定JSON文件转换为R Dataframe 的最佳方法?它使用jsonlite包的fromJSON()函数和下面的自定义函数json_to_dataframe(),但我担心当json文件非常大时,此代码的性能问题(在运行时间方面)。有没有一个函数可以读取json文件,我们可以直接使用,而不需要重新处理dataframe?你有什么建议来改进json_to_dataframe()函数吗?

json <- '{
  "14856969": {
    "Run_Start_Time": "2022-04-13 15:00:00",
    "Run_Stop_Time": "2022-04-13 15:11:00",
    "Parameters": {
      "Param 1": [
        [0, "2022-04-13 15:01:00"],
        [1000, "2022-04-13 15:02:00"],
        [2000, "2022-04-13 15:03:00"]
        ],
      "Param 2": [
        [0.23, "2022-04-13 15:01:00"],
        [0.47, "2022-04-13 15:02:00"],
        [2.12, "2022-04-13 15:03:00"]
        ]
    }
  },
  "150": {
    "Run_Start_Time": "2022-04-13 16:00:00",
    "Run_Stop_Time": "2022-04-13 16:11:00",
    "Parameters": {
      "Param 1": [
        [0, "2022-04-13 16:01:00"],
        [1000, "2022-04-13 16:02:00"],
        [2000, "2022-04-13 16:03:00"]
        ],
      "Param 2": [
        [0.23, "2022-04-13 16:01:00"],
        [0.47, "2022-04-13 16:02:00"],
        [2.12, "2022-04-13 16:03:00"]
        ]
    }
  }
}'

# ---- Function to get data.frame ----

json_to_dataframe = 
function(
  preprocessed_data
) {

  values_matrix     =   c()
  runs_list         =   names(preprocessed_data)
  start_time_list   =   c()
  stop_time_list    =   c()
  nb_obs_param_list =   c()
  nb_obs_run_list   =   c()
  for (i in 1:length(runs_list)) {
    start_time_list   =   c(start_time_list, preprocessed_data[[i]]$Run_Start_Time)
    stop_time_list    =   c(stop_time_list, preprocessed_data[[i]]$Run_Stop_Time)
    param_values      =   preprocessed_data[[i]]$Parameters
    nb_obs_param      =   sapply(param_values, nrow)
    nb_obs_param_list =   c(nb_obs_param_list, nb_obs_param)
    nb_obs_run_list   =   c(nb_obs_run_list, sum(nb_obs_param))
    values_matrix     =   rbind(values_matrix, do.call("rbind", param_values))
  }
  
  preprocessed_data <- data.frame("Run" = rep(runs_list, nb_obs_run_list), "Run_Start_Time" = rep(start_time_list, nb_obs_run_list),
                                  "Run_Stop_Time" = rep(stop_time_list, nb_obs_run_list), "Parameter" = rep(names(nb_obs_param_list), nb_obs_param_list),
                                  "Measure_Time" = values_matrix[,2], "Value" = values_matrix[,1])
  preprocessed_data$Run_Start_Time  <- as.POSIXct(preprocessed_data$Run_Start_Time , format = "%Y-%m-%d %H:%M:%S", origin = "1970-01-01")
  preprocessed_data$Run_Stop_Time   <- as.POSIXct(preprocessed_data$Run_Stop_Time , format = "%Y-%m-%d %H:%M:%S", origin = "1970-01-01")
  preprocessed_data$Measure_Time    <- as.POSIXct(preprocessed_data$Measure_Time , format = "%Y-%m-%d %H:%M:%S" , origin = "1970-01-01")
  preprocessed_data$Value <- as.numeric(preprocessed_data$Value)
  
  return( preprocessed_data )
  
}

library(jsonlite)
json_data = fromJSON(json, flatten = TRUE,   simplifyDataFrame = TRUE)
output = json_to_dataframe( json_data )

> output
        Run      Run_Start_Time       Run_Stop_Time Parameter        Measure_Time   Value
1  14856969 2022-04-13 15:00:00 2022-04-13 15:11:00   Param 1 2022-04-13 15:01:00    0.00
2  14856969 2022-04-13 15:00:00 2022-04-13 15:11:00   Param 1 2022-04-13 15:02:00 1000.00
3  14856969 2022-04-13 15:00:00 2022-04-13 15:11:00   Param 1 2022-04-13 15:03:00 2000.00
4  14856969 2022-04-13 15:00:00 2022-04-13 15:11:00   Param 2 2022-04-13 15:01:00    0.23
5  14856969 2022-04-13 15:00:00 2022-04-13 15:11:00   Param 2 2022-04-13 15:02:00    0.47
6  14856969 2022-04-13 15:00:00 2022-04-13 15:11:00   Param 2 2022-04-13 15:03:00    2.12
7       150 2022-04-13 16:00:00 2022-04-13 16:11:00   Param 1 2022-04-13 16:01:00    0.00
8       150 2022-04-13 16:00:00 2022-04-13 16:11:00   Param 1 2022-04-13 16:02:00 1000.00
9       150 2022-04-13 16:00:00 2022-04-13 16:11:00   Param 1 2022-04-13 16:03:00 2000.00
10      150 2022-04-13 16:00:00 2022-04-13 16:11:00   Param 2 2022-04-13 16:01:00    0.23
11      150 2022-04-13 16:00:00 2022-04-13 16:11:00   Param 2 2022-04-13 16:02:00    0.47
12      150 2022-04-13 16:00:00 2022-04-13 16:11:00   Param 2 2022-04-13 16:03:00    2.12
shyt4zoc

shyt4zoc1#

使用purrr库。参见下面的示例:

library(purrr)
library(jsonlite)

# Helper function to process each run
process_run <- function(run_name, run_data) {
  parameters <- run_data$Parameters
  params_df <- map_df(parameters, ~as.data.frame(t(.x), stringsAsFactors = FALSE), .id = "Parameter")
  params_df$Run <- run_name
  params_df$Run_Start_Time <- run_data$Run_Start_Time
  params_df$Run_Stop_Time <- run_data$Run_Stop_Time
  params_df$Measure_Time <- as.POSIXct(params_df$Measure_Time , format = "%Y-%m-%d %H:%M:%S", origin = "1970-01-01")
  params_df$Value <- as.numeric(params_df$Value)
  params_df
}

# Load and process the data
json_data <- fromJSON(json)
output <- map_df(json_data, process_run, .id = "Run")

output

*map_df-是一个purrr函数,它对列表中的每个元素应用一个函数,然后将结果组合成一个 Dataframe 。helper函数process_run应用于数据中的每个运行,为每个运行创建一个 Dataframe ,然后组合它们。这应该比前面的代码更有效。

相关问题