尝试从Web抓取 Dataframe

ergxz8rk  于 2023-05-20  发布在  其他
关注(0)|答案(2)|浏览(135)

我想在 RStudio 中用以下 URL 返回的数据创建一个数据框:https://api.finance.naver.com/siseJson.naver?symbol=005930&requestType=1&startTime=20220312&endTime=20230517&timeframe=day

# The original `rm(list = ls(all=T))` call was removed: clearing the global
# environment from inside a script silently destroys the caller's session
# state, and `T` is an ordinary variable that can be reassigned.

library(jsonlite)
library(dplyr)

url <- "https://api.finance.naver.com/siseJson.naver?symbol=005930&requestType=1&startTime=20220312&endTime=20230517&timeframe=day"

# Read the raw response as a character vector, one element per line.
data <- readLines(url, warn = FALSE)

print(data)

# Remove every literal tab character from each line (fixed-string match,
# no regular expression needed).
data_t <- gsub("\t", "", data, fixed = TRUE)

print(data_t)

最后,数据是这样打印的。

[1] ""                                                                     
  [2] " [['날짜', '시가', '고가', '저가', '종가', '거래량', '외국인소진율'],"
  [3] ""                                                                     
  [4] ""                                                                     
  [5] ""                                                                     
  [6] ""                                                                     
  [7] "[\"20220314\", 70000, 70200, 69600, 70200, 9040993, 51.84],"          
  [8] ""                                                                     
  [9] "[\"20220315\", 69800, 70100, 69500, 69500, 10258562, 51.8],"          
 [10] ""                                                                     
 [11] "[\"20220316\", 70200, 70500, 69700, 70400, 10175750, 51.79],"         
 [12] ""                                                                     
 [13] "[\"20220317\", 71200, 71800, 70900, 71200, 17646315, 51.79],"         
 [14] ""                                                                     
 [15] "[\"20220318\", 70600, 70900, 70200, 70700, 14410038, 51.79],"

如何筛选出有效的数据行并把它们正确地合并起来?
应该怎样编写代码来完成这一步?
请给出从该 URL 生成数据框(data frame)的代码。

new9mtju

new9mtju1#

library(dplyr)
library(stringr)

# Endpoint queried by this example.
url <- "https://api.finance.naver.com/siseJson.naver?symbol=005930&requestType=1&startTime=20220312&endTime=20230517&timeframe=day"

# Fetch the raw response; the result has one element per line of text.
data <- url |> readLines(warn = FALSE)

删除制表符 \t 的同时可以顺便去掉方括号;随后用 strsplit() 把每一行拆分成向量,向量中的每个元素对应将来数据框的一列。

# Drop tabs and square brackets from every line, then split each line on
# ", " so that every field becomes its own vector element.
data2 <- strsplit(gsub("\t|\\[|\\]", "", data), ", ")

让我们从第二行中获取列名,并从数据对象中删除该行

# The second element holds the header fields: trim surrounding whitespace
# and remove the single quotes around each name.
column_names <- str_remove_all(trimws(data2[[2]]), "'")

# Everything except the header element.
data3 <- data2[-2]

删除空行。

# Keep only the elements that contain at least one field.
data4 <- Filter(\(fields) length(fields) != 0, data3)

为每个数据向量添加列名并使用dplyr::bind_rows()将其绑定到data.frame中

# Name every row vector, stack the rows into one tibble, then clean up:
#   * stripping the brackets leaves the trailing comma of the header line
#     attached to the last column name - remove it;
#   * the same trailing comma is attached to the last value of each data
#     row - remove it before converting;
#   * the date is wrapped in literal double quotes - remove them;
#   * every column except the date holds numbers - convert from character.
lapply(data4, setNames, column_names) |>
  bind_rows() |>
  rename_with(\(nm) str_remove(nm, ",\\s*$")) |>
  mutate(
    날짜 = str_remove_all(날짜, "\\\""),
    across(-날짜, \(x) as.numeric(str_remove(x, ",\\s*$")))
  )
#> # A tibble: 294 × 7
#>    날짜      시가  고가  저가  종가   거래량 외국인소진율
#>    <chr>    <dbl> <dbl> <dbl> <dbl>    <dbl>        <dbl>
#>  1 20220314 70000 70200 69600 70200  9040993         51.8
#>  2 20220315 69800 70100 69500 69500 10258562         51.8
#>  3 20220316 70200 70500 69700 70400 10175750         51.8
#>  4 20220317 71200 71800 70900 71200 17646315         51.8
#>  5 20220318 70600 70900 70200 70700 14410038         51.8
#>  6 20220321 70900 71000 69900 69900 11169002         51.8
#>  7 20220322 69900 70500 69900 70300  9402666         51.8
#>  8 20220323 70600 71200 70300 70500 12398025         51.7
#>  9 20220324 69600 70300 69600 69800 37943357         52.0
#> 10 20220325 70100 70200 69600 69800 12986010         51.9
#> # ℹ 284 more rows
mwkjh3gx

mwkjh3gx2#

library(dplyr)

# Read the response directly as comma-separated text, supplying the seven
# column names by hand. Columns read as character are converted with
# readr::parse_number(), the last column is dropped, and the first row is
# discarded.
# NOTE(review): the response lines appear to end with a trailing comma, so
# the dropped last column is presumably the empty extra field, and the
# discarded first row is presumably the header line - confirm against the
# live response.
"https://api.finance.naver.com/siseJson.naver?symbol=005930&requestType=1&startTime=20220312&endTime=20230517&timeframe=day" |>
  readr::read_csv(col_names = c(
    "날짜", "시가", "고가", "저가", "종가", "거래량", "외국인소진율"
  )) |>
  mutate(across(where(is.character), readr::parse_number)) |>
  select(-last_col()) |>
  # slice(-1) drops the first row; unlike slice(2:n()) it does not turn
  # into c(2, 1) (and keep the first row) when only one row is present.
  slice(-1)

结果:

# A tibble: 295 × 7
       날짜  시가  고가  저가  종가   거래량 외국인소진율
      <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>        <dbl>
 1 20220314 70000 70200 69600 70200  9040993         51.8
 2 20220315 69800 70100 69500 69500 10258562         51.8
 3 20220316 70200 70500 69700 70400 10175750         51.8
 4 20220317 71200 71800 70900 71200 17646315         51.8
 5 20220318 70600 70900 70200 70700 14410038         51.8
 6 20220321 70900 71000 69900 69900 11169002         51.8
 7 20220322 69900 70500 69900 70300  9402666         51.8
 8 20220323 70600 71200 70300 70500 12398025         51.7
 9 20220324 69600 70300 69600 69800 37943357         52.0
10 20220325 70100 70200 69600 69800 12986010         51.9

相关问题