在 R 中使用 PhantomJS 下载文件

rqqzpn5f  于 11个月前  发布在  其他
关注(0)|答案(1)|浏览(89)

我想使用 webdriver 包(PhantomJS)下载一个文件,但每当我对目标对象执行点击或按下回车时,什么也没有发生。我打算先点击 'Download dos dados',然后点击 'Baixar .csv' 来下载文件。

library(webdriver)

# Asker's attempt, reproduced exactly as posted. According to the question
# text, neither of the two sendKeys() calls below has any visible effect.
url <-'https://idesevis.dee.rs.gov.br/#tab-9023-6'

#webdriver::install_phantomjs() # If it is not installed
pjs <- run_phantomjs()

# Open a session on the port reported by run_phantomjs() and load the page.
ses <- Session$new(port = pjs$port)
ses$go(url)
ses$getUrl()

### Click on 'Download dos dados'
# The element is located through an absolute XPath.
# NOTE(review): sendKeys() is given the literal text 'html' plus key$enter;
# presumably only the Enter key is meant to activate the link -- confirm
# against the webdriver documentation.
search <- ses$findElement(xpath='/html/body/div[2]/nav/div/ul/li[6]/a')
search$sendKeys('html',key$enter)

### Finds and clicks 'Baixar .csv'
# Same approach for the second link; the question reports that this element
# does not end up selected even with the full XPath.
search <- ses$findElement(xpath='/html/body/div[2]/div[3]/div/div[7]/div/div[1]/form/a')
search$sendKeys('html',key$enter)
# Presumably taken to inspect the page state after the two key presses.
ses$takeScreenshot()

字符串
显然,即使使用完整的 XPath,我也没能选中 'Baixar .csv' 这个对象。我之所以使用 webdriver,是因为我无法在自己的笔记本电脑上使用 RSelenium;静态网页抓取在这里也不适用,因为每次访问主 URL 时会话代码都会改变。

z6psavjg

z6psavjg1#

至少对于这个由 Shiny 驱动的特定页面,CSV 下载在浏览器会话之外也能正常工作。因此不必在 JavaScript 中触发下载,而是可以从 CSV 链接中提取 href,并将其传给 download.file()/httr(2)/curl:

library(webdriver)
library(httr2)
library(stringr)
library(readr)

# Overall flow: load the page in a PhantomJS session, activate the
# 'Download dos Dados' tab, read the href of the a#downloadData link, and
# then fetch that URL with httr2 outside the browser session.
url_ <-'https://idesevis.dee.rs.gov.br/#tab-9023-6'

#webdriver::install_phantomjs() # If it is not installed
pjs <- run_phantomjs()

ses <- Session$new(port = pjs$port)
ses$go(url_)
ses$getUrl()
#> [1] "https://idesevis.dee.rs.gov.br/#tab-9023-6"

### Click on 'Download dos dados'
# The tab link is selected with a CSS attribute selector on data-value
# rather than an absolute XPath. The URL printed below is the same as the
# one printed before the key press.
ses$findElement("a[data-value='Download dos Dados']")$sendKeys('html',key$enter)
ses$getUrl()
#> [1] "https://idesevis.dee.rs.gov.br/#tab-9023-6"

### Wait until downloadData element is available and href is set;
### defaults: checkInterval = 100, timeout = 3000;
### find a#downloadData and get download link
ses$waitFor('document.getElementById("downloadData").getAttribute("href")')
#> [1] TRUE

# The outer parentheses print the assigned value. The printed link contains
# a session-specific path segment, so it is read from the live page here
# instead of being hard-coded.
(csv_url <- ses$findElement("a#downloadData")$getAttribute("href"))
#> [1] "https://idesevis.dee.rs.gov.br/session/ca0f03facf886621a9ce4a7c8d6520bc/download/downloadData?w="

### Test url and extract filename from response headers
# A HEAD request is used, so only headers come back (the output shows
# "Body: Empty").
(resp_head <- request(csv_url) |> req_method("HEAD") |> req_perform())
#> <httr2_response>
#> HEAD
#> https://idesevis.dee.rs.gov.br/session/ca0f03facf886621a9ce4a7c8d6520bc/download/downloadData?w=
#> Status: 200 OK
#> Content-Type: text/csv
#> Body: Empty

# Take the content-disposition header, keep the second piece after splitting
# on "=", and strip the double quotes from it.
# NOTE(review): assumes the header contains exactly one "=" followed by the
# quoted file name -- confirm for other endpoints.
filename <- resp_header(resp_head, header = "content-disposition") |> 
  str_split_i("=", 2) |>
  str_remove_all('\\"')
filename
#> [1] "base_idese.csv"

### Fetch CSV
# Passing path = filename makes the response body land on disk (the output
# shows "Body: On disk").
request(csv_url) |> req_perform(path = filename)
#> <httr2_response>
#> GET
#> https://idesevis.dee.rs.gov.br/session/ca0f03facf886621a9ce4a7c8d6520bc/download/downloadData?w=
#> Status: 200 OK
#> Content-Type: text/csv
#> Body: On disk 'body'
# Show only the first three columns of the file info (path, type, size) to
# confirm the file was written.
fs::file_info(filename)[,1:3]
#> # A tibble: 1 × 3
#>   path           type         size
#>   <fs::path>     <fct> <fs::bytes>
#> 1 base_idese.csv file        11.8M

字符串
下载的数据集:

# Read with correct encoding
# The file is read with an explicit ISO-8859-1 locale. The "New names"
# message in the output shows that the first column has an empty header in
# the file and is therefore named `...1` on import.
read_csv("base_idese.csv", locale = locale(encoding = "ISO-8859-1"))
#> New names:
#> • `` -> `...1`
#> Rows: 114720 Columns: 7
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (4): TIPO_UNID, COD, NOME, CATEGORIA
#> dbl (3): ...1, ANO, VALOR
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 114,720 × 7
#>     ...1 TIPO_UNID  COD     NOME            CATEGORIA                  ANO VALOR
#>    <dbl> <chr>      <chr>   <chr>           <chr>                    <dbl> <dbl>
#>  1     1 Municípios 4300059 Água Santa      "Bloco Renda\\Apropriaç…  2013 0.919
#>  2     2 Municípios 4304804 Carlos Barbosa  "Bloco Renda\\Apropriaç…  2013 0.929
#>  3     3 Municípios 4300901 Aratiba         "Bloco Renda\\Apropriaç…  2013 0.746
#>  4     4 Municípios 4310462 Ipiranga do Sul "Bloco Renda\\Apropriaç…  2013 0.924
#>  5     5 Municípios 4322806 Veranópolis     "Bloco Renda\\Apropriaç…  2013 0.835
#>  6     6 Municípios 4321634 Três Arroios    "Bloco Renda\\Apropriaç…  2013 1    
#>  7     7 Municípios 4313334 Nova Ramada     "Bloco Renda\\Apropriaç…  2013 0.741
#>  8     8 Municípios 4304903 Casca           "Bloco Renda\\Apropriaç…  2013 0.793
#>  9     9 Municípios 4314001 Paraí           "Bloco Renda\\Apropriaç…  2013 0.849
#> 10    10 Municípios 4322350 União da Serra  "Bloco Renda\\Apropriaç…  2013 0.843
#> # ℹ 114,710 more rows


由 reprex v2.0.2 创建于 2023-12-27

相关问题