R:从页面所含链接指向的子页面中抓取数据

dm7nw8vv  于 2023-01-28  发布在  其他
关注(0)|答案(1)|浏览(120)

我想从 PubMed 网站抓取数据并构建一个数据框。搜索结果页面中包含指向各篇文章子页面的链接,我想从每个子页面中抓取摘要(abstract)文本,但我的代码无法运行,取不到所需的摘要。我在本站搜索过其他相关问题,仍然没能解决。以下是我目前的代码。

library(xml2)
              library(rvest)  
              library(tibble)
              library(dplyr)
              library(tidyverse) 
              

 # PubMed search URL: a percent-encoded query with English/Portuguese language
 # filters. The string is used verbatim as the request URL.
 link <- "https://pubmed.ncbi.nlm.nih.gov/?term=((((((%E2%80%98Food%20Supply%E2%80%99%20(MeSH))%20OR%20%E2%80%98Food%20Storage%E2%80%99%20(MeSH))%20OR%20%E2%80%98Hunger%E2%80%99(MeSH)%20OR%20food%20security%20OR%20food%20insecurity%20OR%20household%20food%20security%20OR%20global%20food%20security)%20OR%20household%20food%20insecurity)))%20AND%20((%E2%80%98Prevalence%E2%80%99%20(MeSH))%20OR%20%E2%80%98Cross-Sectional%20Studies%E2%80%99%20(MeSH)%20OR%20cross-sectional%20study%20OR%20Prevalence%20Studies%20OR%20prevalence%20study%20OR%20Cross-Sectional%20Analyses%20OR%20CrossSectional%20Analysis%20OR%20Cross%20Sectional%20Analysis%20OR%20Cross%20Sectional%20Analyses)&filter=lang.english&filter=lang.portuguese"

 # Download and parse the search-results page once; every column below is read
 # from this parsed document.
 page <- read_html(link)

 # One node per search hit. Each field is looked up inside its own hit with the
 # singular html_node(), which yields a missing node (NA text/attribute) when
 # the field is absent. Selecting every field from the whole page instead gives
 # vectors of different lengths as soon as one hit lacks a field, and
 # data.frame() then fails or pairs values with the wrong article.
 results <- page %>%
   html_nodes(".docsum-content")

 # Article title.
 name <- results %>%
   html_node(".docsum-title") %>%
   html_text()

 # Absolute URL of each article's subpage, built from the title's relative href.
 name_links_synopsis <- results %>%
   html_node(".docsum-title") %>%
   html_attr("href") %>%
   paste0("https://pubmed.ncbi.nlm.nih.gov", .)

 # Full author list.
 authors <- results %>%
   html_node(".full-authors") %>%
   html_text()

 # PubMed identifier.
 PMID <- results %>%
   html_node(".docsum-pmid") %>%
   html_text()

 # Snippet shown under each hit on the results page.
 synopsis <- results %>%
   html_node(".full-view-snippet") %>%
   html_text()

 # One row per search hit; all columns have length(results) entries.
 pubmed <- data.frame(
   name, authors, name_links_synopsis, PMID, synopsis,
   stringsAsFactors = FALSE
 )

  # I create a function to scrape the text of the abstract in every subpage
  
  # Scrape the abstract paragraphs from a single PubMed article page.
  #
  # pubmed_link: URL of one article subpage (e.g. an element of
  #   name_links_synopsis), not the search-results URL.
  # Returns a character vector with the text of every node matching
  #   "#eng-abstract p"; it has length 0 when the page has no such node.
  #
  # The argument is now actually used: it was previously overwritten with the
  # search URL, and the selector was applied to a character vector of links
  # rather than to the parsed page, so the function could never work.
  get_pubmed <- function(pubmed_link) {
    pubmed_page <- read_html(pubmed_link)
    pubmed_abs_tot <- pubmed_page %>%
      html_nodes("#eng-abstract p") %>%
      html_text()
    pubmed_abs_tot
  }
5uzkadbs

5uzkadbs1#

你可以在最后一栏找到每篇文章的摘要。

library(tidyverse)
library(rvest)

# Fetch and parse the PubMed search-results page for the query below.
page <- read_html(
  "https://pubmed.ncbi.nlm.nih.gov/?term=((((((%E2%80%98Food%20Supply%E2%80%99%20(MeSH))%20OR%20%E2%80%98Food%20Storage%E2%80%99%20(MeSH))%20OR%20%E2%80%98Hunger%E2%80%99(MeSH)%20OR%20food%20security%20OR%20food%20insecurity%20OR%20household%20food%20security%20OR%20global%20food%20security)%20OR%20household%20food%20insecurity)))%20AND%20((%E2%80%98Prevalence%E2%80%99%20(MeSH))%20OR%20%E2%80%98Cross-Sectional%20Studies%E2%80%99%20(MeSH)%20OR%20cross-sectional%20study%20OR%20Prevalence%20Studies%20OR%20prevalence%20study%20OR%20Cross-Sectional%20Analyses%20OR%20CrossSectional%20Analysis%20OR%20Cross%20Sectional%20Analysis%20OR%20Cross%20Sectional%20Analyses)&filter=lang.english&filter=lang.portuguese"
)

# Build one tibble row per search hit. Every field is read relative to the
# hit's own ".docsum-content" node, so a field missing from one hit becomes NA
# in that row instead of shifting the other rows.
df <- page %>%
  html_elements(".docsum-content") %>%
  map_dfr(function(hit) {
    # The title node supplies both the visible title and the relative href.
    title_node <- html_element(hit, ".docsum-title")
    tibble(
      title = html_text2(title_node),
      authors = html_text2(html_element(hit, ".full-authors")),
      PMID = html_text2(html_element(hit, ".docsum-pmid")),
      synopsis = html_text2(html_element(hit, ".full-view-snippet")),
      link = str_c(
        "https://pubmed.ncbi.nlm.nih.gov",
        html_attr(title_node, "href")
      )
    )
  })

# Download one article page and return its abstract as a single string.
#
# link: URL of an article page.
# Returns a length-1 character vector: the text of the node matching
#   ".abstract-content.selected"; several matches are joined with a blank line,
#   and NA_character_ is returned when nothing matches.
#
# Always returning exactly one string matters because the caller maps this
# function with map_chr(), which errors on a result of any other length.
get_abstract <- function(link) {
  # Progress output, one line per request.
  cat("Scraping:", link, "\n")
  abstract <- link %>%
    read_html() %>%
    html_elements(".abstract-content.selected") %>%
    html_text2()
  if (length(abstract) == 0) {
    return(NA_character_)
  }
  paste(abstract, collapse = "\n\n")
}

# Visit every article link and store its abstract in a new `abstract` column.
# The result is assigned back to `df`; otherwise the column exists only in the
# printed output and later code reading `abstract` from `df` fails.
df <- df %>%
  mutate(
    abstract = map_chr(link, get_abstract)
  )

# Print the augmented tibble.
df

# A tibble: 10 × 6
   title                                                                                                authors           PMID  synop…¹ link  abstr…²
   <chr>                                                                                                <chr>             <chr> <chr>   <chr> <chr>  
 1 Food Insecurity and Obesity in US Adolescents: A Population-Based Analysis.                          Fleming MA, Kane… 3348… "Preva… http… "Backg…
 2 Food insecurity and mental health during the COVID-19 pandemic.                                      Polsky JY, Gilmo… 3332… "This … http… "Backg…
 3 Household Food Security and Associated Factors among Portuguese Children.                            Silva MG, Machad… 3493… "This … http… "This …
 4 Food Insecurity and Cardiometabolic Markers: Results From the Study of Latino Youth.                 Maldonado LE, So… 3529… "METHO… http… "Objec…
 5 Persistent and Episodic Food Insecurity and Associated Coping Strategies Among College Students.     Mitchell A, Elli… 3618… "OBJEC… http… "Objec…
 6 Food Insecurity: Child Care Programs' Perspectives.                                                  Noerper TE, Elmo… 3499… "BACKG… http… "Backg…
 7 Food in the cold: exploring food security and sovereignty in Whitehorse, Yukon.                      Blom CDB, Steege… 3508… "This … http… "Harsh…
 8 Food insecurity among Finnish private service sector workers: validity, prevalence and determinants. Walsh HM, Nevala… 3506… "OBJEC… http… "Objec…
 9 Food insecurity in baccalaureate nursing students: A cross-sectional survey.                         Cockerham M, Cam… 3386… "METHO… http… "Backg…
10 Household food insecurity and educational outcomes in school-going adolescents in Ghana.             Masa R, Chowa G.  3271… "We me… http… "Objec…
# … with abbreviated variable names ¹​synopsis, ²​abstract

摘要

# Show the abstract of the first article (needs an `abstract` column in `df`).
pull(slice(df, 1), abstract)

“背景:食物不安全和肥胖是影响青少年的重要问题。最近缺乏研究这种关系的数据。本研究利用最近美国青少年的全国代表性样本来研究肥胖和食物安全状况以及其他危险因素之间的关系。方法:使用2007-2016年美国国家健康和营养检查调查的数据,对4777名美国青少年(13-18岁)进行了横断面分析,计算了基于食品安全状况的肥胖患病率,并进行了多变量逻辑回归,以检查与肥胖相关的青少年特征。结果:来自粮食不安全家庭的青少年肥胖患病率显著高于来自粮食安全家庭的青少年,患病率比为1.3(95% CI:1.2-1.5,p<0.0001)。食品不安全与较高的未校正肥胖率相关,比值比为1.4(95% CI:1.2-1.7,p = 0.0002)。在调整潜在混杂因素后,食品不安全不再与肥胖显著相关(OR 1.19,95% CI:1.0-1.4,p = 0.08)。然而,其他因素如黑人、西班牙裔、男性和月收入≤贫困线185%的家庭与肥胖几率增加相关。结论:虽然来自粮食不安全家庭的青少年肥胖患病率高于那些没有粮食不安全家庭的青少年,但在考虑其他风险因素时,没有发现两者之间的关联。关于青少年独立觅食行为的数据可能有助于在未来的工作中澄清这种复杂的关系。”

相关问题