javascript 无法从网站的最内层页面获取标题

frebpwbc  于 2023-01-29  发布在  Java
关注(0)|答案(2)|浏览(130)

我写了下面这个脚本:先从这个网站(website)收集 Province(省份)列下的所有链接,然后逐个访问这些链接以收集 Office(邮局)列下的所有链接,最后从每个目标页面中抓取标题。
脚本在抓取到第一个链接的标题之后就卡住了。我对 Promise 还很陌生,所以找不出问题出在哪里。
这是我目前写出的代码:

const request = require('request');
const cheerio = require('cheerio');

// URL requested by getLinks (the first page of the crawl).
const link = 'https://egpostal.com/en/p/all';
// Prefix prepended to every href scraped from the tables.
const base_link = 'https://egpostal.com/en/';

// Shared accumulators: filled by getLinks and getInnerLinks respectively.
const landing_page_links = [];
const inner_page_links = [];

/**
 * Requests `link` and appends the link found in each `.table tbody tr` row
 * (prefixed with `base_link`) to the shared `landing_page_links` array.
 *
 * @returns {Promise<string[]>} Resolves with `landing_page_links`; rejects
 *   with the request error or with anything thrown while walking the rows.
 */
let getLinks = () => {
    return new Promise((resolve, reject) => {
        request(link, function(error, response, html) {
            // NOTE(review): the HTML is parsed before `error` is checked; on a
            // failed request `html` is presumably undefined here - confirm.
            let $ = cheerio.load(html);
            if (error) return reject(error);
            try {
                $('.table tbody tr').each(function() {
                    landing_page_links.push(base_link + $(this).find("a").attr("href"));
                });
                resolve(landing_page_links);
            } catch (e) {
                reject(e);
            }
        });
    });
};

/**
 * Requests every URL in `links` and appends the link found in each
 * `.table tbody tr` row (prefixed with `base_link`) to the shared
 * `inner_page_links` array.
 *
 * @param {Iterable<string>} links - URLs to request.
 * @returns {Promise<string[]>} Promise for `inner_page_links`.
 */
let getInnerLinks = (links) => {
    return new Promise((resolve, reject) => {
    for (let url of links) {
        request(url, function(error, response, html) {
            let $ = cheerio.load(html);
            if (error) return reject(error);
            try {
                $('.table tbody tr').each(function() {
                    inner_page_links.push(base_link + $(this).find("a").attr("href"));
                });
                // NOTE: this callback runs once per request, but a promise
                // settles only once. The promise is therefore fulfilled by the
                // first response to arrive, while the other requests are still
                // in flight and `inner_page_links` is still incomplete.
                resolve(inner_page_links);
            } catch (e) {
                reject(e);
            }
      });
    }
  });
};

/**
 * Requests every URL in `links` and reads the text of the first
 * `.home-title > h2` element of each response.
 *
 * @param {Iterable<string>} links - URLs to request.
 * @returns {Promise<string>} Promise for a single title.
 */
let FetchTitle = (links) => {
    return new Promise((resolve, reject) => {
    for (let url of links) {
        request(url, function(error, response, html) {
            let $ = cheerio.load(html);
            if (error) return reject(error);
            try {
                // NOTE: a promise settles only once, so only the title of the
                // first response to arrive is delivered; later calls to
                // resolve() are ignored and those titles are dropped.
                resolve($(".home-title > h2").eq(0).text());
            } catch (e) {
                reject(e);
            }
        })
      }
  })
}

// Runs the three stages in sequence and prints the single title produced by
// FetchTitle. No rejection handler is attached anywhere in this chain.
getLinks().then(resultList => {
    getInnerLinks(resultList).then(resultSet => {
        FetchTitle(resultSet).then(title =>{
            console.log(title);
        })
    })
})
j9per5c4

j9per5c41#

首先,request 已被弃用(deprecated),所以你应该避免使用这个包;既然你想使用 Promise,我建议你改用 axios。
老实说,如果我是你,我会使用一个爬虫库,让事情变得更简单、更健壮。但我猜这多少算是一个编码练习,所以我会基本保留你的代码原样,只做必要的调整让它能够工作。
这里的主要问题出在 getInnerLinks:你在 request 的回调中 resolve 了 Promise,而这个回调本身又位于循环之中。只要第一次迭代成功,Promise 就已经被 resolve,脚本会经由主 Promise 链中的 .then() 直接跳到下一步。因此,更合乎逻辑的做法是在循环结束之后再 resolve 你的 Promise。但是考虑到所有请求都是异步的,你还需要等待它们完成。setTimeout() 在这种情况下可以用来做一些测试,但问题是:你要等多久?嗯……当你需要同时处理许多 Promise 时,一个可靠而安全的解决方案是使用 Promise.all()。
另外,为 FetchTitle 使用 Promise 有些大材小用,你可以直接在其中放一个 console.log() 来打印标题。
下面是代码:

const axios = require('axios'),
      cheerio = require('cheerio');

// Entry page requested first, and the prefix prepended to every scraped href.
const link = 'https://egpostal.com/en/p/all';
const base_link = 'https://egpostal.com/en/';

// Shared accumulators filled by getLinks and getInnerLinks respectively.
const landing_page_links = [];
const inner_page_links = [];

/**
 * Fetches the landing page (`link`) and collects the link found in each
 * `.table tbody tr` row, prefixed with `base_link`.
 *
 * The axios promise is returned directly instead of being wrapped in
 * `new Promise`, so a failed request (or a parsing error) rejects the
 * returned promise and reaches the caller's `.catch()`. Previously a request
 * failure was only logged and the returned promise never settled.
 *
 * @returns {Promise<string[]>} Resolves with the shared `landing_page_links`
 *   array after the scraped links have been appended to it.
 */
let getLinks = () =>
  axios.get(link).then(({ data }) => {
    const $ = cheerio.load(data);

    $('.table tbody tr').each(function() {
      const href = $(this).find('a').attr('href');

      // Rows without an anchor would otherwise yield "<base_link>undefined".
      if (href !== undefined) {
        landing_page_links.push(base_link + href);
      }
    });

    console.log('Landing page OK');
    return landing_page_links;
  });

/**
 * Requests every URL in `links` concurrently and collects the link found in
 * each `.table tbody tr` row of each response, prefixed with `base_link`.
 *
 * Each request is best effort: a failing request is logged and contributes no
 * links, so one failing page does not abort the whole crawl. Links are appended
 * to `inner_page_links` only after every request has finished, in the order
 * of `links`, so the result does not depend on response timing.
 *
 * @param {Iterable<string>} links - Page URLs to request.
 * @returns {Promise<string[]>} Resolves with the shared `inner_page_links`
 *   array after the scraped links have been appended to it.
 */
let getInnerLinks = links => {
  const requests = Array.from(links, url =>
    axios.get(url)
      .then(({ data }) => {
        console.log(`Shallow request: ${url}`);

        const $ = cheerio.load(data);
        const hrefs = [];

        $('.table tbody tr').each(function() {
          const href = $(this).find('a').attr('href');

          // Rows without an anchor would otherwise yield "<base_link>undefined".
          if (href !== undefined) {
            hrefs.push(base_link + href);
          }
        });

        return hrefs;
      })
      .catch(error => {
        // Best effort: report the failure and continue with the other pages.
        console.log(error);
        return [];
      })
  );

  return Promise.all(requests).then(pages => {
    for (const hrefs of pages) {
      for (const href of hrefs) {
        inner_page_links.push(href);
      }
    }

    return inner_page_links;
  });
};

/**
 * Requests every URL in `links` concurrently and prints the text of the first
 * `.home-title > h2` element of each response.
 *
 * The per-request promises are returned (combined with `Promise.all`) instead
 * of being left floating, so a caller can wait for all titles to be printed
 * and can also use them. Callers that ignore the return value are unaffected.
 *
 * @param {Iterable<string>} links - Page URLs to request.
 * @returns {Promise<Array<string|null>>} Resolves with one entry per URL, in
 *   the order of `links`: the title text, or `null` when that request failed
 *   (the failure is logged).
 */
let FetchTitle = links =>
  Promise.all(Array.from(links, url =>
    axios.get(url)
      .then(({ data }) => {
        console.log(`Deep request: ${url}`);

        const $ = cheerio.load(data);
        const title = $('.home-title > h2').eq(0).text();

        console.log(title);
        return title;
      })
      .catch(error => {
        // Best effort: report the failure and continue with the other pages.
        console.log(error);
        return null;
      })
  ));

// Run the three scraping stages in order, feeding each stage's result to the
// next one; a rejection from any stage is logged by the final handler.
getLinks()
  .then(getInnerLinks)
  .then(FetchTitle)
  .catch(error => console.log(error));
mo49yndu

mo49yndu2#

这种网页抓取非常适合Python上的scrapy
步骤

$ scrapy startproject egpostal
$ cd egpostal
$ scrapy genspider post-office https://egpostal.com

它将创建文件

D:\temp\test>tree /F
Folder PATH listing for volume DATA
Volume serial number is 16D6-338C
D:.
└───egpostal
    │   scrapy.cfg
    │
    └───egpostal
        │   items.py
        │   middlewares.py
        │   pipelines.py
        │   settings.py
        │   __init__.py
        │
        ├───spiders
        │   │   post_office.py
        │   │   __init__.py
        │   │
        │   └───__pycache__
        │           post_office.cpython-310.pyc
        │           __init__.cpython-310.pyc
        │
        └───__pycache__
                items.cpython-310.pyc
                settings.cpython-310.pyc
                __init__.cpython-310.pyc

在这些文件中,我们只需要修改两个文件(items.py 和 post_office.py)
使用此代码覆盖两个文件

post_office.py

import scrapy

from egpostal.items import EgpostalItem

class PostOfficeSpider(scrapy.Spider):
    """Crawl egpostal.com in three levels: province list -> province -> office.

    One ``EgpostalItem`` is yielded per office page. The province and office
    names are read from the link texts of the listing pages and handed to the
    next callback through ``cb_kwargs``.
    """

    name = 'post-office'
    start_urls = ['https://egpostal.com/en/p/all']
    # Prefix prepended to every href scraped from the listing tables.
    base_link = 'https://egpostal.com/en/'

    def start_requests(self):
        """Request every start URL and route the response to ``url_parse``."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.url_parse)

    # parsing https://egpostal.com/en/p/all level
    def url_parse(self, response):
        """Yield one request per province link found in the landing page."""
        for a in response.xpath('//tbody//tr//a'):
            href = a.xpath('./@href').get()
            if not href:
                # An anchor without an href cannot be followed; skipping it
                # avoids a TypeError on ``base_link + None``.
                continue
            yield scrapy.Request(
                self.base_link + href,
                callback=self.parse_province,
                cb_kwargs={'province': a.xpath('./text()').get()},
            )

    # parsing province level
    def parse_province(self, response, province):
        """Yield one request per office link found in a province page.

        ``province`` is the link text captured by ``url_parse``; it is passed
        on unchanged together with the office's link text.
        """
        for a in response.xpath('//tbody//tr//td//a'):
            href = a.xpath('./@href').get()
            if not href:
                # Same guard as in url_parse: nothing to follow.
                continue
            yield scrapy.Request(
                self.base_link + href,
                callback=self.parse_postal_office,
                cb_kwargs={
                    'office': a.xpath('./text()').get(),
                    'province': province,
                },
            )

    # parsing office level
    def parse_postal_office(self, response, office, province):
        """Yield the item for one office page.

        The title is the stripped text of the first ``h2`` inside
        ``div.home-title``; it is an empty string when the page has no such
        heading (previously that case raised AttributeError on ``None``).
        """
        title = response.xpath(
            '//div[@class="home-title"]//h2/text()'
        ).get(default='').strip()
        item = EgpostalItem()  # a fresh item for every office page
        item['province'] = province
        item['office'] = office
        item['title'] = title
        yield item

items.py

import scrapy

class EgpostalItem(scrapy.Item):
    """Item with the three fields the spider fills for each office page."""

    province = scrapy.Field()
    office = scrapy.Field()
    title = scrapy.Field()

运行并保存到result.json中

如果您成功完成了这些步骤,就可以在终端中看到一些日志信息。(注意:-O 是大写字母 O)

$ scrapy crawl post-office -O result.json
[
{"province": "Ismailia", "office": "Sheikh Zayed", "title": "Sheikh Zayed Post Office"},
{"province": "Ismailia", "office": "Channel", "title": "Channel Post Office"},
{"province": "Ismailia", "office": "Balwah", "title": "Balwah Post Office"},
{"province": "Ismailia", "office": "Eye twig", "title": "Eye twig Post Office"},
{"province": "Ismailia", "office": "Nvich", "title": "Nvich Post Office"},
{"province": "Ismailia", "office": "Manshyet martyrs", "title": "Manshyet martyrs Post Office"},
{"province": "Ismailia", "office": "Fanara", "title": "Fanara Post Office"},
{"province": "Ismailia", "office": "Evacuation camp", "title": "Evacuation camp Post Office"},
{"province": "Dakahlia", "office": "Nqath", "title": "Nqath Post Office"},
{"province": "Dakahlia", "office": "Kafr El Allam", "title": "Kafr El Allam Post Office"},
{"province": "Alexandria", "office": "San Stefano", "title": "San Stefano Post Office"},
{"province": "Beheira", "office": "White housing", "title": "White housing Post Office"},
{"province": "Beheira", "office": "Kvraldoar fabric", "title": "Kvraldoar fabric Post Office"},
{"province": "Beheira", "office": "Kvraldoarveray", "title": "Kvraldoarveray Post Office"},
{"province": "Beheira", "office": "Kvraldoar", "title": "Kvraldoar Post Office"},
{"province": "Beheira", "office": "Palm Marine", "title": "Palm Marine Post Office"},
{"province": "Luxor", "office": "Abu generosity", "title": "Abu generosity Post Office"},
{"province": "Luxor", "office": "Madamoud", "title": "Madamoud Post Office"},
{"province": "Luxor", "office": "Alacalth", "title": "Alacalth Post Office"},
{"province": "New Valley", "office": "Azab palace", "title": "Azab palace Post Office"},
{"province": "New Valley", "office": "Almosah", "title": "Almosah Post Office"},
{"province": "New Valley", "office": "Tnadh", "title": "Tnadh Post Office"},
{"province": "New Valley", "office": "Mott sub", "title": "Mott sub Post Office"},
{"province": "New Valley", "office": "Alhendao", "title": "Alhendao Post Office"},
{"province": "New Valley", "office": "Nasser Revolution", "title": "Nasser Revolution Post Office"},
{"province": "New Valley", "office": "West talented", "title": "West talented Post Office"},
{"province": "New Valley", "office": "Farafra", "title": "Farafra Post Office"},
{"province": "New Valley", "office": "Enlightening", "title": "Enlightening Post Office"},
{"province": "New Valley", "office": "Kalamoon", "title": "Kalamoon Post Office"},
{"province": "New Valley", "office": "Winepress Dakhla", "title": "Winepress Dakhla Post Office"},
{"province": "New Valley", "office": "Adult", "title": "Adult Post Office"},
{"province": "New Valley", "office": "Palace Dakhla Center", "title": "Palace Dakhla Center Post Office"},
{"province": "New Valley", "office": "Bulaq New Valley", "title": "Bulaq New Valley Post Office"},
{"province": "New Valley", "office": "New inflows Center", "title": "New inflows Center Post Office"},
{"province": "New Valley", "office": "Court", "title": "Court Post Office"},
{"province": "New Valley", "office": "Emerging country", "title": "Emerging country Post Office"},
{"province": "New Valley", "office": "Paris", "title": "Paris Post Office"},
{"province": "New Valley", "office": "Mott", "title": "Mott Post Office"},
{"province": "New Valley", "office": "\u0627\u0644\u062e\u0627\u0631\u062c\u0629", "title": "\u0627\u0644\u062e\u0627\u0631\u062c\u0629 Post Office"},
{"province": "Aswan", "office": "Npan tribal", "title": "Npan tribal Post Office"},
{"province": "Aswan", "office": "Nag Alamuesat", "title": "Nag Alamuesat Post Office"},
{"province": "Aswan", "office": "Mahamid", "title": "Mahamid Post Office"},
{"province": "Aswan", "office": "Mansuriya tribal", "title": "Mansuriya tribal Post Office"},
{"province": "Aswan", "office": "Alklh", "title": "Alklh Post Office"},
{"province": "Aswan", "office": "Alcajoj", "title": "Alcajoj Post Office"},
{"province": "Aswan", "office": "Waterfall", "title": "Waterfall Post Office"},
{"province": "Aswan", "office": "Seven East", "title": "Seven East Post Office"},
{"province": "Aswan", "office": "Seven West", "title": "Seven West Post Office"},
{"province": "Aswan", "office": "Gray sea", "title": "Gray sea Post Office"},
{"province": "Aswan", "office": "Alrdesah", "title": "Alrdesah Post Office"},
{"province": "Aswan", "office": "Khattara Aswan", "title": "Khattara Aswan Post Office"},
{"province": "Aswan", "office": "Aldjaafarh", "title": "Aldjaafarh Post Office"},
{"province": "Aswan", "office": "Albesalah", "title": "Albesalah Post Office"},
{"province": "Aswan", "office": "Akulait", "title": "Akulait Post Office"},
{"province": "Aswan", "office": "Edfu East", "title": "Edfu East Post Office"},
{"province": "Aswan", "office": "Edfu", "title": "Edfu Post Office"},
{"province": "Aswan", "office": "Edfu tribal", "title": "Edfu tribal Post Office"},
{"province": "Aswan", "office": "Abu Rish tribal", "title": "Abu Rish tribal Post Office"},
{"province": "Aswan", "office": "Aswan", "title": "Aswan Post Office"},
{"province": "Red Sea", "office": "Abu ashes", "title": "Abu ashes Post Office"},
{"province": "Red Sea", "office": "Hurghada sub", "title": "Hurghada sub Post Office"},
{"province": "Red Sea", "office": "Or the new Huwaitat", "title": "Or the new Huwaitat Post Office"},
{"province": "Red Sea", "office": "El Gouna tourism", "title": "El Gouna tourism Post Office"},
{"province": "Red Sea", "office": "Red Sea passage", "title": "Red Sea passage Post Office"},
{"province": "Red Sea", "office": "Ras Gharib Petroleum", "title": "Ras Gharib Petroleum Post Office"},
{"province": "Red Sea", "office": "Shalateen", "title": "Shalateen Post Office"},
{"province": "Red Sea", "office": "Safaga _ subsidiary", "title": "Safaga _ subsidiary Post Office"},
{"province": "Red Sea", "office": "Marsa Alam", "title": "Marsa Alam Post Office"},
{"province": "Red Sea", "office": "A second short", "title": "A second short Post Office"},
{"province": "Red Sea", "office": "Aweyna Red Sea", "title": "Aweyna Red Sea Post Office"},
{"province": "Red Sea", "office": "Safaga port", "title": "Safaga port Post Office"},
{"province": "Red Sea", "office": "Ras Gharib sub", "title": "Ras Gharib sub Post Office"},
{"province": "Red Sea", "office": "Hurghada port", "title": "Hurghada port Post Office"},
{"province": "Red Sea", "office": "Directorate of Health", "title": "Directorate of Health Post Office"},
{"province": "Red Sea", "office": "Ahamraoyen", "title": "Ahamraoyen Post Office"},
{"province": "Red Sea", "office": "Safaga", "title": "Safaga Post Office"},
{"province": "Red Sea", "office": "Short of the Red Sea", "title": "Short of the Red Sea Post Office"},
{"province": "Red Sea", "office": "Ras Gharib", "title": "Ras Gharib Post Office"},
{"province": "Red Sea", "office": "Hurghada", "title": "Hurghada Post Office"},
{"province": "Sohag", "office": "Majabrh", "title": "Majabrh Post Office"},
{"province": "Sohag", "office": "Ketkath", "title": "Ketkath Post Office"},
{"province": "Sohag", "office": "Dar es Salaam Peace Center", "title": "Dar es Salaam Peace Center Post Office"},
{"province": "Sohag", "office": "Ahlgrezac", "title": "Ahlgrezac Post Office"},
{"province": "Sohag", "office": "El Usayrat established center", "title": "El Usayrat established center Post Office"},
{"province": "Sohag", "office": "Godmother Abydos", "title": "Godmother Abydos Post Office"},
{"province": "Sohag", "office": "Asalaa", "title": "Asalaa Post Office"},
{"province": "Sohag", "office": "Tleihat", "title": "Tleihat Post Office"},
{"province": "Sohag", "office": "Sheikh Marzouq", "title": "Sheikh Marzouq Post Office"},
{"province": "Sohag", "office": "Alsoamah West", "title": "Alsoamah West Post Office"},
{"province": "Sohag", "office": "Zouk East", "title": "Zouk East Post Office"},
{"province": "Sohag", "office": "Alajabirat", "title": "Alajabirat Post Office"},
{"province": "Sohag", "office": "Algelawih", "title": "Algelawih Post Office"},
{"province": "Sohag", "office": "Belina", "title": "Belina Post Office"},
{"province": "Sohag", "office": "Albarba", "title": "Albarba Post Office"},
{"province": "Sohag", "office": "Batakh", "title": "Batakh Post Office"},
{"province": "Sohag", "office": "Adva", "title": "Adva Post Office"},
{"province": "Sohag", "office": "Akhmim", "title": "Akhmim Post Office"},
{"province": "Sohag", "office": "Sohag", "title": "Sohag Post Office"},
{"province": "Sohag", "office": "Sohag sub", "title": "Sohag sub Post Office"},
{"province": "Qena", "office": "Reseda", "title": "Reseda Post Office"},
{"province": "Qena", "office": "Dandara", "title": "Dandara Post Office"},
{"province": "Qena", "office": "The main Dishna", "title": "The main Dishna Post Office"},
{"province": "Qena", "office": "Bahgoura", "title": "Bahgoura Post Office"},
{"province": "Qena", "office": "Children Amr", "title": "Children Amr Post Office"},
{"province": "Qena", "office": "El Usayrat", "title": "El Usayrat Post Office"},
{"province": "Qena", "office": "Halfaya nautical", "title": "Halfaya nautical Post Office"},
{"province": "Qena", "office": "Asamta", "title": "Asamta Post Office"},
{"province": "Qena", "office": "Manor Nagagerh", "title": "Manor Nagagerh Post Office"},
{"province": "Qena", "office": "Rahmaniyah tribal country b", "title": "Rahmaniyah tribal country b Post Office"},
{"province": "Qena", "office": "Home", "title": "Home Post Office"},
{"province": "Qena", "office": "Unguarded Qena", "title": "Unguarded Qena Post Office"},
{"province": "Qena", "office": "Beekeeper", "title": "Beekeeper Post Office"},
{"province": "Qena", "office": "Supervising", "title": "Supervising Post Office"},
{"province": "Qena", "office": "Abnoud", "title": "Abnoud Post Office"},
{"province": "Qena", "office": "Abu Manna nautical", "title": "Abu Manna nautical Post Office"},
{"province": "Qena", "office": "Abu Tesht main", "title": "Abu Tesht main Post Office"},
{"province": "Qena", "office": "Abu Shusha", "title": "Abu Shusha Post Office"},
{"province": "Qena", "office": "The main Qena", "title": "The main Qena Post Office"},
{"province": "Qena", "office": "Arabs Sabha", "title": "Arabs Sabha Post Office"},
{"province": "Luxor", "office": "Navigation", "title": "Navigation Post Office"},
{"province": "Luxor", "office": "Sidi Abu pilgrims", "title": "Sidi Abu pilgrims Post Office"},
{"province": "Luxor", "office": "Imari facility", "title": "Imari facility Post Office"},
{"province": "Luxor", "office": "Qurna", "title": "Qurna Post Office"},
{"province": "Luxor", "office": "Karnak Luxor", "title": "Karnak Luxor Post Office"},
{"province": "Luxor", "office": "Baghdadi", "title": "Baghdadi Post Office"},
{"province": "Luxor", "office": "Blindness", "title": "Blindness Post Office"},
{"province": "Luxor", "office": "Tourism Market", "title": "Tourism Market Post Office"},
{"province": "Luxor", "office": "Zinnia tribal", "title": "Zinnia tribal Post Office"},
{"province": "Luxor", "office": "Tod", "title": "Tod Post Office"},
{"province": "Luxor", "office": "Aloqasrata Airport", "title": "Aloqasrata Airport Post Office"},
{"province": "Luxor", "office": "Manshyet Nuba", "title": "Manshyet Nuba Post Office"},
{"province": "Luxor", "office": "El Bayadeya", "title": "El Bayadeya Post Office"},
{"province": "Luxor", "office": "Tribal Qamola", "title": "Tribal Qamola Post Office"},
{"province": "Luxor", "office": "Lentoids", "title": "Lentoids Post Office"},
{"province": "Luxor", "office": "Dabayaa", "title": "Dabayaa Post Office"},
{"province": "Cairo", "office": "Cairo Sporting Club", "title": "Cairo Sporting Club Post Office"},
{"province": "Cairo", "office": "_ Second Abbasid", "title": "_ Second Abbasid Post Office"},
{"province": "Luxor", "office": "Luxor", "title": "Luxor Post Office"},
{"province": "Cairo", "office": "Ministry of Tourism", "title": "Ministry of Tourism Post Office"},
{"province": "Cairo", "office": "Ministry of Foreign Affairs", "title": "Ministry of Foreign Affairs Post Office"},
{"province": "Cairo", "office": "Cairo traffic", "title": "Cairo traffic Post Office"},
{"province": "Cairo", "office": "Tenth District", "title": "Tenth District Post Office"},
{"province": "Cairo", "office": "Republican Palace dome", "title": "Republican Palace dome Post Office"},
{"province": "Cairo", "office": "Ministry of Education", "title": "Ministry of Education Post Office"},
{"province": "Cairo", "office": "Ramses", "title": "Ramses Post Office"},
{"province": "Cairo", "office": "Divans", "title": "Divans Post Office"},
{"province": "Cairo", "office": "Faggala", "title": "Faggala Post Office"},
{"province": "Cairo", "office": "Garden City", "title": "Garden City Post Office"},
{"province": "Cairo", "office": "Mohamed Farid", "title": "Mohamed Farid Post Office"},
{"province": "Cairo", "office": "Abbasid", "title": "Abbasid Post Office"},
{"province": "Cairo", "office": "Parliament", "title": "Parliament Post Office"},
{"province": "Cairo", "office": "Television", "title": "Television Post Office"},
{"province": "Cairo", "office": "Liberation complex", "title": "Liberation complex Post Office"},
{"province": "Cairo", "office": "Ministries", "title": "Ministries Post Office"},
{"province": "Cairo", "office": "Bab El Louk", "title": "Bab El Louk Post Office"},
{"province": "Cairo", "office": "Cairos main", "title": "Cairos main Post Office"},
{"province": "Giza", "office": "City Arts", "title": "City Arts Post Office"},
{"province": "Giza", "office": "Central Ahram", "title": "Central Ahram Post Office"},
{"province": "Giza", "office": "Student", "title": "Student Post Office"},
{"province": "Giza", "office": "Western Urban", "title": "Western Urban Post Office"},
{"province": "Giza", "office": "Queen", "title": "Queen Post Office"},
{"province": "Giza", "office": "Urban East", "title": "Urban East Post Office"},
{"province": "Giza", "office": "Giza station", "title": "Giza station Post Office"},
{"province": "Giza", "office": "Mohammed Island", "title": "Mohammed Island Post Office"},
{"province": "Giza", "office": "Recruitment Giza", "title": "Recruitment Giza Post Office"},
{"province": "Giza", "office": "Provision of Giza", "title": "Provision of Giza Post Office"},
{"province": "Giza", "office": "Munib", "title": "Munib Post Office"},
{"province": "Giza", "office": "Teresa brief", "title": "Teresa brief Post Office"},
{"province": "Giza", "office": "Giza fourth", "title": "Giza fourth Post Office"},
{"province": "Giza", "office": "Paternoster Macki", "title": "Paternoster Macki Post Office"},
{"province": "Giza", "office": "Spring Gizy", "title": "Spring Gizy Post Office"},
{"province": "Giza", "office": "Sunday market", "title": "Sunday market Post Office"},
{"province": "Giza", "office": "Tourism Pyramid", "title": "Tourism Pyramid Post Office"},
{"province": "Giza", "office": "Giza second", "title": "Giza second Post Office"},
{"province": "Giza", "office": "Giza first", "title": "Giza first Post Office"},
... removed
{"province": "Assiut", "office": "Recruitment of Assiut", "title": "Recruitment of Assiut Post Office"}
]

相关问题