javascript Puppeteer 抓取后返回一个空数组

e0bqpujr  于 2023-05-21  发布在  Java
关注(0)|答案(2)|浏览(130)

我正在抓取 https://naamhinaam.com/baby-girl-names-a?page=${pageNumber} 这个网站,但抓取之后 Puppeteer 返回的是一个不含任何值的空数组。下面是我的代码:

// Scrapes names from naamhinaam.com with Puppeteer and serves the collected
// array as JSON from an Express app listening on port 3000.
const puppeteer = require("puppeteer");
const express = require("express");
const cors = require("cors");
const app = express();
app.use(cors());
// Shared result buffer: appended to by the scraper below, returned by GET "/".
let data = [];
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null,
  });
  const page = await browser.newPage();
  // The upper bound is exclusive, so this visits pages 1..41.
  for (let pageNumber = 1; pageNumber < 42; pageNumber++) {
    await page.goto(`https://naamhinaam.com/baby-girl-names-a?page=${pageNumber}`);
    // Fixed 3 s pause before touching the page.
    await page.waitForTimeout(3000);
    // Dismisses the promotional popup; click() rejects if the selector matches nothing.
    await page.click("#promotionalPopup > div > div > div > button > span");
    // Remove the 22nd child of the list container. $eval rejects when no
    // element matches the selector.
    await page.$eval(
      "div.name-suggestion.mt-1 > div > div:nth-child(22)",
      (el) => el.remove()
    );
    // Remove the 43rd child. This runs after the removal above, so it targets
    // the element that was 44th in the original markup.
    await page.$eval(
      "div.name-suggestion.mt-1 > div > div:nth-child(43)",
      (el) => el.remove()
    );
    // Walk child positions 3..53 of the list container.
    for (let i = 3; i < 54; i++) {
      let fullName = "Null";
      // NOTE(review): this condition does not depend on `i`. After the removal
      // above, the next sibling becomes the new nth-child(22), so whenever the
      // container still has at least 22 children this is truthy on every
      // iteration and nothing is ever pushed — likely the cause of the empty
      // array.
      if (await page.$("div.name-suggestion.mt-1 > div > div:nth-child(22)")) {
        continue;
      }          
      // NOTE(review): identical to the check above; presumably nth-child(43)
      // was intended — confirm.
      if (await page.$("div.name-suggestion.mt-1 > div > div:nth-child(22)")) {
        continue;
      }           
      // Wait for the i-th row's name link, then read its text.
      await page.waitForSelector(
        `div.name-suggestion.mt-1 > div > div:nth-child(${i}) > div.nsg__name_meaning > a`
      );
      let element = await page.$(
        `div.name-suggestion.mt-1 > div > div:nth-child(${i}) > div.nsg__name_meaning > a`
      );
      fullName = await page.evaluate((el) => el.textContent, element);
      data.push({ fullName });
    }
    // Logs the cumulative result after every page.
    console.log(data);
  }

  await browser.close();
})();
// Returns whatever has been collected so far; the server starts listening
// without waiting for the scraper above to finish.
app.get("/", (req, res) => {
  res.status(200).json(data);
});
app.listen(3000, () => {
  console.log("App is running...");
});

我在 Puppeteer 中删除了下面这些元素,因为它们包含广告:

// Removes the 22nd and then the 43rd child of the list container (the ad
// blocks, according to the author). $eval rejects if no element matches.
// The second selector is evaluated after the first removal, so it targets the
// element that was 44th in the original markup.
await page.$eval(
          "div.name-suggestion.mt-1 > div > div:nth-child(22)",
          (el) => el.remove()
        );
        await page.$eval(
          "div.name-suggestion.mt-1 > div > div:nth-child(43)",
          (el) => el.remove()
        );

我在这里循环遍历各个页面并获取数据,但最终得到的却是一个空数组。

wecizke3

wecizke31#

假设你想提取的是婴儿的名字及其含义,可以使用下面的代码。我更新了定位器(选择器),并去掉了点击弹出窗口的步骤——我们只是提取页面内容,不需要关闭弹窗。

const puppeteer = require("puppeteer");
const express = require("express");
const cors = require("cors");

const app = express();
app.use(cors());

// Shared result buffer: the scraper appends to it and GET "/" returns whatever
// has been collected so far (the server does not wait for the scrape to end).
const data = [];

const BASE_URL = "https://naamhinaam.com/baby-girl-names-a";
const LAST_PAGE = 42;

/**
 * Extracts one `{ fullName }` record per name link on the page that is
 * currently loaded.
 *
 * @param {import("puppeteer").Page} page - Page already navigated to a listing.
 * @returns {Promise<Array<{ fullName: string }>>} Records formatted as
 *   "<name>, <meaning>".
 * @throws If no `a.nsg__name` element appears before Puppeteer's default
 *   selector timeout.
 */
async function scrapeCurrentPage(page) {
  // Wait for the content itself instead of sleeping for a fixed 3 seconds;
  // this resolves immediately when the links are already in the DOM.
  await page.waitForSelector("a.nsg__name");

  // One round-trip per selector instead of one evaluate() call per element.
  const names = await page.$$eval("a.nsg__name", (els) =>
    els.map((el) => el.textContent)
  );
  const meanings = await page.$$eval("div.nsg__meaning > i", (els) =>
    els.map((el) => el.textContent)
  );

  return names.map((rawName, i) => {
    // The link text contains newlines/tabs from the page markup.
    const name = rawName.replace(/[\n\t]/g, "").trim();
    // Names and meanings are paired by position; fall back to an empty
    // meaning rather than crashing if the two lists differ in length.
    const meaning = i < meanings.length ? meanings[i] : "";
    return { fullName: `${name}, ${meaning}` };
  });
}

(async () => {
  const browser = await puppeteer.launch({
    headless: true,
    defaultViewport: null,
  });
  try {
    const page = await browser.newPage();
    for (let pageNumber = 1; pageNumber <= LAST_PAGE; pageNumber++) {
      await page.goto(`${BASE_URL}?page=${pageNumber}`);
      data.push(...(await scrapeCurrentPage(page)));
    }
    console.log(data);
  } finally {
    // Always release the browser process, even when a page fails.
    await browser.close();
  }
})().catch((err) => {
  // Keep the HTTP server alive; records gathered before the failure stay available.
  console.error("Scraping failed:", err);
});

app.get("/", (req, res) => {
  res.status(200).json(data);
});

app.listen(3000, () => {
  console.log("App is running...");
});

输出

{ fullName: 'Aamuktha, Liberated' },
  { fullName: 'Aanadhitha, Happy one' },
  around 2087 in total
wnavrhmk

wnavrhmk2#

CSS 有一个 :has() 伪类,可以用它来代替删除元素的做法(详见此处的文档)。注意它在 Firefox 中不受支持,但在 Puppeteer 使用的 Chromium 中可以正常工作。所以下面这行代码

// Selects only the div.nsg__list rows that contain a div.nsg__name_meaning
// descendant; rows without one are left out of the result.
let suggestions = await page.$$('div.nsg__list:has(div.nsg__name_meaning)');

会直接得到已经排除了不需要元素的列表。
你试图关闭的弹出窗口并不会妨碍你从页面中获取数据,因此不需要点击它。
page.waitForTimeout() 方法已经过时(obsolete),请改用 page.waitForSelector():

// Resolves once a <body> element is present in the page.
await page.waitForSelector('body');

弹出窗口似乎不会阻挡任何内容,所以你不需要对它做任何处理。
你的 for 循环里还有一个错误:最后一页没有被抓取,所以 pageNumber < 42 应该改为 pageNumber <= 42。
完整代码:

const puppeteer = require("puppeteer");
// Collected records; filled by the scraper below.
let data = [];

// Resource types that are answered with a stub instead of being downloaded.
const BLOCKED_RESOURCE_TYPES = /image|imageset|media|stylesheet|font|script/;

/**
 * Parses the total page count out of the pager text.
 *
 * @param {string} pageInfoText - Text such as "Viewing page 1 of 42".
 * @returns {number} The last page number, or 1 when it cannot be parsed.
 */
function parseLastPage(pageInfoText) {
    const lastPage = Number(pageInfoText.replace('Viewing page', '').split('of').pop().trim());
    return Number.isInteger(lastPage) && lastPage > 0 ? lastPage : 1;
}

/**
 * Reads every name row on the page that is currently loaded.
 *
 * @param {import("puppeteer").Page} page - Page already navigated to a listing.
 * @returns {Promise<Array<{name: string, link: string, meaning: string}>>}
 * @throws If a matched row lacks the name link or the meaning element.
 */
async function scrapeSuggestions(page) {
    await page.waitForSelector('body');
    // :has() keeps only rows that hold a name/meaning pair, and $$eval reads
    // them all in a single round-trip to the browser.
    return page.$$eval('div.nsg__list:has(div.nsg__name_meaning)', (rows) =>
        rows.map((row) => {
            const anchor = row.querySelector('a.nsg__name');
            const meaning = row.querySelector('div.nsg__meaning');
            return {
                name: anchor.textContent.trim(),
                link: anchor.getAttribute('href'),
                meaning: meaning.textContent.trim(),
            };
        })
    );
}

(async () => {
    const browser = await puppeteer.launch({headless: false, defaultViewport: null});
    const t0 = performance.now();

    try {
        const page = await browser.newPage();

        // Skipable start
        await page.setRequestInterception(true);
        page.on('request', (req) => {
            // Another handler may already have resolved this request; calling
            // respond()/continue() again would throw.
            if (req.isInterceptResolutionHandled()) {
                return;
            }
            if (BLOCKED_RESOURCE_TYPES.test(req.resourceType())) {
                req.respond({status: 200, body: 'aborted'});
            } else {
                req.continue();
            }
        });
        // Skipable end

        // relative parts start
        const url = `https://naamhinaam.com/baby-girl-names-a`;
        const gotoSettings = {waitUntil: "load", timeout: 70000};

        // The first page also tells us how many pages there are.
        await page.goto(url, gotoSettings);
        await page.waitForSelector('body');
        const pageInfo = await page.$eval('.page_info', el => el.textContent.trim());
        const lastPage = parseLastPage(pageInfo);

        data.push(...(await scrapeSuggestions(page)));

        // Remaining pages use the same navigation settings as the first one.
        for (let pageNumber = 2; pageNumber <= lastPage; pageNumber++) {
            await page.goto(`${url}?page=${pageNumber}`, gotoSettings);
            data.push(...(await scrapeSuggestions(page)));
        }
        // relative parts end
    } finally {
        // Always release the browser process, even when a page fails.
        await browser.close();
    }

    const t1 = performance.now();
    console.log(data);
    console.log(t1 - t0, 'milliseconds');

})().catch((err) => {
    console.error('Scraping failed:', err);
});

注意:从 // Skipable start 到 // Skipable end 之间的部分会跳过正则表达式中列出的资源类型(图片、媒体、样式表、字体、脚本等)的加载,以加快抓取速度;这部分可以省略。

相关问题