NodeJS 如何使用 Puppeteer 在选中单选按钮(radio input)筛选项后抓取数据

vxqlmq5t  于 2023-06-22  发布在  Node.js
关注(0)|答案(1)|浏览(122)

我正在尝试使用 puppeteer 从这个网站(text)抓取数据。我想做的是:当筛选器中的单选按钮“免费”被选中之后,获取页面上显示的信息,但我还没有成功。下面是我目前的做法,不确定这样做是否正确。任何帮助我都将不胜感激!

const puppeteer = require("puppeteer");
const fs = require('fs');

const extracteventData = async (url,browser) => {
    try{
        const eventData = {}
        const page = await browser.newPage()
        await page.goto(url)
        await page.waitForSelector('label.ChoiceListItem_choice__hikcw', { visible: true });
        await page.click('label.ChoiceListItem_choice__hikcw');
        

       eventData['Free'] = await page.$$eval('.ChoiceListItem_label__hikcw', (elements) => {
            const data = [];
          
            elements.forEach((element) => {
              const title = element.querySelector('.event-title').innerText;
              const description = element.querySelector('.event-details__main-inner').innerText;
          
              data.push({ titulo, descripcion });
            });
          
            return data;
          });
          ...
mbyulnm0

mbyulnm01#

从这个 url 开始,您必须首先检查是否已经有被选中的筛选项;如果有,则先重置选择:

// Reset any previously applied filters. The button matched by this selector is
// looked up once and clicked only if it exists. page.$() accepts just a
// selector (it has no `visible` option), and it already returns the element
// handle, so no second lookup is needed.
const resetBtn = await page.$('span[class$=_desktop] *[class^=Button_root]');
if (resetBtn !== null) {
    await resetBtn.click();
}

注意:在属性选择器中,^= 表示“以……开头”,$= 表示“以……结尾”。
然后使用 XPath 找到这样的 label:它包含一个文本为 Gratis 的 span 标签;点击它,并等待相关部分更新,如下所示:

// Wait for the <span> with the exact text 'Gratis' nested inside a <label>
// to become visible, then click it.
const gratisLabel = await page.waitForSelector(
    "xpath/" + "//label/descendant::span[text()='Gratis']",
    { visible: true },
);
await gratisLabel.click();

// Wait until an element matching the results-list selector is present
// (presumably the list re-renders after the filter click; confirm on the live page).
await page.waitForSelector('ul[class^=search-main-content]')

此时页面已经应用了所需的筛选条件,可以从中获取所有相关的链接。

// Collect the href attribute of every anchor matched by the card selector.
// (The original statement was missing the closing parenthesis of $$eval.)
let eventLinks = await page.$$eval('div > div[class$=desktop-card] .event-card-details a', el => el.map( x => x.getAttribute('href')));

然后你就可以通过每个链接来获得详细信息。
以下是所有这些的一个例子:

const puppeteer = require('puppeteer');
const fs = require("fs");
const fsp = fs.promises;

/**
 * End-to-end example: opens the Eventbrite search page, applies a set of
 * filters by clicking their labels, walks every result page to collect event
 * links, visits each event to read its details and writes them to
 * `events.json`.
 *
 * Relies on `puppeteer` and `fsp` (fs.promises) declared above this block.
 * Errors are logged, and the browser is always closed in `finally`, so a
 * failure does not leave a Chromium process running.
 */
(async () => {
    let browser;

    try {
        browser = await puppeteer.launch({ headless: false });
        const page = await browser.newPage();
        await page.setViewport({ width: 1920, height: 1080 });

        const url = `https://www.eventbrite.es/d/spain/all-events/?page=1`;
        await page.goto(url, { waitUntil: "load", timeout: 15000 });

        // Clear any previously selected filters. page.$() takes only a
        // selector and returns the handle (or null), so one lookup is enough.
        const resetBtn = await page.$('span[class$=_desktop] *[class^=Button_root]');
        if (resetBtn !== null) {
            await resetBtn.click();
        }

        // Click the <span> inside a <label> whose text equals `txt` exactly.
        const clickLabel = async (txt) => {
            const selector = "xpath/" + `//label/descendant::span[text()='${txt}']`;
            const lbl = await page.waitForSelector(selector, { visible: true });
            await lbl.click();
        };

        // Make whatever selection here; labels are clicked one after another.
        const selectionsArr = ['Gratis', 'Hoy', 'Negocios', 'Euro'];
        for (const sel of selectionsArr) {
            await clickLabel(sel);
        }

        await page.waitForSelector('ul[class^=search-main-content]'); // after clicking this reloads

        // The pagination text is split on ' de ' and the part after it is taken
        // as the last page number. It is parsed explicitly instead of relying
        // on string-to-number coercion in the loop condition. If the text has
        // no ' de ' part the result is NaN and the loop below simply does not run.
        const paginationText = await page.$eval('li[data-spec=pagination-parent]', el => el.textContent);
        const lastPage = Number.parseInt(paginationText.split(' de ')[1], 10);

        // Collect event links from every result page. The page number is set
        // through URLSearchParams rather than by textual replacement, so it
        // works even when the current URL carries no `page=` parameter.
        const searchUrl = new URL(page.url());
        const links = [];
        for (let i = 1; i <= lastPage; i++) {
            searchUrl.searchParams.set('page', String(i));
            await page.goto(searchUrl.href, { waitUntil: "load", timeout: 15000 });
            await page.waitForSelector('body');
            // get href but remove affiliate stuff ?aff=..
            const eventLinks = await page.$$eval(
                'div > div[class$=desktop-card] .event-card-details a',
                el => el.map(x => x.getAttribute('href').split('?')[0]),
            );
            links.push(...eventLinks);
        }

        // Visit each link to get title & details.
        const events = [];
        for (const link of links) {
            await page.goto(link, { waitUntil: "load", timeout: 15000 });
            await page.waitForSelector('body');

            const title = await page.$eval(".event-title", el => el.textContent);
            // The start date is optional: read `datetime` only if the element exists.
            const startDate = (await page.$('.start-date')) !== null
                ? await page.$eval('.start-date', el => el.getAttribute('datetime'))
                : null;
            const summary = await page.$eval('p.summary', el => el.textContent);
            // get text and remove utf8 bom
            const description = await page.$$eval("div[class$=main-inner]", el => el.map(x => x.textContent.replace(/[\uFEFF]/g, '')));
            // can be multiple
            const image = await page.$$eval("[data-testid=hero-img]", el => el.map(x => x.getAttribute('src')));

            events.push({ link, title, startDate, summary, description, image });
        }

        const json = JSON.stringify(events, null, 2);
        await fsp.writeFile('events.json', json);
    } catch (e) {
        console.log(e);
    } finally {
        // Always release the browser, even when scraping failed part-way.
        if (browser) {
            await browser.close();
        }
    }

})();

备注:

  • 示例中选择 'Gratis'、'Hoy'、'Negocios'、'Euro' 这几个标签,是为了缩小结果范围,尽量减少需要打开的页面数量

相关问题