Node.js Puppeteer 无法检测到选择器

jq6vz3qz  于 2023-06-22  发布在  Node.js
关注(0)|答案(1)|浏览(103)

我正在尝试抓取这个网站:https://www.vivino.com/CA/en/caymus-vineyards-special-selection-cabernet-sauvignon/w/66294?year=2017&price_id=27210070

...
    // Current entry from the list of wine links (the enclosing loop is not shown in this excerpt).
    let wine = wineLinks[i];
    console.log(wine);
    // Open a separate tab for this wine and navigate to its link.
    let winePage = await browser.newPage();
    await winePage.goto(wine.link, {waitUntil: "domcontentloaded"});

    // Runs inside the page: look up the element by CSS selector and read its text.
    // If nothing matches, querySelector yields null and the .textContent access
    // throws the TypeError quoted below this snippet.
    const alcohol = await winePage.evaluate(() => document.querySelector("div.wrap div#wine-page-lower-section div.vintagePage__bottom--1fcqg div.inner div:nth-child(5) table tbody tr:nth-child(5) td span").textContent);
    console.log(alcohol);

但是,它显示以下错误:
Error [TypeError]: Cannot read properties of null (reading 'textContent')(无法读取 null 的属性,读取 "textContent" 时出错)
当在浏览器中测试上述选择器时,它返回了我需要的内容。有什么建议我可能做错了吗?

xpcnnkqh

xpcnnkqh1#

你可以通过多次向下滚动来加载页面的全部内容,因为页面的部分内容只有在向下滚动一段距离后才会加载。但这其实没有必要:所有信息都已随 window.__PRELOADED_STATE__ 一起加载,你只需在页面中读取该对象,就能得到任何想要的信息,参见下面的代码。
代码:

const puppeteer = require('puppeteer');

// Declared outside the async function so the finally() handler at the bottom
// can close the browser whether the scrape succeeds or fails.
let browser;
(async () => {

    // Assign to the outer variable. Re-declaring it here with const/let would
    // shadow it, leave the outer one undefined, and the browser would never close.
    browser = await puppeteer.launch({headless: "new"});
    const page = await browser.newPage();

    const url = "https://www.vivino.com/CA/en/caymus-vineyards-special-selection-cabernet-sauvignon/w/66294?year=2017&price_id=27210070";

    await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
    await page.waitForSelector("body");

    // Read the state object the page exposes on window instead of querying
    // DOM nodes that are only rendered after scrolling.
    const preloadedState = await page.evaluate(() => {
        return window.__PRELOADED_STATE__;
    });

    console.log(preloadedState.vintagePageInformation.vintage.wine_facts.alcohol); // the alcohol content

    //console.log(preloadedState.vintagePageInformation); // all info on the page

})().catch(err => console.error(err)).finally(() => browser?.close());

更新:您需要检查alcohol字段是否存在,因为在某些页面上它不存在。下面代码的Result

const puppeteer = require('puppeteer');

// Declared outside the async function so the finally() handler at the bottom
// can close the browser whether the scrape succeeds or fails.
let browser;
(async () => {

    // Assign to the outer variable. Re-declaring it here with const/let would
    // shadow it, leave the outer one undefined, and the browser would never close.
    browser = await puppeteer.launch({headless: false});
    const page = await browser.newPage();

    const url = "https://www.vivino.com/explore?e=eJwlykEOQDAQRuHb_Eupsp1LiI2IyKhqmigyGvT2Gjbv27wgVBcawW-kEPihUikFk6htYHI6HFTCLXSxeBt5xS4zzfY02KdEwtFv7hz5ssLO4o79kP8P_VP91C8jayMw";

    await page.goto(url, { waitUntil: "networkidle0", timeout: 30000 });
    await page.waitForSelector("body");

    // This won't get all the links on the page; you need to scroll down first to load the rest.
    const links = await page.$$eval('a[data-testid=vintagePageLink]', el => el.map(x => x.getAttribute('href')));

    const wines = [];
    for (const link of links) {
        // Resolve the href against the site origin rather than concatenating strings.
        const wineUrl = new URL(link, "https://www.vivino.com").href;
        await page.goto(wineUrl, { waitUntil: "networkidle0", timeout: 30000 });
        await page.waitForSelector("body");

        // Optional chaining keeps the in-page lookup from throwing when the
        // preloaded state (or part of it) is absent on a given page.
        const data = await page.evaluate(() => {
            return window.__PRELOADED_STATE__?.vintagePageInformation?.vintage;
        });

        if (!data) {
            // Skip pages without vintage data instead of aborting the whole run.
            console.warn(`No vintage data found for ${wineUrl}`);
            continue;
        }

        // Some pages have no alcohol value (or no wine_facts at all); record null
        // for those. `??` keeps a genuine 0, which a truthiness test would drop.
        const alcohol = data.wine_facts?.alcohol ?? null;

        wines.push({ name: data.name, alcohol });
    }

    console.log(wines);

})().catch(err => console.error(err)).finally(() => browser?.close());

相关问题