javascript Puppeteer Node.js YouTube数据抓取错误“Evaluation failed(评估失败)”

pnwntuvh  于 2022-12-28  发布在  Java
关注(0)|答案(1)|浏览(189)

我正在尝试使用Puppeteer从一个频道抓取YouTube标题和链接。在执行程序时,我面临如下评估错误:

Error: Evaluation failed: TypeError: Cannot read properties of null (reading 'innerText')
    at pptr://__puppeteer_evaluation_script__:10:65
    at ExecutionContext._ExecutionContext_evaluate (E:\somoy\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:229:15)
    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
    at async ExecutionContext.evaluate (E:\somoy\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:107:16)
    at async initiate (E:\somoy\appNew.js:45:20)
    at async E:\somoy\appNew.js:155:9
/**
 * Opens the ProthomAlo channel's "videos" page in a locally installed Chrome
 * and tries to collect a headline and link for every element matching
 * `.style-scope.ytd-rich-grid-media`.
 *
 * This is the code that produces the reported
 * "Cannot read properties of null (reading 'innerText')" error; see the
 * NOTE(review) comments below for where it originates.
 *
 * Relies on `puppeteer`, `fs` and `delay` being defined outside this function.
 * The collected list is assigned to a local and is neither returned nor used
 * in the visible code.
 */
async function initiate() {
    const browser = await puppeteer.launch({ headless: false, defaultViewport: null, userDataDir: './userdata', executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' });
    const page = await browser.newPage();
    // NOTE(review): a timeout of 0 means navigation may wait indefinitely.
    page.setDefaultNavigationTimeout(0)
    await page.goto('https://www.youtube.com/@ProthomAlo/videos', { waitUntil: 'networkidle2' });
    // Presumably an extra 5000 ms pause for the page to render; `delay` is defined elsewhere.
    await delay(5000);
    if (!fs.existsSync('storeLink.txt')) {
        // Create an empty file when it does not exist yet, so the read below cannot fail on a missing file.
        fs.writeFileSync("storeLink.txt", '');
    }
    // NOTE(review): `articleLinkarr` is assigned without const/let inside this function,
    // so unless it is declared elsewhere it becomes an implicit global.
    // One entry per line of storeLink.txt.
    articleLinkarr = (fs.readFileSync('storeLink.txt', { encoding: 'utf8' })).split('\n')
    // The callback runs inside the page (the `pptr://__puppeteer_evaluation_script__` frame
    // in the stack trace); `articleLinkarr` is passed in as its argument but is only
    // referenced by the commented-out filter below.
    let articles = await page.evaluate(async (articleLinkarr) => {
        //console.log('Hello1')
        let arrObj = [];
        // NOTE(review): assigned without const/let inside the callback.
        // The selector matches every element that carries both classes.
        articles = document.querySelectorAll('.style-scope.ytd-rich-grid-media');

        for (let i = 0; i < articles.length; i++) {
            //for (let i = 0; i < 20; i++) {
                //const category = document.querySelector('.print-entity-section-wrapper.F93gk').innerText
                //const headline = articles[i].querySelector('div > h3').innerText
                // NOTE(review): source of the reported TypeError. querySelector('h3') returns
                // null for a matched element with no <h3> descendant, and reading `.innerText`
                // on null throws, which rejects the whole evaluate() call.
                const headline = articles[i].querySelector('h3').innerText
                // NOTE(review): same null risk for elements without an <a> descendant.
                const link = 'https://www.youtube.com' + articles[i].querySelector('a').getAttribute('href')
                // if (!(link.includes('video') || link.includes('fun') || link.includes('photo'))) {
                //     if (!articleLinkarr.includes(link)) {
                arrObj.push({ articleHeadline: headline, articleLink: link })
                //     }
                // }
    };
    return arrObj;
}, articleLinkarr)
}
fhity93d

fhity93d1#

如果你只想要初始的标题集,Puppeteer在这里似乎没有必要。静态HTML中有一个JSON blob,它有标题列表,所以你可以向URL发出一个简单的HTTP请求,用HTML解析器提取blob,然后遍历对象结构。

const cheerio = require("cheerio"); // 1.0.0-rc.12

const url = "Your URL";

/**
 * Collects the title and relative URL of every video card found in a parsed
 * `ytInitialData` object.
 *
 * Entries of the grid that are not video cards (no `richItemRenderer`, or a
 * `richItemRenderer` whose content is not a `videoRenderer`) are skipped
 * instead of aborting the whole listing with a TypeError.
 *
 * @param {object} data - Parsed `ytInitialData` JSON of a channel "videos" page.
 * @returns {{title: string, url: string}[]} One entry per video card, in page order.
 * @throws {TypeError} If the expected tab/grid structure is missing from `data`.
 */
function extractVideos(data) {
  // The grid is read from the second tab, as in the original traversal.
  const {contents} =
    data.contents.twoColumnBrowseResultsRenderer.tabs[1].tabRenderer
      .content.richGridRenderer;
  const videos = [];

  for (const item of contents) {
    const video = item.richItemRenderer?.content?.videoRenderer;

    if (!video) {
      continue;
    }

    videos.push({
      title: video.title.runs[0].text,
      url: video.navigationEndpoint.commandMetadata.webCommandMetadata.url,
    });
  }

  return videos;
}

// Fetch the static HTML, pull the inline JSON blob out of its <script> tag and
// print the list of {title, url} pairs. Any failure is logged, not rethrown.
fetch(url) // Node 18 or install node-fetch
  .then(res => {
    if (!res.ok) {
      // statusText may be empty, so include the numeric status as well.
      throw new Error(`Request failed: ${res.status} ${res.statusText}`);
    }

    return res.text();
  })
  .then(html => {
    // The blob is assigned in an inline script of the form
    // `var ytInitialData = {...};`.
    const prefix = "var ytInitialData = ";
    const $ = cheerio.load(html);
    const scriptText = [...$("script")]
      .map(e => $(e).text())
      .find(text => text.startsWith(`${prefix}{`));

    if (scriptText === undefined) {
      throw new Error("ytInitialData script not found in the response HTML");
    }

    // Drop the assignment prefix and the trailing ";" to leave bare JSON.
    const data = JSON.parse(scriptText.slice(prefix.length, -1));
    const titles = extractVideos(data);

    console.log(titles);
  })
  .catch(err => console.error(err));

如果你想使用Puppeteer,你可以选择这些标题和网址:

const puppeteer = require("puppeteer"); // ^19.0.0

const url = "Your URL";

// Kept outside the async function so the cleanup step can still reach the
// browser handle when a later step fails.
let browser;
(async () => {
  try {
    browser = await puppeteer.launch();
    const openPages = await browser.pages();
    const page = openPages[0];
    await page.goto(url, {waitUntil: "domcontentloaded"});

    // Wait for the first title link to exist, then read all of them at once.
    await page.waitForSelector("#video-title-link");
    const titles = await page.$$eval("#video-title-link", anchors => {
      const collected = [];

      for (const anchor of anchors) {
        // Entries without a link target are left out.
        if (anchor.href) {
          collected.push({title: anchor.textContent, url: anchor.href});
        }
      }

      return collected;
    });

    console.log(titles);
  } catch (err) {
    console.error(err);
  } finally {
    // `browser` is still undefined if launching itself failed.
    await browser?.close();
  }
})();

出于某种原因,这些元素的 id 并不唯一(页面上有多个元素共用 #video-title-link 这个 id),所以可以用它一次选中所有标题链接。
虽然代码较少,但这种方法比fetch慢得多(在我的机器上慢了大约10倍),不过您可以通过拦截并屏蔽不相关的资源请求来稍微加快速度。
另外,始终在变量前面使用const,以避免使它们成为全局变量。
page.setDefaultNavigationTimeout(0)应该等待,通常不是一个很好的模式--这可能会永远挂起。我会将此设置为最多3或4分钟。如果nav花费了这么长时间,则一定是出了问题,您应该将其记录下来,以便查看。

相关问题