javascript 有什么更好的方法来调试我的Puppeteer爬虫程序,以便我可以精确定位破坏脚本的样式/标记?

wnavrhmk  于 2023-05-05  发布在  Java
关注(0)|答案(1)|浏览(73)

我有一个 Puppeteer 爬虫,每隔几个月就会因为网站的小改动而失效。我希望加快调试过程,而不是每次出问题时都陷入无底洞般的排查。就下面的代码而言,有什么具体的方法可以做到这一点?

const puppeteer = require("puppeteer");
const delay = require("./delay.js")
const fs = require("fs");

// URLs of events in the future whose odds pages should be scraped.
const url = ["https://www.oddsportal.com/baseball/usa/mlb/oakland-athletics-seattle-mariners-bFxLtxG9/"]

// Collected results, one entry per successfully scraped URL.
const res = [];

// Launches a visible browser, visits every entry of `url`, extracts the
// average odds, the user predictions and the per-exchange odds movement, and
// appends each result (plus the final page URL) to `res`.
//
// `slowMo` (previously misspelled `sloMo`, which Puppeteer ignores) slows every
// Puppeteer operation down so the run can be followed visually.
puppeteer.launch({ headless: false, slowMo: 1000 }).then(async (browser) => {
    // Every exact-class selector the scraper depends on, in one place. The
    // keys are used in error messages, so a site change is reported as
    // "selector X matched 0 elements" instead of an anonymous TypeError.
    const SELECTORS = {
        averageOdds: '[class="flex flex-col items-center justify-center gap-1 border-r border-[#E0E0E0] min-w-[60px] max-sm:min-w-[55px] max-sm:max-w-[55px]"]',
        userPredictions: '[class="absolute w-full text-center cursor-pointer height-content"]',
        bettingExchangeCells: '[class="height-content min-mt:!hidden text-black-main font-bold text-xs leading-[18px]"]',
        exchangeLogo: '[class="w-[75px] bg-cover bg-no-repeat"]',
        movementColumns: '[class="flex flex-col gap-1 text-xs"]',
        openingOdds: '[class="flex gap-1"]',
    };
    // Class list the author left next to the first selector as a note:
    // relative flex flex-col items-center justify-center font-bold text-black-main

    try {
        const page = await browser.newPage();
        await page.goto("https://www.oddsportal.com/");
        await delay(1000);

        console.log("Getting data..");
        for (const eventUrl of url) {
            try {
                await page.goto(eventUrl);
                await page.waitForSelector(SELECTORS.averageOdds);

                await delay(500);
                // The callback runs inside the page, so it cannot see Node-side
                // variables; the selectors are passed in as a serializable argument.
                const data = await page.evaluate(async (sel) => {
                    debugger; // no-op unless a debugger (e.g. DevTools) is attached to the page
                    const pause = (milliseconds) => new Promise((resolve) => setTimeout(resolve, milliseconds));

                    // Returns every element matching the named selector and throws an
                    // error naming that selector when fewer than `minCount` match.
                    const queryAtLeast = (name, minCount) => {
                        const nodes = document.querySelectorAll(sel[name]);
                        if (nodes.length < minCount) {
                            throw new Error(`Selector "${name}" matched ${nodes.length} element(s), expected at least ${minCount}: ${sel[name]}`);
                        }
                        return nodes;
                    };

                    const averageNodes = queryAtLeast("averageOdds", 2);
                    const average = {
                        avg_odds_lt: averageNodes[0].innerText,
                        avg_odds_rt: averageNodes[1].innerText,
                    };
                    console.log(average); // logged in the browser console, not in Node

                    const predictionNodes = queryAtLeast("userPredictions", 2);
                    const UserPredictions = {
                        pxtx_lt: predictionNodes[0].innerText,
                        pxtx_rt: predictionNodes[1].innerText,
                    };

                    const bettingExchange = [];
                    const cells = document.querySelectorAll(sel.bettingExchangeCells);
                    for (let idx = 0; idx < cells.length; idx++) {
                        // Clicking a cell is followed by a short wait before the
                        // detail elements are read.
                        cells[idx].click();
                        await pause(500);

                        const columns = document.querySelectorAll(sel.movementColumns);
                        const exch_name = document.querySelectorAll(sel.exchangeLogo)[0]?.alt;
                        const odds = columns[1]?.innerText.split("\n");
                        const volume = columns[2]?.innerText.split("\n");
                        const opOdds = document.querySelectorAll(sel.openingOdds)[1]?.innerText.split("\n");

                        // Even-indexed cells are stored as the left side, odd-indexed as the right.
                        const side = idx % 2 === 0 ? "left_OddsMovement" : "right_OddsMovement";
                        bettingExchange.push({
                            exch_name,
                            [side]: { odds, volume },
                            OpeningOdds: opOdds,
                        });
                    }

                    return { average, UserPredictions, bettingExchange };
                }, SELECTORS);

                data['url'] = page.url();
                console.log(`${eventUrl} =>[Success]`);
                res.push(data);
            } catch (err) {
                // Log the actual error: it carries the name of the selector that
                // failed (or the Puppeteer timeout message from waitForSelector).
                console.error(`${eventUrl} =>[ERROR]`, err);
            }
        }
    } finally {
        // Close the browser even when the initial navigation throws.
        await browser.close();
    }
});

几个小时前脚本还能输出 [Success],我认为在这之后,11 种样式(class 选择器)中的某一个发生了变化。我怎样才能更好地调试它,以便准确知道是哪个样式破坏了脚本,而不必逐个深入排查每个样式?
我对 Puppeteer 还比较陌生,因为我更习惯使用 Python 的 BeautifulSoup。谢谢!

x33g5p2x

x33g5p2x1#

我有个主意:你可以把每个 CSS 文件保存下来,并在下一次执行时与上一次保存的版本进行比较。
此代码片段有*一部分*是我借助 AI 生成的。

const fs = require('fs');

// Collect the body of every stylesheet the page loads, keyed by its URL.
// Responses are observed through the 'response' event, so request
// interception is not needed. (HTTPRequest.response() is not a Promise and is
// still null while the 'request' event fires, so it cannot be used here.)
const cssResponses = new Map();
page.on('response', async (response) => {
  if (response.request().resourceType() !== 'stylesheet') {
    return;
  }
  try {
    // text() is asynchronous; store the resolved string, not the Promise.
    cssResponses.set(response.url(), await response.text());
  } catch (err) {
    console.error(`Could not read stylesheet ${response.url()}:`, err);
  }
});

// NOTE: the page navigation (page.goto(...)) has to happen between registering
// the listener above and the comparison below, otherwise the map is empty.

// Compare each collected stylesheet with the copy saved by a previous run,
// then save the current copy. A for...of loop is used because `await` is not
// allowed inside a non-async forEach callback.
for (const [url, content] of cssResponses) {
  // Use only the last path segment so query strings do not end up in the filename.
  const filename = new URL(url).pathname.split('/').pop() || 'stylesheet.css';

  // A missing cache file just means this is the first run for that stylesheet;
  // any other read error is rethrown.
  const storedContent = await fs.promises.readFile(filename, 'utf8').catch((err) => {
    if (err.code === 'ENOENT') {
      return '';
    }
    throw err;
  });

  // The throw happens before the write, so the cached copy is not overwritten
  // and stays available for diffing against the new content.
  if (storedContent && content !== storedContent) {
    throw new Error(`File ${filename} has changed!`);
  }

  await fs.promises.writeFile(filename, content);
  console.log(`Saved ${filename}`);
}

相关问题