javascript Puppeteer不关闭浏览器

h9a6wy2h  于 2022-12-17  发布在  Java
关注(0)|答案(8)|浏览(287)

我正在express/node/ubuntu上运行puppeteer,如下所示:

var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();

/* GET home page. */
router.get('/', function(req, res, next) {
    // The async IIFE below is started but its promise is neither awaited nor
    // given a rejection handler, and `next` is never called.
    (async () => {
        // Assigned without var/let/const and never read afterwards in this
        // snippet; the launch options below use the literal `true` instead.
        headless = true;
        const browser = await puppeteer.launch({headless: true, args:['--no-sandbox']});
        const page = await browser.newPage();
        // Also assigned without a declaration; holds the `url` query parameter.
        url = req.query.url;
        await page.goto(url);
        // Serialize the rendered <body> markup inside the page context.
        let bodyHTML = await page.evaluate(() => document.body.innerHTML);
        res.send(bodyHTML)
        // Only reached when every awaited call above resolved; if any of them
        // rejects, execution leaves the function before this line runs.
        await browser.close();
    })();
});

多次运行此脚本会留下数百个僵尸:

$ pgrep chrome | wc -l
133

这些僵尸进程把服务器堵死了。
我该怎么解决这个问题?
从Express JS脚本运行kill可以解决这个问题吗?
除了Puppeteer和无头Chromium(Headless Chromium)之外,有没有更好的方法来获得同样的效果?

xa9qqrwz

xa9qqrwz1#

啊!这是一个简单的疏忽。如果中途发生错误,你的await browser.close()就永远不会执行,于是留下了僵尸进程。
使用shell.js看起来只是一种取巧(hacky)的权宜之计。
更好的做法是使用try..catch..finally。原因是:无论代码走的是正常流程(happy path)还是抛出了错误,您都希望浏览器被关闭。与其他代码片段不同,您不必同时在catch块和finally块中关闭浏览器,因为无论是否抛出错误,finally块始终会执行。
所以,你的代码应该看起来像,

const puppeteer = require('puppeteer');
const express = require('express');

const router = express.Router();

/* GET home page. */
/**
 * GET / — renders the page named by the `url` query parameter in headless
 * Chromium and responds with the inner HTML of its <body>.
 *
 * The browser is launched inside the `try` so that a failed launch is handled
 * like any other error, and it is closed in `finally` so that no Chromium
 * process outlives the request, whether the request succeeded or failed.
 */
router.get('/', function(req, res, next) {
  (async () => {
    let browser;

    try {
      browser = await puppeteer.launch({
        headless: true,
        args: ['--no-sandbox'],
      });
      const page = await browser.newPage();
      const url = req.query.url;
      await page.goto(url);
      const bodyHTML = await page.evaluate(() => document.body.innerHTML);
      res.send(bodyHTML);
    } catch (e) {
      console.log(e);
      // Hand the error to Express so the client gets a response instead of
      // waiting on a request that will never be answered.
      next(e);
    } finally {
      // `browser` is still undefined when launch() itself failed.
      if (browser) {
        try {
          await browser.close();
        } catch (closeError) {
          // Nothing awaits this IIFE, so a close failure must be logged here
          // rather than escaping as an unhandled rejection.
          console.log(closeError);
        }
      }
    }
  })();
});

希望这有帮助!

s71maibg

s71maibg2#

像这样将代码包裹在try-catch中,看看是否有帮助

// Fragment meant to run inside an async Express route handler where
// `puppeteer`, `req` and `res` are in scope.
const browser = await puppeteer.launch({headless: true, args:['--no-sandbox']});
try {
  const page = await browser.newPage();
  const url = req.query.url;
  await page.goto(url);
  const bodyHTML = await page.evaluate(() => document.body.innerHTML);
  res.send(bodyHTML);
} catch (error) {
  console.log(error);
  // Answer the request on failure so the client is not left hanging.
  if (!res.headersSent) res.sendStatus(500);
} finally {
  // The only close() call: `finally` runs on both the success and the error
  // path, so closing inside the `try` as well would close the browser twice.
  await browser.close();
}
vdgimpew

vdgimpew3#

根据我的经验,调用close之后,浏览器的关闭过程可能需要一段时间。无论如何,您可以检查浏览器的process属性,如果进程仍未退出,就强制杀死它。

// Send SIGINT to the browser's child process, if the browser exists and
// still has a process attached.
if (browser) {
    const chromeProcess = browser.process();
    if (chromeProcess != null) {
        chromeProcess.kill('SIGINT');
    }
}

我也在下面贴出了我的Puppeteer资源管理器(ResourceManager)的完整代码。

const puppeteer = require('puppeteer-extra')
const randomUseragent = require('random-useragent');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')

const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36';
puppeteer.use(StealthPlugin())

/**
 * Owns one Puppeteer browser and hands out pre-configured pages.
 *
 * Constructor function (call with `new`). Exposes three async methods:
 *   - init():          resets the crash counter and launches the browser.
 *   - release():       marks the manager as released and closes the browser.
 *   - createPage(url): launches the browser if needed, opens a page,
 *                      configures it and navigates to `url`.
 *
 * When the browser emits 'disconnected' without release() having been called,
 * it is relaunched. The restart counter is only reset by init(), so repeated
 * crashes eventually stop being retried.
 *
 * @param {boolean} loadImages - when falsy, stylesheet, font and image
 *     requests are aborted on every page created by this manager.
 */
function ResourceManager(loadImages) {
    // Resource types aborted when `loadImages` is falsy.
    const BLOCKED_RESOURCE_TYPES = new Set(['stylesheet', 'font', 'image']);
    // A crash is retried while the counter is <= this value.
    const MAX_RETRIES = 3;

    let browser = null;
    let retries = 0;
    let isReleased = false;

    this.init = async () => {
        isReleased = false;
        retries = 0;
        browser = await runBrowser();
    };

    this.release = async () => {
        // Set the flag first so the 'disconnected' event caused by close()
        // is not mistaken for a crash.
        isReleased = true;
        if (browser) await browser.close();
    };

    this.createPage = async (url) => {
        if (!browser) browser = await runBrowser();
        return await createPage(browser, url);
    };

    // Launches a browser and wires up the crash-recovery handler.
    async function runBrowser () {
        const bw = await puppeteer.launch({
            headless: true,
            devtools: false,
            ignoreHTTPSErrors: true,
            slowMo: 0,
            args: ['--disable-gpu','--no-sandbox','--no-zygote','--disable-setuid-sandbox','--disable-accelerated-2d-canvas','--disable-dev-shm-usage', "--proxy-server='direct://'", "--proxy-bypass-list=*"]
        });

        bw.on('disconnected', async () => {
            if (isReleased) return;
            console.log("BROWSER CRASH");
            if (retries > MAX_RETRIES) {
                throw new Error("===================== BROWSER crashed more than 3 times");
            }
            retries += 1;
            // Make sure the dead browser's process is really gone.
            if (browser && browser.process() != null) browser.process().kill('SIGINT');
            // Relaunch directly rather than through init(): init() resets
            // `retries` to 0, which would make the limit above unreachable.
            browser = await runBrowser();
        });

        return bw;
    }

    // Opens a page on `browser`, applies viewport / user-agent / interception
    // settings and the navigator overrides, then navigates to `url`.
    // The page is closed again if any step after newPage() fails.
    async function createPage (browser,url) {
        const userAgent = randomUseragent.getRandom();
        const UA = userAgent || USER_AGENT;
        const page = await browser.newPage();
        try {
            await page.setViewport({
                width: 1920 + Math.floor(Math.random() * 100),
                height: 3000 + Math.floor(Math.random() * 100),
                deviceScaleFactor: 1,
                hasTouch: false,
                isLandscape: false,
                isMobile: false,
            });
            await page.setUserAgent(UA);
            await page.setJavaScriptEnabled(true);
            // NOTE(review): a timeout of 0 here and in goto() below disables
            // the navigation timeout, so a stalled navigation waits forever;
            // consider a finite value.
            await page.setDefaultNavigationTimeout(0);
            if (!loadImages) {
                await page.setRequestInterception(true);
                page.on('request', (req) => {
                    if (BLOCKED_RESOURCE_TYPES.has(req.resourceType())) {
                        req.abort();
                    } else {
                        req.continue();
                    }
                });
            }

            await page.evaluateOnNewDocument(() => {
                //pass webdriver check
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => false,
                });
            });

            await page.evaluateOnNewDocument(() => {
                //pass chrome check
                window.chrome = {
                    runtime: {},
                    // etc.
                };
            });

            await page.evaluateOnNewDocument(() => {
                //pass plugins check
                const originalQuery = window.navigator.permissions.query;
                return window.navigator.permissions.query = (parameters) => (
                    parameters.name === 'notifications' ?
                        Promise.resolve({ state: Notification.permission }) :
                        originalQuery(parameters)
                );
            });

            await page.evaluateOnNewDocument(() => {
                // Overwrite the `plugins` property to use a custom getter.
                Object.defineProperty(navigator, 'plugins', {
                    // This just needs to have `length > 0` for the current test,
                    // but we could mock the plugins too if necessary.
                    get: () => [1, 2, 3, 4, 5],
                });
            });

            await page.evaluateOnNewDocument(() => {
                // Overwrite the `languages` property to use a custom getter.
                Object.defineProperty(navigator, 'languages', {
                    get: () => ['en-US', 'en'],
                });
            });

            await page.goto(url, { waitUntil: 'networkidle2',timeout: 0 } );
            return page;
        } catch (err) {
            // Do not leave a half-configured tab open in the shared browser.
            try {
                await page.close();
            } catch (closeErr) {
                // Ignored on purpose: the original failure is the one worth
                // reporting to the caller.
            }
            throw err;
        }
    }
}

module.exports = {ResourceManager}
bq8i3lrv

bq8i3lrv4#

我用shelljs(https://www.npmjs.com/package/shelljs)来解决这个问题

// Runs `pkill chrome` through shelljs. Note that this targets every process
// matching "chrome", not only the browser started by the current request.
const shell = require('shelljs');

shell.exec('pkill chrome');
sxissh06

sxissh065#

尝试在发送响应之前关闭浏览器

var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();

/**
 * GET / — renders the page named by the `url` query parameter and responds
 * with the inner HTML of its <body>.
 *
 * The browser is closed BEFORE the response is sent. The close lives in a
 * `finally` so it also happens when navigation or evaluation fails, and any
 * error is forwarded to Express through `next`.
 */
router.get('/', function(req, res, next) {
    (async () => {
        const browser = await puppeteer.launch({headless: true});
        let bodyHTML;
        try {
            const page = await browser.newPage();
            const url = req.query.url;
            await page.goto(url);
            bodyHTML = await page.evaluate(() => document.body.innerHTML);
        } finally {
            // Runs on success and on failure, always ahead of res.send().
            await browser.close();
        }
        res.send(bodyHTML);
    })().catch(next);
});
lnxxn5zx

lnxxn5zx6#

我遇到了同样的问题,虽然你的shelljs解决方案确实有效,但它会杀死所有chrome进程,这可能会中断一个正在处理请求的进程。

var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();

/**
 * GET / — renders the page named by the `url` query parameter and responds
 * with the inner HTML of its <body>.
 *
 * Only the browser launched for this request is closed (no process-wide
 * kill). It is closed in a `finally` so it is released on failure too, and
 * rejections are forwarded to Express through `next`.
 */
router.get('/', function (req, res, next) {
    puppeteer.launch({ headless: true })
        .then(async (browser) => {
            try {
                const page = await browser.newPage();
                const url = req.query.url;
                await page.goto(url);
                return await page.evaluate(() => document.body.innerHTML);
            } finally {
                // Closed before the response is sent, on every path.
                await browser.close();
            }
        })
        .then((bodyHTML) => res.send(bodyHTML))
        .catch(next);
});
8e2ybdfx

8e2ybdfx7#

使用

// `browser` holds a promise here: resolve it first, then also await close()
// so that shutdown completes (and any close error surfaces) before moving on.
await (await browser).close();

之所以会出现这种情况,是因为browser变量里保存的是一个Promise,必须先等它解析(resolve)之后才能调用close。我为此吃了不少苦头,希望这对你有帮助。

idfiyjo8

idfiyjo88#

我使用以下基本设置运行Puppeteer:

const puppeteer = require("puppeteer");

// Kept outside the async function so the final cleanup step can see it.
let browser;

// Launches the browser and works with the tab Puppeteer opens by default.
const run = async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();

  /* use the page */

};

run()
  // Log any failure from the launch or the page work.
  .catch((err) => console.error(err))
  // Always close the browser; `?.` covers the case where launch() failed.
  .finally(() => browser?.close());

在这里,finally块保证无论是否抛出错误,浏览器都会被正确关闭。错误会被记录下来(如果需要)。我喜欢把.catch和.finally写成链式调用,因为这样主线的Puppeteer代码更扁平,不过下面的写法也能完成同样的事情:

const puppeteer = require("puppeteer");

// Same flow as a single self-invoking async function: launch, use the first
// page, log errors, and close the browser no matter what happened.
(async function main() {
  let browser;

  try {
    browser = await puppeteer.launch();
    const [page] = await browser.pages();

    /* use the page */
  } catch (err) {
    console.error(err);
  } finally {
    // `browser` is undefined when launch() itself threw.
    await browser?.close();
  }
})();

没有理由调用newPage,因为Puppeteer从打开的页面开始。
至于Express,您只需要把上面的整段代码(包括let browser;,但不包括require("puppeteer"))放进您的路由处理函数中即可,不过您可能还希望使用异步中间件错误处理器(async middleware error handler)。
你问:
除了Puppeteer和无头Chromium(Headless Chromium)之外,有没有更好的方法来获得同样的效果?
这取决于你在做什么,以及你所说的“更好”是什么意思。如果你的目标只是得到document.body.innerHTML,并且你感兴趣的页面内容已经直接包含在静态HTML中,那么你完全可以弃用Puppeteer,直接发送HTTP请求获取该资源,然后使用Cheerio提取所需的信息。
另一个注意事项是,您可能不需要为每个请求加载和关闭整个浏览器。如果可以为每个请求使用一个新页,请考虑以下策略:

const express = require("express");
const puppeteer = require("puppeteer");

/**
 * Wraps a route handler so that a rejected promise it returns is passed to
 * Express's `next` instead of going unhandled.
 *
 * @param {Function} fn - handler called as fn(req, res, next); may return a promise.
 * @returns {Function} (req, res, next) handler returning a promise for fn's result.
 */
const asyncHandler = (fn) => {
  return (req, res, next) => {
    const outcome = fn(req, res, next);
    return Promise.resolve(outcome).catch(next);
  };
};
// One browser for the whole process; each request awaits this promise and
// opens its own page instead of launching a new browser.
const browserReady = puppeteer.launch({
  args: ["--no-sandbox", "--disable-setuid-sandbox"]
});

/**
 * Parses `raw` as an absolute URL and accepts it only when the scheme is
 * http or https, so the request cannot point the browser at file:// or
 * other non-web schemes.
 *
 * @param {*} raw - candidate URL taken from the query string.
 * @returns {URL|null} the parsed URL, or null when it is not acceptable.
 */
const parseHttpUrl = raw => {
  if (typeof raw !== "string") return null;
  let parsed;
  try {
    parsed = new URL(raw);
  }
  catch (err) {
    return null;
  }
  return parsed.protocol === "http:" || parsed.protocol === "https:"
    ? parsed
    : null;
};

const app = express();
app
  .set("port", process.env.PORT || 5000)
  .get("/", asyncHandler(async (req, res) => {
    // Validate before touching the browser so a bad request costs no page.
    const target = parseHttpUrl(req.query.url || "http://www.example.com");
    if (!target) {
      return res.status(400).send("url must be an absolute http(s) URL");
    }

    const browser = await browserReady;
    const page = await browser.newPage();

    try {
      await page.goto(target.href);
      return res.send(await page.content());
    }
    catch (err) {
      return res.status(400).send(err.message);
    }
    finally {
      // Close only this request's page; the shared browser stays open.
      await page.close();
    }
  }))
  // Fallback for errors forwarded by asyncHandler (e.g. a failed launch).
  .use((err, req, res, next) => res.sendStatus(500))
  .listen(app.get("port"), () =>
    console.log("listening on port", app.get("port"))
  )
;

最后,确保不要将任何超时设置为0(例如,await page.setDefaultNavigationTimeout(0);),因为这可能会导致脚本永远挂起,如果您需要一个较宽的超时,最多将其设置为几分钟--足够长,不会触发误报。
另见:

相关问题