chrome无头浏览器可以用来保存完整的网页与资源吗?

zmeyuzjn  于 2023-04-27  发布在  Go
关注(0)|答案(1)|浏览(178)

我希望用无头(headless)Chrome 实现浏览器“另存为”的功能,即保存完整的网页及其所有资源。有没有办法做到这一点?我已经尝试过 --print-to-pdf 和 --screenshot 选项,现在想探索类似“另存为”的方案。

agxfikkp

agxfikkp1#

这是完全可能的,但并不容易。你必须自己做繁重的工作。这意味着:
1.将所有链接的资源保存到本地目录。
2.将所有指向这些资源的链接重写为相对路径。
3.将重写后的HTML文件保存到同一本地目录。
下面是一个使用Playwright的例子。(注意,这段代码是从现有项目中截取的,并为这个答案进行了清理。它可能无法完美地工作。)

const { webkit } = require('playwright');
const { parse } = require('node-html-parser');
const fs = require('fs-extra');
const path = require('path');
const url = require('url');

/**
 * Resolve a reference against a base URL, accepting only http(s) results.
 *
 * @param {string} reference - Raw attribute value taken from the document.
 * @param {string} base - Absolute URL the reference is relative to.
 * @returns {URL|null} The resolved URL, or null when the reference is
 *   malformed or uses another scheme (data:, mailto:, javascript:, tel:, ...).
 */
function resolveHttpUrl(reference, base) {
  let resolved;
  try {
    resolved = new URL(reference, base);
  } catch {
    return null;
  }
  return resolved.protocol === 'http:' || resolved.protocol === 'https:' ? resolved : null;
}

/**
 * Map an absolute asset URL to a path relative to the output directory.
 *
 * The result always uses forward slashes so it can be written into an HTML
 * attribute unchanged. The query string and fragment are not part of the
 * result, so URLs that differ only by query share one local file.
 *
 * @param {URL} assetUrl - Absolute http(s) URL of the asset.
 * @returns {string} Path of the form `<host>/<pathname>`.
 */
function toLocalRelativePath(assetUrl) {
  // A ':' in the first path segment (host:port, IPv6) would make browsers
  // read the relative link as "<scheme>:<rest>".
  const hostDir = assetUrl.host.replace(/:/g, '_');
  // A URL ending in '/' names a directory; give it a file name to write to.
  const filePath = assetUrl.pathname.endsWith('/')
    ? `${assetUrl.pathname}index.html`
    : assetUrl.pathname;
  return path.posix.join(hostDir, filePath);
}

/**
 * Save a web page and the resources it references to a local directory.
 *
 * The page is rendered in a headless browser, every http(s) URL found in a
 * `src` or `href` attribute is downloaded to `<outputDir>/<host>/<path>`, the
 * attributes are rewritten to those relative paths, and the rewritten
 * document is written to `<outputDir>/index.html`.
 *
 * Individual asset failures are logged and skipped; they do not reject.
 *
 * @param {string} urlToSave - Absolute URL of the page to save.
 * @param {string} outputDir - Directory that receives index.html and assets.
 * @returns {Promise<void>} Resolves once the document and all assets that
 *   could be fetched have been written and the browser has been closed.
 */
async function saveWebpage(urlToSave, outputDir) {
  const browser = await webkit.launch();
  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(urlToSave);
    const html = await page.content();
    const parsedHtml = parse(html);

    // Relative links resolve against the final URL (after redirects),
    // optionally overridden by <base href>, which may itself be relative.
    const pageUrl = page.url();
    const baseTag = parsedHtml.querySelector('base');
    const baseHref = baseTag ? baseTag.getAttribute('href') : undefined;
    let baseUrl = pageUrl;
    if (baseHref) {
      const resolvedBase = resolveHttpUrl(baseHref, pageUrl);
      if (resolvedBase) {
        baseUrl = resolvedBase.href;
      }
      // The saved document uses paths relative to outputDir, so a remaining
      // <base href> would redirect every rewritten link.
      baseTag.removeAttribute('href');
    }

    const savedPaths = new Set();
    const assetDownloadPromises = [];

    // Fetch one asset through the browser context (shares its cookies)
    // without opening a page per asset. Returns undefined on failure.
    const fetchAsset = async (assetUrl) => {
      try {
        const response = await context.request.get(assetUrl);
        if (!response.ok()) {
          console.error(`Error fetching asset: ${assetUrl} - HTTP ${response.status()}`);
          return undefined;
        }
        return await response.body();
      } catch (error) {
        console.error(`Error fetching asset: ${assetUrl} - ${error.message}`);
        return undefined;
      }
    };

    // Rewrite every element carrying the attribute and queue its download.
    const processAttribute = (attributeName) => {
      for (const element of parsedHtml.querySelectorAll(`[${attributeName}]`)) {
        const originalUrl = element.getAttribute(attributeName);
        // Empty values and same-document fragments must stay as they are.
        if (!originalUrl || originalUrl.startsWith('#')) continue;

        const absoluteUrl = resolveHttpUrl(originalUrl, baseUrl);
        if (!absoluteUrl) continue;

        const fragment = absoluteUrl.hash;
        absoluteUrl.hash = '';
        const relativePath = toLocalRelativePath(absoluteUrl);
        const localPath = path.join(outputDir, relativePath);

        element.setAttribute(attributeName, `${relativePath}${fragment}`);

        // Deduplicate by destination so no file is written twice concurrently.
        if (savedPaths.has(relativePath)) continue;
        savedPaths.add(relativePath);

        assetDownloadPromises.push(
          fetchAsset(absoluteUrl.href)
            .then((buffer) => buffer && fs.outputFile(localPath, buffer))
            .catch((error) => {
              console.error(`Error saving asset: ${absoluteUrl.href} - ${error.message}`);
            })
        );
      }
    };

    processAttribute('src');
    processAttribute('href');

    await fs.outputFile(path.join(outputDir, 'index.html'), parsedHtml.toString());
    await Promise.allSettled(assetDownloadPromises);
  } finally {
    // Always release the browser, including when navigation or saving fails.
    await browser.close();
  }
}

// Example invocation: mirror the page at urlToSave into the outputDir folder.
const urlToSave = 'https://example.com/';
const outputDir = 'saved-website';

// Failures are reported on stderr instead of becoming an unhandled rejection.
saveWebpage(urlToSave, outputDir).then(undefined, (error) => {
  console.error('Error:', error);
});

相关问题