我希望使用Chrome无头浏览器实现与“另存为”相同的功能,以保存完整的网页及其所有资源。有没有办法做到这一点?我已经尝试过--print-to-pdf和--screenshot选项,现在想探索“另存为”这种方式。
agxfikkp1#
这是完全可能的,但并不容易。你必须自己做繁重的工作。这意味着:1. 将所有链接的资源保存到本地目录。2. 将所有指向这些资源的链接重写为相对路径。3. 将重写后的HTML文件保存到同一本地目录。下面是一个使用Playwright的例子。(注意,这段代码是从现有项目中截取的,并为这个答案进行了清理。它可能无法完美地工作。)
const { webkit } = require('playwright');
const { parse } = require('node-html-parser');
const fs = require('fs-extra');
const path = require('path');
const url = require('url');

/**
 * Map an absolute http(s) asset URL to the path (relative to the output
 * directory, always with forward slashes) under which it is stored locally.
 *
 * URLs whose path ends in "/" denote a directory-style resource, so they are
 * stored as "<dir>/index.html" to avoid writing a file where a directory
 * is needed. The query string is intentionally ignored.
 *
 * @param {URL} assetUrl - Parsed absolute URL of the asset.
 * @returns {string} Relative POSIX-style path such as "example.com/css/a.css".
 */
function toLocalRelativePath(assetUrl) {
  const pathname = assetUrl.pathname.endsWith('/')
    ? `${assetUrl.pathname}index.html`
    : assetUrl.pathname;
  return path.posix.join(assetUrl.host, pathname);
}

/**
 * Save a web page and the resources it references to a local directory.
 *
 * The page is loaded in a headless WebKit instance, every `src` / `href`
 * attribute pointing at an http(s) resource is rewritten to a relative local
 * path, the rewritten HTML is written to `<outputDir>/index.html`, and each
 * referenced resource is downloaded to `<outputDir>/<host>/<path>`.
 * Individual asset failures are logged and do not abort the save.
 *
 * @param {string} urlToSave - Absolute URL of the page to save.
 * @param {string} outputDir - Directory that receives index.html and assets.
 * @returns {Promise<void>} Resolves once the HTML and all assets are settled.
 */
async function saveWebpage(urlToSave, outputDir) {
  const browser = await webkit.launch();
  try {
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(urlToSave);
    const html = await page.content();
    // The navigation may have been redirected; relative links resolve
    // against the final document URL, not the one that was requested.
    const documentUrl = page.url();

    const parsedHtml = parse(html);
    const baseTag = parsedHtml.querySelector('base');
    const baseHref = baseTag ? baseTag.getAttribute('href') : undefined;
    let baseUrl = documentUrl;
    if (baseHref) {
      try {
        // A <base href> may itself be relative to the document URL.
        baseUrl = new url.URL(baseHref, documentUrl).href;
      } catch {
        baseUrl = documentUrl;
      }
      // Leaving <base href> in place would make the browser resolve the
      // rewritten relative links against the remote site again.
      baseTag.removeAttribute('href');
    }

    const assetUrls = new Set();
    const assetDownloadPromises = [];

    // Download one asset through the browser context's request API (shares
    // cookies with the page, opens no extra tab). Returns undefined on failure.
    async function fetchAsset(originalUrl) {
      try {
        const response = await context.request.get(originalUrl);
        if (!response.ok()) {
          console.error(`Error fetching asset: ${originalUrl} - HTTP ${response.status()}`);
          return undefined;
        }
        return await response.body();
      } catch (error) {
        console.error(`Error fetching asset: ${originalUrl} - ${error.message}`);
        return undefined;
      }
    }

    // Rewrite every occurrence of the given attribute to a local relative
    // path and queue the referenced resource for download (once per URL).
    function processAttribute(attributeName) {
      for (const element of parsedHtml.querySelectorAll(`[${attributeName}]`)) {
        const originalUrl = element.getAttribute(attributeName);
        // Nothing to download for empty values or same-document anchors.
        if (!originalUrl || originalUrl.startsWith('#')) continue;

        let assetUrl;
        try {
          assetUrl = new url.URL(originalUrl, baseUrl);
        } catch {
          continue; // Unparseable value: leave the attribute untouched.
        }
        // Skips data:, mailto:, javascript:, tel:, blob: and similar schemes.
        if (assetUrl.protocol !== 'http:' && assetUrl.protocol !== 'https:') continue;

        // The fragment is irrelevant for fetching but must survive in the link.
        const fragment = assetUrl.hash;
        assetUrl.hash = '';
        const absoluteUrl = assetUrl.href;

        const relativePath = toLocalRelativePath(assetUrl);
        const localPath = path.join(outputDir, relativePath);
        element.setAttribute(attributeName, `${relativePath}${fragment}`);

        if (!assetUrls.has(absoluteUrl)) {
          assetUrls.add(absoluteUrl);
          assetDownloadPromises.push(
            fetchAsset(absoluteUrl)
              .then((buffer) => buffer && fs.outputFile(localPath, buffer))
              .catch((error) => {
                console.error(`Error saving asset: ${absoluteUrl} - ${error.message}`);
              })
          );
        }
      }
    }

    processAttribute('src');
    processAttribute('href');

    await fs.outputFile(path.join(outputDir, 'index.html'), parsedHtml.toString());

    // Best effort: every download settles before the browser is closed.
    await Promise.allSettled(assetDownloadPromises);
  } finally {
    // Always release the browser, even when navigation or writing fails.
    await browser.close();
  }
}

const urlToSave = 'https://example.com/';
const outputDir = 'saved-website';
saveWebpage(urlToSave, outputDir).catch((error) => console.error('Error:', error));
1条答案
按热度按时间agxfikkp1#
这是完全可能的,但并不容易。你必须自己做繁重的工作。这意味着:
1. 将所有链接的资源保存到本地目录。
2. 将所有指向这些资源的链接重写为相对路径。
3. 将重写后的HTML文件保存到同一本地目录。
下面是一个使用Playwright的例子。(注意,这段代码是从现有项目中截取的,并为这个答案进行了清理。它可能无法完美地工作。)