我试图从Instagram报废数据,并在一个API中提供它,所以我使用 puppet 和所有本地好东西,但在heroku我得到超时
这就是整个代码,有人有解决方案吗
const puppeteer = require("puppeteer");
const fs = require("fs");
const ig = require("instagram-url-dl");
const cheerio = require("cheerio");
const NodeCache = require("node-cache");
const myCache = new NodeCache({ stdTTL: 5 * 60 });
const express = require("express");
const app = express();
const doc = fs.readFileSync("./doc.html", "utf-8");
let browser;
let page;
const lanuchBrowser = async () => {
browser = await puppeteer.launch({
headless: true,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
page = await browser.newPage();
};
lanuchBrowser();
let url;
app.use(express.json());
app.get("/", (req, res) => {
res.status(200).send(doc);
});
app.post("/download-reel", async (req, res) => {
res.setHeader("Content-Type", "application/json");
res.writeHead(200);
try {
const { reel_url } = req.body;
url = reel_url;
const regex = /^https:\/\/www.instagram.com\/reel\//;
const valid = regex.test(reel_url);
if (!reel_url) {
throw new Error("Missing parameter: reel_url");
} else if (!valid) {
throw new Error("Invalid parameter: reel_url");
}
// Caching
const theRes = await myCache.get(reel_url);
if (theRes) {
return res.write(JSON.stringify(theRes));
}
res.write("");
// Get download link
const resp = await ig(reel_url);
// Get author, title, timestamp
await page.goto(reel_url, {
timeout: 60000,
waitUntil: "domcontentloaded",
});
console.log("1");
await page.waitForSelector("main", {
timeout: 0,
});
console.log("2");
const content = await page.content();
console.log("3");
let $ = cheerio.load(content);
const author = $(
"a.x1i10hfl.xjbqb8w.x6umtig.x1b1mbwd.xaqea5y.xav7gou.x9f619.x1ypdohk.xt0psk2.xe8uvvx.xdj266r.x11i5rnm.xat24cr.x1mh8g0r.xexx8yu.x4uap5.x18d9i69.xkhd6sd.x16tdsg8.x1hl2dhg.xggy1nq.x1a2a7pz._acan._acao._acat._acaw._aj1-._a6hd"
).text();
$ = cheerio.load($("li._a9zj._a9zl._a9z5").html());
console.log("4");
const timestamp = $("time").attr("datetime");
const title = $("h1._aacl._aaco._aacu._aacx._aad7._aade").text();
let resObj = {
sucess: true,
download_link: resp.data[0].url,
title,
author,
timestamp,
};
console.log("5");
res.write(JSON.stringify(resObj));
console.log("6");
myCache.set(reel_url, resObj);
} catch (err) {
console.log(err);
err.message == "Missing parameter: reel_url"
? err.message
: (err.message = "Invalid parameter: reel_url");
res.write(
JSON.stringify({
sucess: false,
error_message: err.message,
})
);
} finally {
res.end();
}
});
app.listen(process.env.PORT || 3000, () => {
console.log("Lanuch the app ....");
});
我尝试添加{timeout:0}我没有得到来自服务器的响应,所以我试图找出哪里是问题,所以我console.log编号后,每个操作,当我运行heroku日志--tail
我发现控制台里只有数字1是打印的
所以这行代码超时了await page.waitForSelector("main", { timeout: 0, });
2条答案
按热度按时间h7appiyu1#
你的问题很可能就在这里
您可以使用比
domContentLoaded
更好的waitUntil:networkidle0
来捕获页面中任何延迟加载的请求waitForSelector
会一直等到请求超时,所以不建议使用它,除非在某些情况下,延迟加载的选择器肯定会在页面上。相反,你可以这样做:这将在页面加载和网络调用完成后尝试查找选择器,如果没有找到,它将继续并完成请求的其余部分,而不是挂起直到超时。您可以处理选择器不存在的情况,以记录某些内容或重新导航到另一个页面。
kadbb4592#
我只是使用这个buildpack和它的工作
https://github.com/jontewks/heroku-buildpack-puppeteer-firefox