javascript Puppeteer抓取div标签元素返回一些空值

fiei3ece  于 2023-06-04  发布在  Java
关注(0)|答案(1)|浏览(98)

我试图从ebay上抓取产品,然后在amazon上打开它们,从搜索结果中获得asins。
到目前为止,脚本已经能根据遍历的 eBay 标题列表抓取到一部分 ASIN,但结果中会随机出现很多空值。我猜这可能与页面是否已经完全渲染有关。我尝试过添加等待,但没有效果。

const puppeteer = require('puppeteer');

// Amazon storefront on which every eBay title is searched.
const URL = "https://www.amazon.co.uk/";

// eBay seller listing page the product titles are read from.
const EBAY_LISTING_URL = 'https://www.ebay.co.uk/sch/jmp_supplies/m.html?_trkparms=folent%3Ajmp_supplies%7Cfolenttp%3A1&rt=nc&_trksid=p2046732.m1684';

// Browser identity applied to every tab before its first navigation.
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36";

// How many ASINs are printed per searched title.
const MAX_ASINS_PER_TITLE = 5;

// How long to wait for Amazon search results before treating the search as empty.
const RESULTS_TIMEOUT_MS = 10000;

const selectors = {
  searchBox: '#twotabsearchtextbox',
  productLinks: 'span.a-size-base-plus.a-color-base.a-text-normal',
  productTitle: '#productTitle',
  ebayTitles: '#e1-11 > #ResultSetItems > #ListViewInner > li > .lvtitle > .vip',
  // Deliberately not tied to the grid/column layout classes: requiring them
  // presumably skipped results rendered with a different layout, which is a
  // likely source of the empty arrays (verify against the live markup).
  amazonResults: 'div.s-result-item[data-asin]',
};

let browser;

/**
 * Opens a new tab with the language header and user agent already applied,
 * so that they are in effect for the tab's very first request.
 *
 * @returns {Promise<object>} The configured Puppeteer page.
 */
async function openPage() {
  const page = await browser.newPage();
  await page.setExtraHTTPHeaders({ "Accept-Language": "en-US,en;q=0.9" });
  await page.setUserAgent(USER_AGENT);
  return page;
}

/**
 * Reads the product titles from the eBay listing page.
 *
 * @returns {Promise<string[]>} Inner text of every matched title link.
 */
async function grabEbayTitles() {
  const page = await openPage();
  try {
    await page.goto(EBAY_LISTING_URL);
    return await page.$$eval(selectors.ebayTitles, (tags) =>
      tags.map((tag) => tag.innerText));
  } finally {
    await page.close();
  }
}

/**
 * Searches Amazon for a title and collects the ASINs of the result items.
 *
 * @param {string} title - Product title to type into the Amazon search box.
 * @returns {Promise<string[]>} Non-empty `data-asin` values in page order;
 *   an empty array when no result item appears within RESULTS_TIMEOUT_MS.
 */
async function findAsins(title) {
  const page = await openPage();
  try {
    await page.goto(URL);
    await page.waitForSelector(selectors.searchBox);
    await page.type(selectors.searchBox, title);

    // Register the navigation wait BEFORE pressing Enter. Awaiting the key
    // press first lets the navigation start (or even finish) before anyone
    // is listening for it.
    await Promise.all([
      page.waitForNavigation(),
      page.keyboard.press("Enter"),
    ]);

    // The load event does not guarantee the result list is in the DOM yet.
    try {
      await page.waitForSelector(selectors.amazonResults, { timeout: RESULTS_TIMEOUT_MS });
    } catch (err) {
      if (err.name !== 'TimeoutError') {
        throw err;
      }
      return [];
    }

    const asins = await page.$$eval(selectors.amazonResults, (items) =>
      items.map((item) => item.getAttribute("data-asin")));

    // Placeholder/ad containers carry an empty data-asin; drop them.
    return asins.filter(Boolean);
  } finally {
    // Close the tab even on failure so tabs do not pile up, one per title.
    await page.close();
  }
}

(async () => {
  browser = await puppeteer.launch({ headless: false });

  const titles = await grabEbayTitles();

  for (const title of titles) {
    try {
      const asins = await findAsins(title);
      console.log(asins.slice(0, MAX_ASINS_PER_TITLE));
    } catch (err) {
      // One failed lookup should not abort the remaining titles.
      console.error(`Failed to look up "${title}":`, err);
    }
  }
})()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());
icnyk63a

icnyk63a1#

针对你的需求,可以使用 NPM 包 ecommerce-scraper-js,它能让你用更少的代码完成所需的工作:

import { amazon, ebay } from "ecommerce-scraper-js";

// Looks up eBay listings for a query, searches Amazon for each listing title,
// and prints the ASINs extracted from the Amazon product links.
(async () => {
  // get 10 results for "playstation 5" from ebay
  const ebayProducts = await ebay.getListings("playstation 5", 10);

  // Captures the 10 non-whitespace characters that follow "/dp/" in a product
  // link. Only the first occurrence per link is used, so no global flag is needed.
  const asinPattern = /\/dp\/(?<asin>\S{10})/;

  // iterate over received products
  for (const { title } of ebayProducts) {
    // get 10 results for received title from amazon
    const amazonProducts = await amazon.getListings(title, 10);

    // get asins from received products, skipping links without a "/dp/" segment
    const asins = amazonProducts
      .map((el) => el.link.match(asinPattern)?.groups.asin)
      .filter(Boolean);

    console.log(asins);
  }
})().catch((err) => {
  // Report the failure explicitly instead of leaving the promise floating.
  console.error(err);
  process.exitCode = 1;
});

输出示例:

[
  "B0BP1VJ7ML",
  "B0BNC97L3J",
  "B0BNL1YG5Q",
  "B0BN8FHB5K",
  "B0BNBPWMD5",
  "B0BHC395WW",
  "B09DFCB66S",
  "B0BBWPFRHV",
  "B0B8T6TV38",
  "B0B5B2B2S1"
]

你可以在他们的文档中看到更多的用例(带例子)。

相关问题