Python: how to get 100 images from Google Images

oxf4rvwz · posted 2022-10-30 in Python

I'm trying to fetch some images (say 100) from Google for each name in my list (liste_name), but my code only returns 20 of them and I don't know why.
Here is my code:

import os
import requests
from bs4 import BeautifulSoup

liste_name = ['blood orange','apple golden']

for name in liste_name:
    name_splited = name.split(" ")
    if len(name_splited) > 1:
        full_name = name_splited[0] + "_" + name_splited[1]
        path = "./Dataset/Trainset/" + full_name + "/"
        name = name_splited[0] + "%" + name_splited[1]

    url = "https://www.google.ch/search?site=webhp&tbm=isch&source=hp&q=" + \
          name + "&oq=" + name + "biw=1280&bih=579&num=100"

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    list_res_image = soup.find_all("img")

    if not os.path.exists(path):
        os.makedirs(path)
    for index, lien in enumerate(list_res_image):
        link = lien['src']
        test = False

        while not test:
            try:
                img = requests.get(link).content
                test = True
            except requests.exceptions.SSLError:
                pass
        with open(path + full_name + str(index) + ".png", "wb") as f:
            f.write(img)

uqxowvwt1#

Google limits you to a maximum of 20 images per request.
See here for more details.
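
One quick way to see this limit for yourself (a sketch of my own, assuming network access and that Google does not serve a consent or CAPTCHA page instead of results): count the img tags in the static HTML returned by a plain requests call. Even with num=100 in the query, the count stays around 20, because the remaining results are loaded by JavaScript.

import requests
from bs4 import BeautifulSoup

# plain, JavaScript-free request to Google Images
html = requests.get(
    "https://www.google.com/search",
    params={"q": "blood orange", "tbm": "isch", "num": 100},
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=30,
)
soup = BeautifulSoup(html.text, "html.parser")

# only the thumbnails embedded in the static HTML are visible here
print(len(soup.find_all("img")))   # typically ~20, regardless of num=100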


8gsdolmq2#

You can use selenium or playwright to grab all of the images. You can also use the "ijn" URL parameter to set the page number: 0 is the first page, 1 is the second, and so on; the parameter must be greater than or equal to 0.
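
As a rough sketch of the selenium route (my own illustration, not part of this answer and not tested against current Google markup; it assumes selenium 4+ with a Chrome driver available), scrolling the results page makes Google lazy-load far more than the initial ~20 thumbnails:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.google.com/search?q=blood+orange&tbm=isch")

# scroll a few times so the page lazy-loads more thumbnails
for _ in range(5):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

thumbnails = driver.find_elements(By.CSS_SELECTOR, "img")
print(len(thumbnails))   # usually well over 100 after scrolling
driver.quit()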
However, we can also do this with regular expressions and BeautifulSoup by parsing the inline JSON.
To avoid hard-coding one specific request URL, you can pass the parameters separately; their values can always be changed for later searches:


# this URL params is taken from the actual Google search URL
# and transformed to a more readable format
params = {
    "q": "blood orange",              # search query
    "tbm": "isch",                    # image results
    "hl": "en",                       # language of the search
    "gl": "us",                       # country where search comes from
}

With the help of regular expressions, we progressively filter the inline JSON down to the image results:


# https://regex101.com/r/kyLU8S/1
matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)

# https://regex101.com/r/GbVLOq/1
matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data_json)

# https://regex101.com/r/LzhCYM/1
matched_google_images_thumbnails = ", ".join(
    re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
               str(matched_google_image_data))).split(", ")

thumbnails = [bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails]

# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(
    r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', "", str(matched_google_image_data))

# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

Full code and an example in the online IDE:

import requests, re, json, lxml
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
  }

queries = ['blood orange','apple golden']
google_images = []
for query in queries:
    print(f'Extracting images for query: {query}')

    params = {
        "q": query,              # search query
        "tbm": "isch",           # image results
        "hl": "en",              # language of the search
        "gl": "us",              # country where search comes from
    }

    html = requests.get("https://google.com/search", params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, "lxml")

    all_script_tags = soup.select("script")

    # https://regex101.com/r/48UZhY/4
    matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # https://regex101.com/r/VPz7f2/1
    matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data_json)

    # https://regex101.com/r/Jt5BJW/1
    matched_google_images_thumbnails = ", ".join(
        re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                       str(matched_google_image_data))).split(", ")

    thumbnails = [bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails]

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(
            r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', "", str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

    full_res_images = [
            bytes(bytes(img, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for img in matched_google_full_resolution_images
    ]

    for index, (metadata, thumbnail, original) in enumerate(zip(soup.select('.isv-r.PNCib.MSM1fd.BUooTd'), thumbnails, full_res_images), start=1):
        google_images.append({
            "title": metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")["title"],
            "link": metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")["href"],
            "source": metadata.select_one(".fxgdke").text,
            "thumbnail": thumbnail,
            "original": original
        })

print(json.dumps(google_images, indent=2, ensure_ascii=False))

Example output:

[
  {
    "title": "Glazed Blood Orange and Lavender Loaf | Olive & Mango",
    "link": "https://www.oliveandmango.com/glazed-blood-orange-and-lavender-loaf",
    "source": "oliveandmango.com",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTT_92Ydz2t-niZ8bF7tExYqVSYzeLldzXQjg&usqp=CAU",
    "original": "https://d33wubrfki0l68.cloudfront.net/09a0f8357a7f0d667b7b20537b74886649cc35cc/9bb85/images/uploads/2019_02_09_glazed_blood_orange_and_lavender_loaf_3.jpg"
  },
  {
    "title": "Blood Orange Gin & Tonic – A Couple Cooks",
    "link": "https://www.acouplecooks.com/blood-orange-cocktail/",
    "source": "acouplecooks.com",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcScgVolM0b-ilf63TlcTSJTSpkV_3HX9iQh5Q&usqp=CAU",
    "original": "https://www.acouplecooks.com/wp-content/uploads/2021/01/Blood-Orange-Cocktail-001.jpg"
  },
  {
    "title": "Fresh Golden Delicious Apples - Shop Fruit at H-E-B",
    "link": "https://www.heb.com/product-detail/fresh-golden-delicious-apples/377503",
    "source": "heb.com",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT1y_FmZ56YcN6NeVyzT-TKHh54HgtByvSSFpcxVIBRjYB-l9HDaE_rMDrmKlI6IcvfTZs&usqp=CAU",
    "original": "https://images.heb.com/is/image/HEBGrocery/000377503"
  },
  {
    "title": "Golden Delicious - Wikipedia",
    "link": "https://en.wikipedia.org/wiki/Golden_Delicious",
    "source": "en.wikipedia.org",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSdSLrBTzdhPzJp-AbZftn8iTm-6OR_PFLSmqJqiZyfjsPGMB6lryZdb8tF3rYiwxmTJC0&usqp=CAU",
    "original": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0f/Golden_Delicious_apples.jpg/1200px-Golden_Delicious_apples.jpg"
  },
  # ...
]
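
Since the original question also wants the images saved to disk, here is a minimal follow-up sketch of my own. It assumes it runs right after the full code above (so google_images and headers already exist); the target folder and filenames are hypothetical:

import os

save_dir = "./Dataset/Trainset/blood_orange/"      # hypothetical path, mirroring the question
os.makedirs(save_dir, exist_ok=True)

for index, image in enumerate(google_images[:100]):
    try:
        img = requests.get(image["original"], headers=headers, timeout=30).content
    except requests.exceptions.RequestException:
        continue   # skip URLs that time out or refuse the connection
    with open(os.path.join(save_dir, f"blood_orange_{index}.jpg"), "wb") as f:
        f.write(img)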

You can also use SerpApi's Google Images API. It is a paid API with a free plan. The difference is that it bypasses Google's blocks (including CAPTCHA), so you don't have to build and maintain a parser.
An example of the integration:

from serpapi import GoogleSearch
import os, json

image_results = []

queries = ['blood orange','apple golden']
for query in queries:
    print(f'extracting images for query: {query}')

    # search query parameters
    params = {
        "engine": "google",               # search engine. Google, Bing, Yahoo, Naver, Baidu...
        "q": query,                       # search query
        "tbm": "isch",                    # image results
        "num": "100",                     # number of images per page
        "ijn": 0,                         # page number: 0 -> first page, 1 -> second...
        "api_key": os.getenv("API_KEY")   # your serpapi api key
                                          # other query parameters: hl (lang), gl (country), etc  
    }

    search = GoogleSearch(params)         # where data extraction happens

    images_is_present = True
    while images_is_present:
        results = search.get_dict()       # JSON -> Python dictionary

        # checks for "Google hasn't returned any results for this query."
        if "error" not in results:
            for image in results["images_results"]:
                if image["original"] not in image_results:
                    image_results.append(image["original"])

            # update to the next page
            params["ijn"] += 1
        else:
            print(results["error"])
            images_is_present = False

print(json.dumps(image_results, indent=2))

Output:

[
  "https://www.researchgate.net/publication/340952507/figure/fig1/AS:885003558846464@1588012699713/Apple-varieties-Red-Delicious-Granny-Smith-Golden-Delicious-respectively-Sekil-1.jpg",
  "https://goodfruitguide.co.uk/wp-content/uploads/Apple-Golden-Delicious-ZA-DSC_0021-cr-sq-300x300.jpg",
  "http://newenglandapples.files.wordpress.com/2011/12/img_6239.jpg",
  "https://i5.peapod.com/c/IY/IY47G.png",
  "https://cdn.shopify.com/s/files/1/1251/5173/products/goldendelicious_1024x1024.jpeg?v=1572074514",
  "https://www.gannett-cdn.com/-mm-/a5076e7a43a0cec6129489319d0fb728e2cd1814/c=0-264-5184-3193/local/-/media/2018/01/03/Phoenix/Phoenix/636505888078540454-opal-apples-8.JPG?width=660&height=373&fit=crop&format=pjpg&auto=webp",
  "https://cdn.shopify.com/s/files/1/0250/1384/6115/products/golden-reinette-apple-tree_800x.JPG?v=1565650598",
  "https://blogchef.net/wp-content/uploads/2022/04/golden-delicious-juicy-ripe-fresh-yellow-apples-brown-wooden-background-side-view-scaled.jpg",
  # ...
]

If you need a more detailed explanation of the code, see the Scrape and download Google Images with Python blog post.
Disclaimer: I work for SerpApi.
