json 在Python中使用Selenium抓取Google图片

7z5jn7bk  于 2023-02-06  发布在  Python
关注(0)|答案(2)|浏览(139)

我一直在尝试使用以下代码抓取Google图片:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys 
import os
import time
import requests
import re
import urllib2
import re
from threading import Thread
import json
#Assuming I have a folder named Pictures1, the images are downloaded there. 
def threaded_func(url,i):
    """Download one image and save it inside the ``Pictures1`` folder.

    url: address of the image to fetch.
    i:   sequence number appended to the saved file's name.

    The file-name prefix comes from the module-level ``image_type``.
    The ``Pictures1`` folder must already exist.
    """
    raw_img = urllib2.urlopen(url).read()
    # Name the file after the index handed to this call rather than the
    # global counter, so the name cannot drift if the caller's counter moves.
    target = os.path.join("Pictures1", image_type + "_" + str(i))
    # The with-block closes the file even when the write fails.
    with open(target, 'wb') as f:
        f.write(raw_img)
# Start a Firefox session and open the Google Images home page.
driver = webdriver.Firefox()
driver.get("https://images.google.com/")
# The search box is located with an absolute XPath from the document root.
elem = driver.find_element_by_xpath('/html/body/div/div[3]/div[3]/form/div[2]/div[2]/div[1]/div[1]/div[3]/div/div/div[2]/div/input[1]')
elem.clear()
# Type the query and press Enter to submit the search.
elem.send_keys("parrot")
elem.send_keys(Keys.RETURN)
# File-name prefix that threaded_func reads when saving each image.
image_type = "parrot_defG"
# Not used anywhere in the visible snippet.
images=[]
# Running index handed to threaded_func for each image.
total=0
# Fixed 10-second pause before reading the page (presumably to let results load).
time.sleep(10)
# Every element with class "rg_meta" is expected to hold a JSON blob whose
# "ou" key is the image URL.
for a in driver.find_elements_by_class_name('rg_meta'):
     # This is the line the question is about: parsing a.text is what fails.
     link =json.loads(a.text)["ou"]
     thread = Thread(target = threaded_func, args = (link,total))
     # join() directly after start() waits for the download to finish, so the
     # images are fetched one at a time.
     thread.start()
     thread.join()
     total+=1

我试着用Selenium打开谷歌的图片结果页面,然后注意到每个结果都有一个类名为“rg_meta”的div,其内容是一段JSON。
我尝试使用.text访问它。JSON的“ou”键包含我想下载的图像的源地址(URL)。我尝试获取所有类名为“rg_meta”的div并下载图像。但它显示错误**”NO JSON OBJECT CAN BE DECODED”**,我不知道该怎么做。
编辑:这就是我要说的:

<div class="rg_meta">{"cl":3,"id":"FqCGaup9noXlMM:","isu":"kids.britannica.com","itg":false,"ity":"jpg","oh":600,"ou":"http://media.web.britannica.com/eb-media/89/89689-004-4C85E0F0.jpg","ow":380,"pt":"grain weevil -- Kids Encyclopedia | Children\u0026#39;s Homework Help ...","rid":"EusB0pk_sLg7vM","ru":"http://kids.britannica.com/comptons/art-143712/grain-or-granary-weevil","s":"grain weevil","sc":1,"st":"Kids Britannica","th":282,"tu":"https://encrypted-tbn2.gstatic.com/images?q\u003dtbn:ANd9GcQPbgXbRVzOicvPfBRtAkLOpJwy_wDQEC6a2q0BuTsUx-s0-h4b","tw":179}</div>

检查JSON的ou索引,请帮我提取。
原谅我的无知。
这就是我如何通过以下更新来解决这个问题:

# Read the raw innerHTML of every rg_meta <div> (instead of .text) and parse
# it as JSON; the "ou" key holds the image URL.
for a in driver.find_elements_by_xpath('//div[@class="rg_meta"]'):
        atext = a.get_attribute('innerHTML')
        link =json.loads(atext)["ou"]
        # Parenthesised form prints the same on Python 2 and also parses on Python 3.
        print(link)
        thread = Thread(target = threaded_func, args = (link,total))
        # join() directly after start() waits for the download to finish, so
        # the images are fetched one at a time.
        thread.start()
        thread.join()
        total+=1
qni6mghb

qni6mghb1#

将 driver.find_elements_by_class_name('rg_meta') 替换为 driver.find_element_by_xpath('//div[@class="rg_meta"]/text()'),
并将 a.text 替换为 a,
就能解决你的问题。
生成的代码:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys 
import os
import time
import requests
import re
import urllib2
import re
from threading import Thread
import json
#Assuming I have a folder named Pictures1, the images are downloaded there. 
def threaded_func(url,i):
    """Download one image and save it inside the ``Pictures1`` folder.

    url: address of the image to fetch.
    i:   sequence number appended to the saved file's name.

    The file-name prefix comes from the module-level ``image_type``.
    The ``Pictures1`` folder must already exist.
    """
    raw_img = urllib2.urlopen(url).read()
    # Name the file after the index handed to this call rather than the
    # global counter, so the name cannot drift if the caller's counter moves.
    target = os.path.join("Pictures1", image_type + "_" + str(i))
    # The with-block closes the file even when the write fails.
    with open(target, 'wb') as f:
        f.write(raw_img)
# Start a Firefox session and open the Google Images home page.
driver = webdriver.Firefox()
driver.get("https://images.google.com/")
# The search box is located with an absolute XPath from the document root.
elem = driver.find_element_by_xpath('/html/body/div/div[3]/div[3]/form/div[2]/div[2]/div[1]/div[1]/div[3]/div/div/div[2]/div/input[1]')
elem.clear()
# Type the query and press Enter to submit the search.
elem.send_keys("parrot")
elem.send_keys(Keys.RETURN)
# File-name prefix that threaded_func reads when saving each image.
image_type = "parrot_defG"
# Not used anywhere in the visible snippet.
images=[]
# Running index handed to threaded_func for each image.
total=0
# Fixed 10-second pause before reading the page (presumably to let results load).
time.sleep(10)
# NOTE(review): the loop iterates the result of find_element_by_xpath
# (singular) and the XPath ends in /text(). The asker's own working update on
# this page uses find_elements_by_xpath('//div[@class="rg_meta"]') together
# with get_attribute('innerHTML') instead - confirm this variant actually runs.
for a in driver.find_element_by_xpath('//div[@class="rg_meta"]/text()'):
     # The loop item itself is parsed as JSON; "ou" is the image URL.
     link =json.loads(a)["ou"]
     thread = Thread(target = threaded_func, args = (link,total))
     # join() directly after start() waits for the download to finish, so the
     # images are fetched one at a time.
     thread.start()
     thread.join()
     total+=1

打印link会导致:

http://media.web.britannica.com/eb-media/89/89689-004-4C85E0F0.jpg
lvjbypge

lvjbypge2#

由于使用selenium进行解析非常耗时,您可以使用BeautifulSoup web抓取库来实现相同的结果。
我将向您展示如何从内联JSON解析图像(包括全分辨率的图像),这样会快得多。
由于页面是动态呈现的,我们还需要使用regular expressions从内联JSON中提取数据。
首先,我们可以在页面源代码(Ctrl+U)中查找第一张图片的标题,以找到我们需要的匹配项,如果它们在<script>元素中,那么很可能是内联JSON,从那里我们可以提取数据。
接下来,我们通过选择包含所需数据的代码部分来缩小搜索范围:

# https://regex101.com/r/RPIbXK/1
# Collect the argument text of every AF_initDataCallback(...) call found in
# str(all_script_tags) and concatenate the pieces into a single string.
matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
    
# NOTE(review): json.dumps followed by json.loads on a str gives back an equal
# str, so this pair looks like a no-op - confirm before removing.
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)
      
# https://regex101.com/r/NRKEmV/1
# Keep only what lies between the "b-GRID_STATE0" marker and the trailing
# "sideChannel: {}}" text; re.findall returns the result as a list.
matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data_json)

之后,我们就可以从中找到缩略图和原始(全分辨率)图像的地址:

# https://regex101.com/r/SxwJsW/1
# Pull every encrypted-tbn0.gstatic.com URL that appears as ["<url>",<int>,<int>]
# in str(matched_google_image_data). The join/split pair turns the findall
# result into a list again (a single empty string when nothing matched).
matched_google_images_thumbnails = ", ".join(
    re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                       str(matched_google_image_data))).split(", ")
    
# Each URL is unicode-escape decoded twice (presumably because str() of the
# list doubled the backslashes of escapes already present in the page - verify).
thumbnails = [bytes(bytes(thumbnail, "ascii").decode("unicode-escape"), "ascii").decode("unicode-escape") for thumbnail in matched_google_images_thumbnails]
    
# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = re.sub(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', "", str(matched_google_image_data))
    
# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
# With the thumbnail entries gone, the remaining ["<url>",<int>,<int>] entries
# that follow a quote or comma are taken as the full-resolution images.
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

在联机IDE中检查完整代码。

import requests, re, json, lxml
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with the search request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
}

# One dict per image result, filled at the bottom of the script.
google_images = []

params = {
    "q": "cat",                # search query
    "tbm": "isch",             # image results
    "hl": "en",                # language of the search
    "gl": "us"                 # country where search comes from
}

# One thumbnail entry: ["https://encrypted-tbn0.gstatic.com/images?...",<int>,<int>].
# Compiled once because it is used both to extract the thumbnails and to
# strip them out again before the full-resolution match.
# https://regex101.com/r/SxwJsW/1
THUMBNAIL_ENTRY = re.compile(
    r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]'
)


def _unescape_twice(value):
    """Return *value* after two rounds of unicode-escape decoding.

    The URLs are matched against str() of a list, which renders each element
    with repr() and therefore doubles every backslash; one pass undoes that
    doubling and the second resolves the escapes themselves.
    Raises UnicodeEncodeError if *value* contains non-ASCII characters.
    """
    for _ in range(2):
        value = bytes(value, "ascii").decode("unicode-escape")
    return value


html = requests.get("https://www.google.co.in/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")

all_script_tags = soup.select("script")

# Argument text of every AF_initDataCallback(...) call, concatenated.
# https://regex101.com/r/RPIbXK/1
matched_images_data = "".join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

# The former json.dumps -> json.loads round trip returned the same string,
# so the pattern is applied to matched_images_data directly.
# https://regex101.com/r/NRKEmV/1
matched_google_image_data = re.findall(r'\"b-GRID_STATE0\"(.*)sideChannel:\s?{}}', matched_images_data)

# Both URL patterns below run against the str() form of the findall list;
# the full-resolution pattern relies on that exact representation.
grid_state_text = str(matched_google_image_data)

# findall already returns a list, so no join/split round trip is needed
# (that round trip produced [""] instead of [] when nothing matched).
thumbnails = [_unescape_twice(thumbnail) for thumbnail in THUMBNAIL_ENTRY.findall(grid_state_text)]

# removing previously matched thumbnails for easier full resolution image matches.
removed_matched_google_images_thumbnails = THUMBNAIL_ENTRY.sub("", grid_state_text)

# https://regex101.com/r/fXjfb1/4
# https://stackoverflow.com/a/19821774/15164646
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]", removed_matched_google_images_thumbnails)

full_res_images = [_unescape_twice(img) for img in matched_google_full_resolution_images]

# zip() stops at the shortest of the three sequences, so a result is only
# emitted when its metadata, thumbnail and original are all available.
for metadata, thumbnail, original in zip(soup.select('.isv-r.PNCib.MSM1fd.BUooTd'), thumbnails, full_res_images):
    # The same anchor element carries both the title and the link.
    title_link = metadata.select_one(".VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb")
    google_images.append({
        "title": title_link["title"],
        "link": title_link["href"],
        "source": metadata.select_one(".fxgdke").text,
        "thumbnail": thumbnail,
        "original": original
    })

print(json.dumps(google_images, indent=2, ensure_ascii=False))

输出示例

[
  
  {
    "title": "Domestic cat",
    "link": "https://www.nationalgeographic.com/animals/mammals/facts/domestic-cat",
    "source": "National Geographic",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRM50JpAefCcT9Alyo8H_v5iqsBbNB6A5CXD8Jgggs3UePzvzA9onL7ULKxBJ4JZxeWsSc&usqp=CAU",
    "original": "https://i.natgeofe.com/n/548467d8-c5f1-4551-9f58-6817a8d2c45e/NationalGeographic_2572187_square.jpg"
  },
  {
    "title": "cat - Wiktionary",
    "link": "https://en.wiktionary.org/wiki/cat",
    "source": "Wiktionary",
    "thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSKr5wT7rfkjkGvNeqgXjBmarC5ZNoZs-H2uMpML8O7Q4F9W-IlUQibBT6IPqyvX45NOgw&usqp=CAU",
    "original": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg"
  },
  other results ...
]

你也可以使用SerpApi的Google Images API,这是一个提供免费套餐的付费API,不同之处在于它会绕过Google的屏蔽(包括CAPTCHA),你不需要自己创建并维护解析器。
集成示例:

from serpapi import GoogleSearch
import os, json

# Unique "original" image URLs, in the order they were first seen.
image_results = []
# Same URLs as a set, so the duplicate check does not rescan the list.
seen_originals = set()

# search query parameters
params = {
    "engine": "google",               # search engine. Google, Bing, Yahoo, Naver, Baidu...
    "q": "cat",                       # search query
    "tbm": "isch",                    # image results
    "num": "100",                     # number of images per page
    "ijn": 0,                         # page number: 0 -> first page, 1 -> second...
    "api_key": "..."                  # serpapi key, https://serpapi.com/manage-api-key
                                      # other query parameters: hl (lang), gl (country), etc
}

search = GoogleSearch(params)         # where data extraction happens

# Fetch page after page until the response carries an "error" entry.
while True:
    results = search.get_dict()       # JSON -> Python dictionary

    # checks for "Google hasn't returned any results for this query."
    if "error" in results:
        print(results["error"])
        break

    for image in results["images_results"]:
        original = image["original"]
        if original not in seen_originals:
            seen_originals.add(original)
            image_results.append(original)

    # update to the next page
    # NOTE(review): this relies on `search` seeing the change made to `params`
    # (presumably GoogleSearch keeps a reference to the dict) - verify.
    params["ijn"] += 1

print(json.dumps(image_results, indent=2))

输出:

[
  "https://i.ytimg.com/vi/eX2qFMC8cFo/maxresdefault.jpg",
  "https://www.thesprucepets.com/thmb/QMJh_AV-7ThLXG3QFYkn7KXe8dQ=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/facts-about-tuxedo-cats-554704-hero-6fcf332fd5ee4d93aecc4bcd3657396c.jpg",
  "https://upload.wikimedia.org/wikipedia/en/thumb/8/82/Sylvester_the_Cat.svg/800px-Sylvester_the_Cat.svg.png",
  "https://www.purina.co.uk/sites/default/files/styles/square_medium_440x440/public/2022-06/Bengal.02.jpg?h=16d99826&itok=-mU4KnIM",
  "https://www.thesprucepets.com/thmb/5mrNYjDpNn0GnXD_crwBDGIijcs=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/persian-cats-gallery-4121944-hero-f5c237b8c6404655afb1e1bbae219ba5.jpg",
  "https://i.ytimg.com/vi/CPISexQPvQw/maxresdefault.jpg",
  "https://www.purina.co.uk/sites/default/files/2020-12/Cat%20Pregnancy%20Information%20%26%20AdviceTEASER.jpg",
  "https://www.thesprucepets.com/thmb/Re5vfPPvJhcRfTP22om9ivmFhJM=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc()/cat-meowing-57b738885f9b58cdfda9a497.jpg",
  other results ...
]

如果你需要更多的代码解释,有一篇Scrape and download Google Images with Python的博客文章。

相关问题