:GetFileList
Rem as posted to https://stackoverflow.com/questions/42878196
for /F "eol=;" %%f in (filelist.txt) do curl -O %%f
exit /b 0

import pandas as pd
import requests

urls = pd.read_csv('cat_urls.csv')  # load the URL list into a DataFrame
rows = []
for index, i in urls.iterrows():
    rows.append(i.iloc[-1])  # the URL sits in the last column of each row

counter = 0
for i in rows:
    file_name = 'cat' + str(counter) + '.jpg'
    print(file_name)
    response = requests.get(i)
    with open(file_name, "wb") as file:  # write the image bytes to disk
        file.write(response.content)
    counter += 1

import os
import time
import sys
import urllib
from progressbar import ProgressBar

if sys.version_info >= (3, 0):
    # Pre-load the Python 3 submodules so urllib.request and urllib.error
    # are available everywhere in this script.
    import urllib.request
    import urllib.error

def get_raw_html(url):
    version = (3, 0)
    curr_version = sys.version_info
    if curr_version >= version:  # the current version of Python is 3.0 or above
        import urllib.request  # urllib library for fetching web pages
        try:
            headers = {}
            headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
            request = urllib.request.Request(url, headers=headers)
            resp = urllib.request.urlopen(request)
            respData = str(resp.read())
            return respData
        except Exception as e:
            print(str(e))
    else:  # the current version of Python is 2.x
        import ssl
        import urllib2
        from urllib2 import URLError
        try:
            headers = {}
            headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
            request = urllib2.Request(url, headers=headers)
            try:
                response = urllib2.urlopen(request)
            except URLError:  # handle an SSL certificate failure
                context = ssl._create_unverified_context()
                response = urllib2.urlopen(request, context=context)
            raw_html = response.read()
            return raw_html
        except Exception:
            return "Page Not found"

def next_link(s):
    start_line = s.find('rg_di')
    if start_line == -1:  # no links were found, so report that
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        start_line = s.find('"class="rg_meta"')
        start_content = s.find('"ou"', start_line + 1)
        end_content = s.find(',"ow"', start_content + 1)
        content_raw = str(s[start_content + 6:end_content - 1])
        return content_raw, end_content

def all_links(page):
    links = []
    while True:
        link, end_content = next_link(page)
        if link == "no_links":
            break
        else:
            links.append(link)  # append every link to the list named 'links'
            # time.sleep(0.1)  # a timer could slow down the image-download requests
            page = page[end_content:]
    return links

def download_images(links, search_keyword):
    choice = input("Do you want to save the links? [y]/[n]: ")
    if choice == 'y' or choice == 'Y':
        # write all the links into a text file
        f = open('links.txt', 'a')  # open the text file called links.txt
        for link in links:
            f.write(str(link))
            f.write("\n")
        f.close()  # close the file
    num = input("Enter number of images to download (max 100): ")
    counter = 1
    errors = 0
    search_keyword = search_keyword.replace("%20", "_")
    directory = search_keyword + '/'
    if not os.path.isdir(directory):
        os.makedirs(directory)
    pbar = ProgressBar()
    for link in pbar(links):
        if counter <= int(num):
            file_extension = link.split(".")[-1]
            filename = directory + str(counter) + "." + file_extension
            # print("Downloading image: " + str(counter) + '/' + str(num))
            try:
                urllib.request.urlretrieve(link, filename)
            # HTTPError/URLError must be caught before the broader IOError,
            # since HTTPError is a subclass of IOError in Python 3.
            except urllib.error.HTTPError:
                errors += 1
                # print("\nHTTPError on image " + str(counter))
            except urllib.error.URLError:
                errors += 1
                # print("\nURLError on image " + str(counter))
            except IOError:
                errors += 1
                # print("\nIOError on image " + str(counter))
            counter += 1
    return errors

def search():
    version = (3, 0)
    curr_version = sys.version_info
    if curr_version >= version:  # the current version of Python is 3.0 or above
        import urllib.request  # urllib library for fetching web pages
    else:
        import urllib2  # the current version of Python is 2.x
    search_keyword = input("Enter the search query: ")
    # download the image links
    links = []
    search_keyword = search_keyword.replace(" ", "%20")
    url = 'https://www.google.com/search?q=' + search_keyword + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
    raw_html = get_raw_html(url)
    links = links + all_links(raw_html)
    print("Total Image Links = " + str(len(links)))
    print("\n")
    errors = download_images(links, search_keyword)
    print("Download Complete.\n" + str(errors) + " errors while downloading.")

search()
6 Answers

whlutmcx1#
cd into the folder you want the images saved to, then run:

wget -i images.txt
pgky5nke2#
On Windows 10/11 this is fairly trivial using curl; see the :GetFileList batch snippet at the top of this page.

Note that including eol=; lets us mask individual exclusions by adding a ; at the start of any lines in filelist.txt that we do not want; an example follows this paragraph. Thus on my system I simply type and enter Do GetFileList and all the stored URLs are downloaded, where Do is an old DOS trick for holding many small commands in one self-editing batch file. Nowadays I use CMD, where Do Edit invokes Notepad "%~f0" so that sections like the one above can be pasted in. (Part of Do.bat.)
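For example, a filelist.txt using that convention might look like the following (placeholder URLs); the for /F loop skips the line prefixed with ;:

https://example.com/images/photo1.jpg
;https://example.com/images/photo2.jpg
https://example.com/images/photo3.jpg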
Windows 7 has an FTP command, but it often pops up a firewall dialog that requires a user-authorization response.

If you are currently running Windows 7 and want to download a list of URLs without downloading wget.exe or other dependencies such as curl.exe (which would otherwise be the simplest one-line command), the shortest compatible method is a PowerShell command (not my favorite for speed, but needs must).

The file holding the URLs is filelist.txt, and IWR (Invoke-WebRequest) is the closest PowerShell equivalent of wget. The SecurityProtocol command runs first to ensure the modern TLS 1.2 protocol is used. -OutF ... split-path ... means each file is saved under its remote file name, but into the CWD (current working directory); cd /d folder can be used for scripting if needed. To run it from CMD, use slightly different quoting around 'Tls12'; a reconstruction of the command is below.
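A sketch of that command, reconstructed from the description above (run inside PowerShell; the original's exact quoting may differ):

[Net.ServicePointManager]::SecurityProtocol = 'Tls12'
Get-Content filelist.txt | ForEach-Object { Invoke-WebRequest $_ -OutFile (Split-Path $_ -Leaf) }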
wa7juj8i3#

This needs to be made into a function with error handling, but it repeatedly downloads images for an image-classification project; see the pandas/requests snippet near the top of this page. A sketch of that refactor follows.
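A minimal sketch of the suggested refactor, assuming the same cat_urls.csv layout; the function name, timeout, and error counter are illustrative additions:

import pandas as pd
import requests

def download_cat_images(csv_path='cat_urls.csv', prefix='cat'):
    # Download every URL in the CSV's last column; count failures instead of crashing.
    urls = pd.read_csv(csv_path)
    errors = 0
    for counter, (_, row) in enumerate(urls.iterrows()):
        url = row.iloc[-1]  # the URL sits in the last column
        file_name = prefix + str(counter) + '.jpg'
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()  # treat HTTP 4xx/5xx responses as errors
        except requests.RequestException:
            errors += 1
            continue
        with open(file_name, 'wb') as f:
            f.write(response.content)
    return errors

print(str(download_cat_images()) + ' downloads failed.')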
9lowa7mx4#
falq053o5#
In this python project I run a search on unsplash.com, which gives me a list of URLs; I then save some of them (the number is predefined by the user) into a predefined folder. A rough sketch of that flow follows.
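A rough sketch of that flow, assuming the official Unsplash search API (https://api.unsplash.com/search/photos); the access key, function name, and file naming here are illustrative assumptions, not taken from the original project:

import os
import requests

UNSPLASH_ACCESS_KEY = 'your-access-key'  # hypothetical; obtain a real key at unsplash.com/developers

def fetch_unsplash_images(query, count, folder):
    # Search Unsplash and save the first `count` results into `folder`.
    os.makedirs(folder, exist_ok=True)
    resp = requests.get(
        'https://api.unsplash.com/search/photos',
        params={'query': query, 'per_page': count, 'client_id': UNSPLASH_ACCESS_KEY},
        timeout=30,
    )
    resp.raise_for_status()
    for i, result in enumerate(resp.json()['results'][:count]):
        image = requests.get(result['urls']['small'], timeout=30)
        with open(os.path.join(folder, query + '_' + str(i) + '.jpg'), 'wb') as f:
            f.write(image.content)

fetch_unsplash_images('cats', 5, 'cat_images')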
ssm49v7z6#
On Windows:

install wget - https://sourceforge.net/projects/gnuwin32/files/wget/1.11.4-1/

and add C:\Program Files (x86)\GnuWin32\bin to your environment PATH. Create a folder containing a txt file that lists all the images you want to download. Type cmd in the location bar at the top of File Explorer, and when the command prompt opens, enter the following:

wget -i images.txt --no-check-certificate