python-3.x 从公共可用数据获取的网页抓取卫星图像

import re
import requests
from bs4 import BeautifulSoup

webpage = 'https://xgis.maaamet.ee/xgis2/page/app/ristipuud'

----------

# Local import: used to resolve relative <img src> values against the page URL.
from urllib.parse import urljoin

# Fetch the page stored in `webpage` (the original referenced an undefined
# name `site`).
response = requests.get(webpage)

# Parse the HTML and collect every <img> tag. The parsed document is bound to
# `soup`, the name the following lookup uses.
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')

# Skip <img> tags without a src attribute instead of raising KeyError.
urls = [img['src'] for img in img_tags if img.get('src')]

# Only URLs ending in "/<name>.<jpg|gif|tif|png>" are downloaded; group(1) is
# the file name used on disk.
filename_pattern = re.compile(r'/([\w_-]+[.](jpg|gif|tif|png))$')

for url in urls:
    filename = filename_pattern.search(url)
    if not filename:
        print("didn't match with the url: {}".format(url))
        continue
    # urljoin leaves absolute URLs untouched and resolves relative ones
    # against the page URL (plain concatenation produced invalid addresses).
    url = urljoin(webpage, url)
    response = requests.get(url)
    with open(filename.group(1), 'wb') as f:
        f.write(response.content)

# 立陶宛的代码

import time
import requests
from bs4 import BeautifulSoup
import os

def download_url(url, save_path, chunk_size=128):
    """Stream the resource at *url* into the file *save_path*.

    The HTTP status code is not inspected: whatever body the server returns
    is written to disk.

    Args:
        url: Address passed to ``requests.get``.
        save_path: Path of the file to write (opened in binary write mode, so
            an existing file is overwritten).
        chunk_size: Number of bytes requested per ``iter_content`` chunk.
    """
    # Using the streamed response as a context manager guarantees it is
    # closed even if opening or writing the file fails part-way through.
    with requests.get(url, stream=True) as r:
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)

def get_file_name(url):
    """Return the last path segment of *url* with any query string removed.

    The text after the final "/" is taken and cut at the first "?". When the
    URL contains no "/", the whole string is treated as the last segment.
    """
    last_segment = url.rpartition("/")[2]
    return last_segment.partition("?")[0]

# Start timer
start_time = time.time()
print("Start time: ", start_time)

# Create the image directory (no error if it already exists)
image_directory = 'images'
os.makedirs(image_directory, exist_ok=True)

template = "https://www.geoportal.lt/map/webapp/rest/mapgateway/6100e156c755e15f6e46a8820824d8c595d30ae51?f=json"

response = requests.get(template)
if response.status_code == 200:
    # NOTE(review): the URL requests f=json, yet the body is parsed as HTML
    # and searched for an <a> tag - confirm the endpoint really returns
    # anchors, otherwise nothing will ever be downloaded.
    soup = BeautifulSoup(response.content, "html.parser")
    link = soup.find("a")
    if link is not None:
        url = 'https://www.geoportal.lt/' + link['href']
        file_name = get_file_name(url)
        print(file_name)
        # Save the linked file inside the image directory
        download_url(url, './' + image_directory + '/' + file_name)

# End timer
end_time = time.time()

# Calculate elapsed time (the original line carried trailing markdown image
# residue that made it a syntax error)
elapsed_time = end_time - start_time
print("Elapsed time: ", elapsed_time)

Link: https://www.geoportal.lt/map/index.jsp?lang=en
我想从这个网站下载卫星图像（链接：https://xgis.maaamet.ee/xgis2/page/app/ristipuud ）。网站上大约有 6000 张 tif 格式的卫星图像，我想获取其中的 500 张用于我的研究。由于需要经常重复同样的操作，我想通过网页抓取来完成。但问题是：运行这段代码时没有报任何错误，却也没有下载任何数据。网站上的图像被划分为图块（tile），可以在此链接 https://geoportaal.maaamet.ee/eng/Maps-and-Data/Orthophotos/Download-Orthophotos-p662.html 按图块编号搜索后单独下载。RGB 正射影像（Orthophoto）以包含 .tif 文件的 zip 压缩包形式提供。同一图块按年份有多个版本，我想获取最新的版本。但遗憾的是，我的代码不起作用。请帮我找出代码中的错误，或分享您的经验。我是编程新手，正在努力学习。

此代码可以下载压缩的地图文件。

import time
import requests
from bs4 import BeautifulSoup
import os

def download_url(url, save_path, chunk_size=128):
    """Stream the resource at *url* into the file *save_path*.

    The HTTP status code is not inspected: whatever body the server returns
    is written to disk.

    Args:
        url: Address passed to ``requests.get``.
        save_path: Path of the file to write (opened in binary write mode, so
            an existing file is overwritten).
        chunk_size: Number of bytes requested per ``iter_content`` chunk.
    """
    # Using the streamed response as a context manager guarantees it is
    # closed even if opening or writing the file fails part-way through.
    with requests.get(url, stream=True) as r:
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)

def get_file_name(url):
    """Return the value of the ``f=`` field found in *url*.

    The URL is split on "&"; the first piece that begins with ``f=`` supplies
    the result (everything after that prefix). An empty string is returned
    when no piece starts with ``f=``.
    """
    prefix = 'f='
    values = (
        piece[len(prefix):]
        for piece in url.split("&")
        if piece.startswith(prefix)
    )
    return next(values, '')

# Start timer
start_time = time.time()
print("Start time: ", start_time)

# Create the image directory (no error if it already exists)
image_directory = 'images'
os.makedirs(image_directory, exist_ok=True)

# Map-sheet numbers to scan; range() stops before end_sheet.
start_sheet = 44744
end_sheet = 44844  # test range of 100 sheets; change to 74331 for the full range
total_download = 0

# Search URL with a placeholder for the sheet number. It never changes, so it
# is built once outside the loop.
template = "https://geoportaal.maaamet.ee/index.php?lang_id=2&plugin_act=otsing&page_id=662&&kaardiruut={sheet_number:n}&andmetyyp=ortofoto_eesti_rgb"

for index in range(start_sheet, end_sheet):
    webpage = template.format(sheet_number=index)
    response = requests.get(webpage)
    if response.status_code != 200:
        continue
    soup = BeautifulSoup(response.content, "html.parser")
    link = soup.find("a")
    if link is None:
        continue
    url = 'https://geoportaal.maaamet.ee/' + link['href']
    file_name = get_file_name(url)
    if not file_name:
        # The link carries no f= field, so there is no name to save under;
        # opening the bare directory path for writing would raise.
        continue
    print(file_name)
    # Save the zip file, reusing the name computed above
    download_url(url, './' + image_directory + '/' + file_name)
    total_download += 1

# End timer
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print("Elapsed time: ", elapsed_time)
print("Total Download zip files: ", total_download)

完成后的结果

如果解压缩，您可以看到geoTIFF文件。

主旨
当你指向这个URL时

https://geoportaal.maaamet.ee/eng/Maps-and-Data/Orthophotos/Download-Orthophotos-p662.html

指示了图纸编号范围

Map sheet numbers of 1:10000 scale are between 44744 to 74331.

在Chrome（或Firefox）中，如果按F12键，则会显示Dev Tool。
在“网络（Network）”选项卡的 Headers 子选项卡中可以看到 HTTPS 请求。
保持该面板打开，点击页面上的搜索按钮，按图幅编号（44744）搜索，即可看到请求 URL。
这是模板URL。

https://geoportaal.maaamet.ee/index.php?lang_id=2&plugin_act=otsing&page_id=662&&kaardiruut=44744&andmetyyp=ortofoto_eesti_rgb&_=1686945341505

kaardiruut 参数是切换图幅编号的关键。

kaardiruut=44744

下载程序通过递增该数值来切换到另一个区域。

更新：立陶宛正射影像（Orthophoto）2021 - 2023

立陶宛的地图不支持 zip 下载，但支持直接下载地图图片。
该地图服务器是瓦片地图（tiled map）的一个很好的示例：

https://www.maptiler.com/google-maps-coordinates-tile-bounds-projection/#10/24.70/56.21

`https://www.geoportal.lt/map`

"https://www.geoportal.lt/map/webapp/rest/mapgateway/{year_id:s}/tile/{scale:n}/{y:n}/{x:n}"

year_id示例

Ortophoto 2021 - 2023是'6100e156c755e15f6e46a8820824d8c595d30ae50'
Ortophoto 2018~2020是'8ddf422a20f8a22fd7c116ef7d6a46eec4126d521'

scale = 8 # (1 : 10 000)

2021 - 2023年x（经度）范围

start_x = min number 8263
end_x = max number 8510

2021 - 2023年y（纬度）范围

start_y = min number 5524
end_y = max number 5839
演示代码

import time
import requests
import os
import requests, imghdr

def download_url(image_url, save_path):
    """Download one image and save it as ``save_path + '.' + <detected type>``.

    Args:
        image_url: URL passed to ``requests.get``.
        save_path: Destination path without extension; the extension is the
            image type reported by ``imghdr.what`` for the response body.

    Returns:
        True when the response status is 200 and the body was recognised as
        an image and written to disk; False otherwise.
    """
    # copy from Chrome's Network Tab/Headers/Request Headers/Cookie
    cookies = {'JSESSIONID_MWEB': '26F90E44851C5CC9CD41E7A1AE056C54;'}
    response = requests.get(url=image_url, cookies=cookies)
    if response.status_code != 200:
        return False
    # NOTE: imghdr is deprecated since Python 3.11 and removed in 3.13.
    extension = imghdr.what(file=None, h=response.content)
    if extension is None:
        # The body is not a recognised image type. Concatenating None into
        # the path used to raise TypeError, so report failure instead.
        return False
    target_path = save_path + '.' + extension
    print(target_path)
    with open(target_path, 'wb') as handler:
        handler.write(response.content)
    return True

def get_file_name(url):
    """Return the text after the last "/" in *url*.

    Raises:
        IndexError: if *url* contains no "/".
    """
    pieces = url.rsplit('/', maxsplit=1)
    return pieces[1]

def get_directory_name(url):
    """Build "<third-from-last>/<second-from-last>" from the segments of *url*.

    For a tile URL shaped ``.../tile/{scale}/{y}/{x}`` this yields
    ``"{scale}/{y}"``.
    """
    # Each piece comes from its own right-split so that short URLs behave
    # exactly as they always have.
    second_from_last = url.rsplit('/', 2)[1]
    third_from_last = url.rsplit('/', 3)[1]
    return '/'.join((third_from_last, second_from_last))

def create_directory_name(directory_name):
    """Create *directory_name*, including missing parents, unless the path exists."""
    if os.path.exists(directory_name):
        # Something already lives at this path; leave it untouched.
        return
    os.makedirs(directory_name)

# Tile column range to fetch; range() stops before end_x.
start_x = 8330 # min number 8263
end_x = 8334   # max number 8510

# Tile row range to fetch; range() stops before end_y.
start_y = 5635 # min number 5524
end_y = 5640   # max number 5839

total_download = 0

year_id = '6100e156c755e15f6e46a8820824d8c595d30ae50' # Ortophoto 2021-2023
scale = 8 # (1 : 10 000)

# Tile URL layout: ~/{year_id}/tile/{scale}/{y}/{x}
# Built once here because it does not change from tile to tile.
template = "https://www.geoportal.lt/map/webapp/rest/mapgateway/{year_id:s}/tile/{scale:n}/{y:n}/{x:n}"

# Start timer
start_time = time.time()
print("Start time: ", start_time)

for x_number in range(start_x, end_x):
    for y_number in range(start_y, end_y):
        url = template.format(year_id=year_id, scale=scale, y=y_number, x=x_number)
        # Tiles are stored under ./<scale>/<y>/ and named after the x segment
        directory = './' + get_directory_name(url) + '/'
        create_directory_name(directory)
        if download_url(url, directory + get_file_name(url)):
            total_download += 1

# End timer
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print("Elapsed time: ", elapsed_time, " Secs")
print("Total Download tile files: ", total_download)

结果

我从开发工具

得到URL和cookie

更新距离、地理位置和图像分辨率

在左下角区域可以看到网格上的瓦片，以及以米为单位的位置坐标。（我发现了一个缺陷：X 和 Y 被对调了。）单位是米。

https://www.geoportal.lt/map/index.jsp?lang=en

通过鼠标悬停读取红色矩形各顶点处显示的 x、y 坐标值，记录每个点并据此计算距离。（再次提醒：X 和 Y 需要对调，这是网站的 bug。）

X distance = 3,909 m (yellow color)
Y distance = 4,637 m (green color)

回到我们的程序，来计算瓦片的分辨率。
所有瓦片都是 256 × 256 像素的图像，文件尺寸很小。
重新运行我的程序以获取该区域：

start_x = 8416 # min number 8263
end_x = 8424   # max number 8510 

start_y = 5604 # min number 5524
end_y = 5611   # max number 5839

得到了这个结果

接下来计算一个像素对应多少真实世界的距离。红色十字的宽度为 1092 米（我用左下角区域的工具通过鼠标悬停测量）：ΔX = 582123 − 581031 = 1092 米

total pixels = 56 + 256 + 106 = 418 pixels
meters per pixel = 1092 m / 418 px ≈ 2.61 m/pixel

所以我的计算是2.5米/像素。（猜）
1块大小= 256像素 * 256像素= 640 m * 640 m
如果把很多瓦片拼接起来，例如拼成 20000 × 20000 像素（类似爱沙尼亚的 GeoTIFF），大约需要 78 × 78 个瓦片，就能得到高分辨率的图像。
我希望我的猜测与实际尺寸相符。祝你好运!我没有更多的时间花在这个问题上了。其他地方自己调查。

python-3.x 从公共可用数据获取的网页抓取卫星图像

1条答案