Python:从 URL 读取 tar 文件

rsl1atfo  于 2023-05-21  发布在  Python
关注(0)|答案(3)|浏览(181)

尝试从URL读取tar文件
目的主要是从网站上抓取数据。我也尝试过用 gzip 打开该文件,但出现了类似的错误。请问该如何解决?

import tarfile
from io import BytesIO
import urllib.request as urllib2

# Download the entire archive into memory as bytes.
rt = urllib2.urlopen("https://opentender.eu/data/files/CY_ocds_data.json.tar.gz").read()
# NOTE: the BytesIO is passed positionally here, so tarfile.open receives it as
# its first argument (the archive path) rather than as a file object; this is
# the call that raises the TypeError shown in the traceback below.
csvzip = tarfile.open(BytesIO(rt),mode='r:gz')

这会引发如下的 TypeError:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-23-2ed9e3f5bdd6> in <module>()
      4 import urllib.request as urllib2
      5 rt = urllib2.urlopen("https://opentender.eu/data/files/CY_ocds_data.json.tar.gz").read()
----> 6 csvzip = tarfile.open(BytesIO(rt),mode='r:gz')
      7 # csvzip.printdir()

2 frames
/usr/lib/python3.7/gzip.py in __init__(self, filename, mode, compresslevel, fileobj, mtime)
    166             mode += 'b'
    167         if fileobj is None:
--> 168             fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
    169         if filename is None:
    170             filename = getattr(fileobj, 'name', '')

TypeError: expected str, bytes or os.PathLike object, not _io.BytesIO
vfhzx4xs

vfhzx4xs1#

您必须使用fileobj关键字参数调用tarfile.open

# Pass the in-memory buffer via the fileobj keyword so it is read as a file object, not used as a path.
csvzip = tarfile.open(fileobj=BytesIO(rt),mode='r:gz')
ogq8wdun

ogq8wdun2#

也许这样会更好:

import tarfile
import urllib.request as urllib2

# urlopen returns a file-like HTTP response that can only be read forward (it is not seekable).
rt = urllib2.urlopen("https://opentender.eu/data/files/CY_ocds_data.json.tar.gz")
# Use the stream mode 'r|gz' (pipe instead of colon) so tarfile never tries to
# seek backwards on the response; members must be consumed in archive order.
csvzip = tarfile.open(fileobj=rt,mode='r|gz')

urlopen 函数返回一个类文件对象(file-like object),可以直接将其传递给 tarfile.open。
鲍比

eoigrqb6

eoigrqb63#

分享一个支持基本身份验证(HTTP Basic Auth)的备选方案:

import tarfile
from io import BytesIO
import requests
from requests.adapters import HTTPAdapter, Retry

def session_get(url, user, passw):
    """Fetch *url* with basic authentication; HTTPS requests are retried.

    Args:
        url: Address to download.
        user: User name for HTTP basic authentication.
        passw: Password for HTTP basic authentication.

    Returns:
        The ``requests`` response object; only returned when the status is 200.

    Raises:
        Exception: If the final response status code is not 200.
    """
    # Retry up to 5 times with backoff when the server answers with one of these statuses.
    retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[403, 502, 503, 504])
    # The request is not streamed, so the body is already downloaded when
    # session.get returns and the session can be closed before returning.
    with requests.Session() as session:
        session.mount('https://', HTTPAdapter(max_retries=retries))
        response = session.get(url, auth=(user, passw))
    if response.status_code != 200:
        # Build the message directly: the snippet never defines a `log` helper,
        # so calling it here would raise NameError instead of this error.
        raise Exception(f'Unable to access link:\n{url}\nError code: {response.status_code}')
    return response

url = "https://opentender.eu/data/files/CY_ocds_data.json.tar.gz"
# Replace the placeholder credentials with real ones.
resp = session_get(url, '<username>', '<password>')
# Wrap the downloaded bytes in an in-memory buffer and open it as a gzip-compressed tar archive.
tf = tarfile.open(fileobj=BytesIO(resp.content), mode="r:gz")
# '<filename>' is a placeholder for the archive member to read; the result of read() is not stored here.
tf.extractfile('<filename>').read()

相关问题