我的子类torch.utils.data.DataLoader
有什么问题?在使用这个而不是原来的子类后,我得到了一个错误。这个错误可以通过将pin_memory
(在超类中)设置为False
来解决。
这是类(实际的类有几个参数都没有使用,为了与我的代码兼容):
class CustomDataLoader(torchdata.DataLoader):
def __init__(self, dataset, batch_size, shuffle,
collate_fn=None, pin_memory=False, num_workers=4, **kwargs):
''' Custom data loader to test how to subclass torchdata.DataLoader.
'''
super().__init__( dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn,
pin_memory=pin_memory, num_workers=num_workers, **kwargs)
def __iter__(self):
self.super_iter = super().__iter__()
return self
def __next__(self):
next_batch = self.super_iter.__next__()
return next_batch, None, None
字符串
这里是150行错误消息的开始:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/conda/lib/python3.10/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/conda/lib/python3.10/multiprocessing/util.py", line 133, in _remove_temp_dir
rmtree(tempdir)
File "/opt/conda/lib/python3.10/shutil.py", line 725, in rmtree
_rmtree_safe_fd(fd, path, onerror)
File "/opt/conda/lib/python3.10/shutil.py", line 633, in _rmtree_safe_fd
onerror(os.scandir, path, sys.exc_info())
File "/opt/conda/lib/python3.10/shutil.py", line 629, in _rmtree_safe_fd
with os.scandir(topfd) as scandir_it:
OSError: [Errno 24] Too many open files: '/tmp/pymp-n1e35f4z'
型
引发错误的代码使用的数据集当前不是公共的,所以我不能在这里使用相同的代码。但是,仅循环数据集无法重现问题。
我在这里看到了一些相关的问题,但通常答案并不需要子类DataLoader
(here和here)。更相关的可能是:https://discuss.pytorch.org/t/too-many-open-files-caused-by-persistent-workers-and-pin-memory/193372,因为将pin_memory
设置为False
在我的情况下也有帮助。
我上面的类是否有任何问题,可能导致pytorch内部的数据加载问题?或者它看起来没问题?
编辑:错误消息的剩余部分:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/conda/lib/python3.10/multiprocessing/util.py", line 224, in __call__
s res = self._callback(*self._args, **self._kwargs)
File "/opt/conda/lib/python3.10/multiprocessing/util.py", line 133, in _remove_temp_dir
rmtree(tempdir)
File "/opt/conda/lib/python3.10/shutil.py", line 725, in rmtree
_rmtree_safe_fd(fd, path, onerror)
File "/opt/conda/lib/python3.10/shutil.py", line 633, in _rmtree_safe_fd
onerror(os.scandir, path, sys.exc_info())
File "/opt/conda/lib/python3.10/shutil.py", line 629, in _rmtree_safe_fd
with os.scandir(topfd) as scandir_it:
OSError: [Errno 24] Too many open files: '/tmp/pymp-n1e35f4z'
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/conda/lib/python3.10/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/conda/lib/python3.10/multiprocessing/util.py", line 133, in _remove_temp_dir
rmtree(tempdir)
File "/opt/conda/lib/python3.10/shutil.py", line 725, in rmtree
_rmtree_safe_fd(fd, path, onerror)
File "/opt/conda/lib/python3.10/shutil.py", line 633, in _rmtree_safe_fd
onerror(os.scandir, path, sys.exc_info())
File "/opt/conda/lib/python3.10/shutil.py", line 629, in _rmtree_safe_fd
with os.scandir(topfd) as scandir_it:
OSError: [Errno 24] Too many open files: '/tmp/pymp-u0_3n68n'
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/multiprocessing/resource_sharer.py", line 138, in _serve
File "/opt/conda/lib/python3.10/multiprocessing/connection.py", line 463, in accept
File "/opt/conda/lib/python3.10/multiprocessing/connection.py", line 609, in accept
File "/opt/conda/lib/python3.10/socket.py", line 293, in accept
fd, addr = self._accept()
OSError: [Errno 24] Too many open files
Exception in thread Thread-534 (_pin_memory_loop):
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/opt/conda/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 51, in _pin_memory_loop
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/multiprocessing/resource_sharer.py", line 145, in _serve
send(conn, destination_pid)
File "/opt/conda/lib/python3.10/multiprocessing/resource_sharer.py", line 50, in send
reduction.send_handle(conn, new_fd, pid)
File "/opt/conda/lib/python3.10/multiprocessing/reduction.py", line 183, in send_handle
with socket.fromfd(conn.fileno(), socket.AF_UNIX, socket.SOCK_STREAM) as s:
File "/opt/conda/lib/python3.10/socket.py", line 545, in fromfd
nfd = dup(fd)
OSError: [Errno 24] Too many open files
do_one_step()
File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in do_one_step
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/opt/conda/lib/python3.10/multiprocessing/queues.py", line 122, in get
return _ForkingPickler.loads(res)
File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/reductions.py", line 307, in rebuild_storage_fd
fd = df.detach()
File "/opt/conda/lib/python3.10/multiprocessing/resource_sharer.py", line 58, in detach
return reduction.recv_handle(conn)
File "/opt/conda/lib/python3.10/multiprocessing/reduction.py", line 189, in recv_handle
return recvfds(s, 1)[0]
File "/opt/conda/lib/python3.10/multiprocessing/reduction.py", line 159, in recvfds
raise EOFError
EOFError
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/multiprocessing/queues.py", line 244, in _feed
File "/opt/conda/lib/python3.10/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/reductions.py", line 370, in reduce_storage
File "/opt/conda/lib/python3.10/multiprocessing/reduction.py", line 198, in DupFd
return resource_sharer.DupFd(fd)
File "/opt/conda/lib/python3.10/multiprocessing/resource_sharer.py", line 48, in __init__
new_fd = os.dup(fd)
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/conda/lib/python3.10/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/conda/lib/python3.10/multiprocessing/util.py", line 133, in _remove_temp_dir
rmtree(tempdir)
File "/opt/conda/lib/python3.10/shutil.py", line 725, in rmtree
_rmtree_safe_fd(fd, path, onerror)
File "/opt/conda/lib/python3.10/shutil.py", line 633, in _rmtree_safe_fd
onerror(os.scandir, path, sys.exc_info())
File "/opt/conda/lib/python3.10/shutil.py", line 629, in _rmtree_safe_fd
with os.scandir(topfd) as scandir_it:
OSError: [Errno 24] Too many open files: '/tmp/pymp-oyedwdyi'
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/multiprocessing/queues.py", line 244, in _feed
File "/opt/conda/lib/python3.10/multiprocessing/reduction.py", line 51, in dumps
File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/reductions.py", line 370, in reduce_storage
File "/opt/conda/lib/python3.10/multiprocessing/reduction.py", line 198, in DupFd
File "/opt/conda/lib/python3.10/multiprocessing/resource_sharer.py", line 48, in __init__
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/multiprocessing/queues.py", line 244, in _feed
File "/opt/conda/lib/python3.10/multiprocessing/reduction.py", line 51, in dumps
File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/reductions.py", line 370, in reduce_storage
File "/opt/conda/lib/python3.10/multiprocessing/reduction.py", line 198, in DupFd
File "/opt/conda/lib/python3.10/multiprocessing/resource_sharer.py", line 48, in __init__
OSError: [Errno 24] Too many open files
型
然后是最后一部分(在上面的错误之后的“finally”中引发):
Exception in thread Thread-535 (_pin_memory_loop):
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
File "/opt/conda/lib/python3.10/threading.py", line 953, in run
File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 51, in _pin_memory_loop
File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in do_one_step
File "/opt/conda/lib/python3.10/multiprocessing/queues.py", line 122, in get
File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/reductions.py", line 307, in rebuild_storage_fd
File "/opt/conda/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in detach
File "/opt/conda/lib/python3.10/multiprocessing/resource_sharer.py", line 86, in get_connection
File "/opt/conda/lib/python3.10/multiprocessing/connection.py", line 502, in Client
File "/opt/conda/lib/python3.10/multiprocessing/connection.py", line 628, in SocketClient
File "/opt/conda/lib/python3.10/socket.py", line 232, in __init__
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
File "[...]/train.py", line 545, in train
File "[...]/train.py", line 248, in train_or_eval_epoch
File "/src/group-orbit-cl/group_orbit_cl/data/sample_transformer.py", line 637, in __iter__
File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 442, in __iter__
File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 388, in _get_iterator
File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1043, in __init__
File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 121, in start
File "/opt/conda/lib/python3.10/multiprocessing/context.py", line 224, in _Popen
File "/opt/conda/lib/python3.10/multiprocessing/context.py", line 281, in _Popen
File "/opt/conda/lib/python3.10/multiprocessing/popen_fork.py", line 19, in __init__
File "/opt/conda/lib/python3.10/multiprocessing/popen_fork.py", line 65, in _launch
OSError: [Errno 24] Too many open files
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "[...]/train.py", line 1379, in <module>
File "[...]/train.py", line 1376, in main
File "[...]/train.py", line 639, in train
File "[...]/train.py", line 687, in maybe_store_model
File "/opt/conda/lib/python3.10/site-packages/torch/serialization.py", line 440, in save
File "/opt/conda/lib/python3.10/site-packages/torch/serialization.py", line 315, in _open_zipfile_writer
File "/opt/conda/lib/python3.10/site-packages/torch/serialization.py", line 288, in __init__
RuntimeError: File saved_models/4636315_0_mlp_ep-521.state_dict cannot be opened.
型
1条答案
按热度按时间guicsvcw1#
你应该可以用
ulimit
解决这个问题。运行
ulimit -n
查看当前文件限制。然后运行
ulimit -n {larger_value}
来增加限制。这应该可以让你避开错误。您还应该查看文件被打开的位置,并尝试最大限度地减少打开的文件数量或清理打开的文件。例如,如果您的dataloader
__getitem__
实现每次都打开一个文件而没有关闭该文件,则可能会累加。在训练之前将数据集预处理为单个文件也是一个好主意。