PyTorch:代码在单个GPU上运行正常,但在多GPU下报错 'Exception: process 0 terminated with signal SIGSEGV',该如何修复?

1rhkuytd  于 2022-11-09  发布在  其他
关注(0)|答案(2)|浏览(292)

我启动了2个进程,因为我只有2个GPU,但程序报出 Exception: process 0 terminated with signal SIGSEGV。这段代码在多个CPU上确实可以运行(或者至少没有抛出错误),在单个GPU上也可以运行;但是,当 world_size > 0 且存在多个 cuda/GPU 时,它就会失败。
我的错误消息是这样的:

(automl-meta-learning) miranda9~/ML4Coq $ python playground/multiprocessing_playground/ddp_hello_world.py

world_size=2

Traceback (most recent call last):
  File "playground/multiprocessing_playground/ddp_hello_world.py", line 49, in <module>
    main()
  File "playground/multiprocessing_playground/ddp_hello_world.py", line 43, in main
    mp.spawn(example,
  File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
    while not context.join():
  File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 105, in join
    raise Exception(
Exception: process 0 terminated with signal SIGSEGV

下面是给出错误的代码:

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP

def example(rank, world_size):
    """Run one forward/backward/optimizer step of a DDP-wrapped linear model.

    Intended as the per-process entry point handed to ``mp.spawn``.

    Args:
        rank: Index of this process in the group; also used as the CUDA
            device index the model and tensors are moved to.
        world_size: Total number of processes in the group.
    """
    # create default process group
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    try:
        # create local model
        model = nn.Linear(10, 10).to(rank)
        # construct DDP model
        ddp_model = DDP(model, device_ids=[rank])
        # define loss function and optimizer
        loss_fn = nn.MSELoss()
        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

        # forward pass
        outputs = ddp_model(torch.randn(20, 10).to(rank))
        labels = torch.randn(20, 10).to(rank)
        # backward pass
        loss_fn(outputs, labels).backward()
        # update parameters
        optimizer.step()
    finally:
        # Tear the process group down even if the step above raises, so the
        # worker never exits with the group still initialized.
        dist.destroy_process_group()

def main():
    """Spawn one ``example`` worker process per visible CUDA device.

    Raises:
        RuntimeError: If no CUDA device is visible. Without this check
            ``mp.spawn`` would be called with ``nprocs=0`` and return
            without running any worker, making the script look successful.
    """
    world_size = torch.cuda.device_count()
    if world_size < 1:
        raise RuntimeError(
            'No CUDA devices are visible; this example needs at least one GPU.'
        )
    mp.spawn(example,
        args=(world_size,),
        nprocs=world_size,
        join=True)


if __name__ == "__main__":
    main()
    print('Done\n\a')

[可选]较大的自包含示例(给出相同的错误)

但是请注意,这个稍微完整一点的示例(只缺少一个分布式数据加载器)也给我带来了同样的问题:

"""
Based on: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html

Correctness of code: https://stackoverflow.com/questions/66226135/how-to-parallelize-a-training-loop-ever-samples-of-a-batch-when-cpu-is-only-avai

Note: as opposed to the multiprocessing (torch.multiprocessing) package, processes can use
different communication backends and are not restricted to being executed on the same machine.
"""
import time

from typing import Tuple

import torch
from torch import nn, optim
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

import os

# Toy dataset configuration: a single random base batch that is rescaled
# once per "epoch" to produce the training data.
num_epochs = 5
batch_size = 8
Din = 10
Dout = 5
data_x = torch.randn(batch_size, Din)
data_y = torch.randn(batch_size, Dout)
# Entry k is the base batch multiplied by k (so entry 0 is all zeros).
data = [(scale * data_x, scale * data_y) for scale in range(num_epochs)]

class PerDeviceModel(nn.Module):
    """
    Toy two-layer MLP: ``Linear(d_in, d_in) -> ReLU -> Linear(d_in, d_out)``.

    The model is run in parallel but is not split across GPUs: it is meant
    to be instantiated once per process, each with its own GPU or hardware.

    :param d_in: input feature size; defaults to the module-level ``Din``.
    :param d_out: output feature size; defaults to the module-level ``Dout``.
    """
    def __init__(self, d_in=None, d_out=None):
        super().__init__()
        # Resolve the defaults lazily so the module-level sizes are only
        # needed when the caller does not pass explicit dimensions.
        if d_in is None:
            d_in = Din
        if d_out is None:
            d_out = Dout
        self.net1 = nn.Linear(d_in, d_in)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(d_in, d_out)

    def forward(self, x):
        """Map a ``(..., d_in)`` input to a ``(..., d_out)`` output."""
        hidden = self.relu(self.net1(x))
        return self.net2(hidden)

def setup_process(rank, world_size, backend=None):
    """
    Initialize the default distributed process group for the calling process.

    gloo is a collective communications library
    (https://github.com/facebookincubator/gloo); NCCL is the backend
    recommended for GPUs
    (https://pytorch.org/tutorials/intermediate/dist_tuto.html#communication-backends).

    :param rank: index of this process within the group.
    :param world_size: total number of processes in the group.
    :param backend: backend name passed to ``dist.init_process_group``.
        When omitted (``None``), 'nccl' is chosen if CUDA is available and
        'gloo' otherwise. An explicitly passed backend is used as given
        instead of being silently replaced by 'nccl'.
    """
    # set up the master's address so this child process can coordinate
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # Only auto-select when the caller expressed no preference.
    if backend is None:
        backend = 'nccl' if torch.cuda.is_available() else 'gloo'
    # Initializes the default process group (and the distributed package).
    dist.init_process_group(backend, rank=rank, world_size=world_size)

def cleanup():
    """Destroy the default process group, if one is currently initialized.

    Safe to call when no process group exists (e.g. after a failed setup or
    a second call), in which case it does nothing.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

def get_batch(batch: Tuple[torch.Tensor, torch.Tensor], rank):
    """
    Prepare one ``(inputs, targets)`` pair for the process with id ``rank``.

    With CUDA available both tensors are moved to device ``rank``; otherwise
    both are placed in shared memory in place.

    :return: the ``(inputs, targets)`` tuple after placement.
    """
    inputs, targets = batch
    if not torch.cuda.is_available():
        # CPU-only run: share the tensors' storage instead of moving them.
        return inputs.share_memory_(), targets.share_memory_()
    return inputs.to(rank), targets.to(rank)

def get_ddp_model(model: nn.Module, rank):
    """
    Wrap ``model`` in DistributedDataParallel for the process with id ``rank``.

    With CUDA available the model is moved to device ``rank`` and DDP is
    built with ``device_ids=[rank]``. Otherwise the model's parameters are
    put in shared memory via ``share_memory()`` and DDP is built without
    ``device_ids``.

    :return: the DDP-wrapped model.
    """
    if not torch.cuda.is_available():
        # CPU path: make sure the parameters live in shared memory, then
        # let DDP run without any device ids.
        shared_model = model.share_memory()
        return DDP(shared_model)
    # GPU path: the standard recipe of moving the model to GPU `rank` first.
    device_model = model.to(rank)
    return DDP(device_model, device_ids=[rank])

def run_parallel_training_loop(rank, world_size):
    """
    Per-process entry point: train the toy model under DDP on this rank.

    Sets up the process group, wraps a fresh ``PerDeviceModel`` in DDP, runs
    one optimizer step per entry of the module-level ``data`` list, and
    always destroys the process group afterwards.

    Note: as DDP broadcasts model states from rank 0 process to all other
    processes in the DDP constructor, you don't need to worry about different
    DDP processes starting from different model parameter initial values.

    :param rank: index of this process (also the device id on GPU runs).
    :param world_size: total number of processes taking part.
    """
    setup_process(rank, world_size)
    try:
        print()
        print(f"Start running DDP with model parallel example on rank: {rank}.")
        print(f'current process: {mp.current_process()}')
        print(f'pid: {os.getpid()}')

        # get ddp model
        ddp_model = get_ddp_model(PerDeviceModel(), rank)

        # The loss and optimizer do not depend on the batch, so build them
        # once instead of re-creating them on every iteration.
        loss_fn = nn.MSELoss()
        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

        # do training
        for batch in data:
            x, y = get_batch(batch, rank)

            optimizer.zero_grad()
            outputs = ddp_model(x)
            # Gradient synchronization takes place during the backward pass
            # and overlaps with the backward computation; when backward()
            # returns, param.grad already holds the synchronized gradient.
            loss_fn(outputs, y).backward()
            # Each process steps its own optimizer on its own replica, using
            # the already-synchronized gradients.
            optimizer.step()

        print()
        print(f"Finished running DDP with model parallel example on rank: {rank}.")
        print(f'current process: {mp.current_process()}')
        print(f'pid: {os.getpid()}')
    finally:
        # Destroy the process group even when training raised.
        cleanup()

def main():
    """Launch the training loop in one process per GPU, or per CPU core when CUDA is unavailable."""
    print()
    print('running main()')
    print(f'current process: {mp.current_process()}')
    print(f'pid: {os.getpid()}')
    # One worker per CUDA device when CUDA is usable, otherwise one per core.
    world_size = torch.cuda.device_count() if torch.cuda.is_available() else mp.cpu_count()
    print(f'world_size={world_size}')
    mp.spawn(
        run_parallel_training_loop,
        args=(world_size,),
        nprocs=world_size,
    )


if __name__ == "__main__":
    # Script entry point: time the whole run and report it.
    print('starting __main__')
    start = time.time()
    main()
    elapsed = time.time() - start
    print(f'execution length = {elapsed}')
    print('Done!\a\n')

交叉发布(cross-posted):https://discuss.pytorch.org/t/why-is-mp-spawn-spawning-4-processes-when-i-only-want-2/112299

yyhrrdl8

yyhrrdl81#

我在一台有4个GPU的服务器上原样运行了您的“(最小)代码示例”,没有做任何更改,也没有出现任何错误(python版本:3.6.9,pytorch版本:1.5.0+cu101)。
当您运行最小代码示例时,问题是否仍然存在?
如果是这样,并且如果您使用的是linux机器,请运行以下代码,并告诉我您得到了什么输出:

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP

def get_visible_gpus():
    """Return the GPU indices reported by ``nvidia-smi`` as a list of ints.

    Uses ``nvidia-smi``'s machine-readable query output instead of scraping
    the human-readable table, and does not go through a shell.

    Returns:
        The list of GPU indices, or an empty list when ``nvidia-smi`` is not
        installed, cannot be executed, or exits with an error.
    """
    # Local import so this helper does not depend on the file's import block.
    import subprocess

    try:
        completed = subprocess.run(
            ['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            universal_newlines=True,  # text mode, spelled for Python 3.6
            check=True,
        )
    except (OSError, subprocess.CalledProcessError):
        # nvidia-smi missing / not runnable, or it reported a failure.
        return []

    # One index per line; skip anything that is not a plain integer.
    return [
        int(line)
        for line in completed.stdout.splitlines()
        if line.strip().isdigit()
    ]

def example(rank, world_size):
    """Run one forward/backward/optimizer step of a DDP-wrapped linear model.

    Intended as the per-process entry point handed to ``mp.spawn``; prints
    its rank first so the launched workers can be identified in the output.

    Args:
        rank: Index of this process in the group; also used as the CUDA
            device index the model and tensors are moved to.
        world_size: Total number of processes in the group.
    """
    print('rank:{}'.format(rank))
    # create default process group
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    try:
        # create local model
        model = nn.Linear(10, 10).to(rank)
        # construct DDP model
        ddp_model = DDP(model, device_ids=[rank])
        # define loss function and optimizer
        loss_fn = nn.MSELoss()
        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

        # forward pass
        outputs = ddp_model(torch.randn(20, 10).to(rank))
        labels = torch.randn(20, 10).to(rank)
        # backward pass
        loss_fn(outputs, labels).backward()
        # update parameters
        optimizer.step()
    finally:
        # Tear the process group down even if the step above raises, so the
        # worker never exits with the group still initialized.
        dist.destroy_process_group()

def main():
    """Print GPU diagnostics, then spawn one ``example`` worker per CUDA device."""
    # world_size = 2
    world_size = torch.cuda.device_count()
    # Diagnostics: what torch counts vs. what nvidia-smi lists.
    print(f'world_size:{world_size}')
    print(f'get_visible_gpus():{get_visible_gpus()}')
    mp.spawn(example, args=(world_size,), nprocs=world_size, join=True)


if __name__ == "__main__":
    # Report the installed torch version before running the example.
    print(torch.__version__)
    main()
    print('Done\n\a')

在我的例子中,我只得到:

1.5.0+cu101
world_size:4
get_visible_gpus():[0, 1, 2, 3]
rank:1
rank:3
rank:0
rank:2
Done

get_visible_gpus()只是文本解析一个nvidia-smi shell命令,以获得cuda可以看到的gpu的id。
NB:请见谅,我本想以评论而不是“回答”的形式发布——因为我并没有直接解决你的问题,而是在询问更多细节——但我的声誉值不够,无法评论 T.T

5t7ly7z5

5t7ly7z52#

解决方案:增大共享内存(shm)的大小

# Start the container with a larger /dev/shm (shared memory) than Docker's
# 64MB default. NOTE: only the relevant flag is shown here; the image name
# (and any command) still has to follow it.
docker run -it \
    --shm-size=64g

原因:如果您是在Docker容器中运行,可能是因为Docker的shm_size不够大。默认情况下,Docker容器只分配64MB的共享内存。这里的共享内存并不是内存上限,而是挂载在/dev/shm的、使用RAM存储文件的临时文件系统(tmpfs),用于进程间通信(IPC)。进入容器后,您可以使用df命令查看shm的大小。

相关问题