PyTorch: torch.cuda.OutOfMemoryError：在设备0上的副本0中捕获到OutOfMemory错误

vkc1a9a2  于 2023-01-13  发布在  其他
关注(0)|答案(1)|浏览(617)

我同时在多个GPU(如mobilenet, mobilenetv2)上训练多个模型。在训练和评估第一个模型后,我得到一个错误torch.cuda.OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.。我尝试了以下各种解决方案

**代码**
import time
import pathlib
from os.path import isfile

import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn

import models
from utils import *
from config import config
from data import DataLoader

# for ignore imagenet PIL EXIF UserWarning
import warnings
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)

# Best top-1 validation accuracy seen so far; updated inside main()'s epoch loop.
best_acc1 = 0

def _load_checkpoint(model, optimizer, ckpt_file):
    """Restore *model* weights and *optimizer* state from *ckpt_file*.

    Shared by the --resume and --evaluate paths (previously duplicated).

    Returns:
        int or None: the epoch stored in the checkpoint, or ``None`` when
        the checkpoint file does not exist (a "not found" message is
        printed in that case).
    """
    if not isfile(ckpt_file):
        print('==> no checkpoint found \'{}\''.format(
            opt.ckpt))
        return None

    print('==> Loading Checkpoint \'{}\''.format(opt.ckpt))
    # load_model restores the weights and returns the raw checkpoint dict,
    # so the optimizer state can be restored here as well.
    checkpoint = load_model(model, ckpt_file, opt)
    optimizer.load_state_dict(checkpoint['optimizer'])
    epoch = checkpoint['epoch']
    print('==> Loaded Checkpoint \'{}\' (epoch {})'.format(
        opt.ckpt, epoch))
    return epoch

def main():
    """Build the model, optionally resume/evaluate from a checkpoint,
    then run the train/validate loop and print timing statistics.
    """
    global opt, start_epoch, best_acc1
    opt = config()

    if opt.cuda and not torch.cuda.is_available():
        raise Exception('No GPU found, please run without --cuda')

    print('\n=> creating model \'{}\''.format(opt.arch))
    # shufflenet additionally needs the group count; every other arch only
    # takes (dataset, width multiplier).
    if opt.arch == 'shufflenet':
        model = models.__dict__[opt.arch](opt.dataset, opt.width_mult, opt.groups)
    else:
        model = models.__dict__[opt.arch](opt.dataset, opt.width_mult)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=opt.lr,
                          momentum=opt.momentum, weight_decay=opt.weight_decay,
                          nesterov=True)
    start_epoch = 0

    if opt.cuda:
        # Pin the default device to the first requested GPU, move model and
        # loss there, then wrap for multi-GPU data parallelism.
        torch.cuda.set_device(opt.gpuids[0])
        with torch.cuda.device(opt.gpuids[0]):
            model = model.cuda()
            criterion = criterion.cuda()
        model = nn.DataParallel(model, device_ids=opt.gpuids,
                                output_device=opt.gpuids[0])
        cudnn.benchmark = True

    # checkpoint file
    ckpt_dir = pathlib.Path('checkpoint')
    ckpt_file = ckpt_dir / opt.arch / opt.dataset / opt.ckpt

    # for resuming training
    if opt.resume:
        epoch = _load_checkpoint(model, optimizer, ckpt_file)
        if epoch is None:
            return
        start_epoch = epoch

    # Data loading
    print('==> Load data..')
    train_loader, val_loader = DataLoader(opt.batch_size, opt.workers,
                                          opt.dataset, opt.datapath,
                                          opt.cuda)

    # evaluation-only mode: load the checkpoint, run a single validation
    # pass, persist the result and exit.
    if opt.evaluate:
        epoch = _load_checkpoint(model, optimizer, ckpt_file)
        if epoch is None:
            return
        start_epoch = epoch

        print('\n===> [ Evaluation ]')
        start_time = time.time()
        acc1, acc5 = validate(val_loader, model, criterion)
        # NOTE(review): str(acc1)[7:-18] slices the number out of a tensor
        # repr such as "tensor(61.4680, device='cuda:0')" -- fragile, but
        # kept byte-compatible with existing CSV summaries.
        save_eval(['{}-{}-{}'.format(opt.arch, opt.dataset, opt.ckpt[:-4]),
                   str(acc1)[7:-18], str(acc5)[7:-18]], opt)
        elapsed_time = time.time() - start_time
        print('====> {:.2f} seconds to evaluate this model\n'.format(
            elapsed_time))
        return

    # train...
    train_time = 0.0
    validate_time = 0.0
    for epoch in range(start_epoch, opt.epochs):
        adjust_learning_rate(optimizer, epoch, opt.lr)
        print('\n==> {}/{} training'.format(opt.arch, opt.dataset))
        print('==> Epoch: {}, lr = {}'.format(
            epoch, optimizer.param_groups[0]["lr"]))

        # train for one epoch
        print('===> [ Training ]')
        start_time = time.time()
        acc1_train, acc5_train = train(train_loader,
            epoch=epoch, model=model,
            criterion=criterion, optimizer=optimizer)
        elapsed_time = time.time() - start_time
        train_time += elapsed_time
        print('====> {:.2f} seconds to train this epoch\n'.format(
            elapsed_time))

        # evaluate on validation set
        print('===> [ Validation ]')
        start_time = time.time()
        acc1_valid, acc5_valid = validate(val_loader, model, criterion)
        elapsed_time = time.time() - start_time
        validate_time += elapsed_time
        print('====> {:.2f} seconds to validate this epoch\n'.format(
            elapsed_time))

        # remember best Acc@1 and save checkpoint and summary csv file
        is_best = acc1_valid > best_acc1
        best_acc1 = max(acc1_valid, best_acc1)
        state = {'epoch': epoch + 1,
                 'model': model.state_dict(),
                 'optimizer': optimizer.state_dict()}
        # See NOTE above on the repr-slicing hack.
        summary = [epoch,
                   str(acc1_train)[7:-18], str(acc5_train)[7:-18],
                   str(acc1_valid)[7:-18], str(acc5_valid)[7:-18]]
        save_model(state, epoch, is_best, opt)
        save_summary(summary, opt)

    # Timing summary. Guard the denominator: resuming at the final epoch
    # would otherwise raise ZeroDivisionError.
    n_epochs = max(opt.epochs - start_epoch, 1)
    avg_train_time = train_time / n_epochs
    avg_valid_time = validate_time / n_epochs
    total_train_time = train_time + validate_time
    print('====> average training time per epoch: {:,}m {:.2f}s'.format(
        int(avg_train_time//60), avg_train_time%60))
    print('====> average validation time per epoch: {:,}m {:.2f}s'.format(
        int(avg_valid_time//60), avg_valid_time%60))
    print('====> training time: {}h {}m {:.2f}s'.format(
        int(train_time//3600), int((train_time%3600)//60), train_time%60))
    print('====> validation time: {}h {}m {:.2f}s'.format(
        int(validate_time//3600), int((validate_time%3600)//60), validate_time%60))
    print('====> total training time: {}h {}m {:.2f}s'.format(
        int(total_train_time//3600), int((total_train_time%3600)//60), total_train_time%60))

def train(train_loader, epoch=None, model=None, criterion=None, optimizer=None):
    """Train *model* for one epoch over *train_loader*.

    The previous ``**kwargs`` signature hid which arguments were required;
    the visible caller passes everything by keyword, so explicit keyword
    parameters are backward-compatible.

    Args:
        train_loader: iterable of (images, target) batches.
        epoch (int): current epoch index, used only for progress display.
        model (nn.Module): network to train (switched to train mode here).
        criterion: loss function.
        optimizer: optimizer stepped once per batch.

    Returns:
        tuple: (average top-1 accuracy, average top-5 accuracy) over the epoch.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(train_loader), batch_time, data_time,
                             losses, top1, top5, prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    # `images` instead of `input` -- the old name shadowed the builtin.
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if opt.cuda:
            # Inputs stay on CPU here; DataParallel scatters them to the GPUs.
            target = target.cuda(non_blocking=True)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        # NOTE(review): acc1/acc5 are stored as tensors (main() later slices
        # their repr). Confirm utils.accuracy detaches them -- otherwise the
        # meters can keep autograd graphs alive in GPU memory.
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)

        if i % opt.print_freq == 0:
            progress.print(i)

        end = time.time()

    print('====> Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    return top1.avg, top5.avg

def validate(val_loader, model, criterion):
    """Evaluate *model* over *val_loader* under ``torch.no_grad()``.

    Args:
        val_loader: iterable of (images, target) batches.
        model (nn.Module): network to evaluate (switched to eval mode here).
        criterion: loss function used only for logging.

    Returns:
        tuple: (average top-1 accuracy, average top-5 accuracy).
    """
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader), batch_time, losses, top1, top5,
                             prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        # `images` instead of `input` -- the old name shadowed the builtin.
        for i, (images, target) in enumerate(val_loader):
            if opt.cuda:
                target = target.cuda(non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)

            if i % opt.print_freq == 0:
                progress.print(i)

            end = time.time()

        print('====> Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    return top1.avg, top5.avg

if __name__ == '__main__':
    # Time the whole run and report it as h/m/s on exit.
    wall_start = time.time()
    main()
    elapsed = time.time() - wall_start
    hours, rem = divmod(elapsed, 3600)
    minutes, seconds = divmod(rem, 60)
    print('====> total time: {}h {}m {:.2f}s'.format(
        int(hours), int(minutes), seconds))
**尝试过的解决方案**
gc.collect()
torch.cuda.empty_cache() # PyTorch thing
**错误追溯（Traceback）**
==> mobilenet/cifar10 training
==> Epoch: 17, lr = 0.07093217661806457
===> [ Training ]
Epoch: [17][0/9]    Time  2.638 ( 2.638)    Data  2.527 ( 2.527)    Loss 1.1166e+00 (1.1166e+00)    Acc@1  59.76 ( 59.76)   Acc@5  95.52 ( 95.52)
====> Acc@1 61.468 Acc@5 95.854
====> 4.97 seconds to train this epoch

===> [ Validation ]
Test: [0/2] Time  1.674 ( 1.674)    Loss 1.1883e+00 (1.1883e+00)    Acc@1  57.50 ( 57.50)   Acc@5  95.46 ( 95.46)
====> Acc@1 57.620 Acc@5 95.300
====> 1.84 seconds to validate this epoch

==> mobilenet/cifar10 training
==> Epoch: 18, lr = 0.06951353308570328
===> [ Training ]
Epoch: [18][0/9]    Time  2.582 ( 2.582)    Data  2.467 ( 2.467)    Loss 1.0763e+00 (1.0763e+00)    Acc@1  61.83 ( 61.83)   Acc@5  96.33 ( 96.33)
====> Acc@1 62.808 Acc@5 96.350
====> 4.92 seconds to train this epoch

===> [ Validation ]
Test: [0/2] Time  1.721 ( 1.721)    Loss 1.1518e+00 (1.1518e+00)    Acc@1  58.51 ( 58.51)   Acc@5  95.67 ( 95.67)
====> Acc@1 58.540 Acc@5 95.560
====> 1.88 seconds to validate this epoch

==> mobilenet/cifar10 training
==> Epoch: 19, lr = 0.06812326242398921
===> [ Training ]
Epoch: [19][0/9]    Time  2.441 ( 2.441)    Data  2.314 ( 2.314)    Loss 1.0599e+00 (1.0599e+00)    Acc@1  62.20 ( 62.20)   Acc@5  96.34 ( 96.34)
====> Acc@1 63.502 Acc@5 96.530
====> 4.75 seconds to train this epoch

===> [ Validation ]
Test: [0/2] Time  1.664 ( 1.664)    Loss 1.1191e+00 (1.1191e+00)    Acc@1  59.76 ( 59.76)   Acc@5  96.39 ( 96.39)
====> Acc@1 59.460 Acc@5 96.060
====> 1.83 seconds to validate this epoch

====> average training time per epoch: 0m 6.81s
====> average validation time per epoch: 0m 1.88s
====> training time: 0h 2m 16.22s
====> validation time: 0h 0m 37.55s
====> total training time: 0h 2m 53.77s
====> total time: 0h 3m 18.80s

=> creating model 'mobilenet'
==> Load data..
Files already downloaded and verified
Files already downloaded and verified
==> Loading Checkpoint '/home2/coremax/Documents/BoxMix/checkpoint/mobilenet/cifar10/ckpt_best.pth'
==> Loaded Checkpoint '/home2/coremax/Documents/BoxMix/checkpoint/mobilenet/cifar10/ckpt_best.pth' (epoch 20)

===> [ Evaluation ]
Test: [ 0/40]   Time  1.680 ( 1.680)    Loss 1.0908e+00 (1.0908e+00)    Acc@1  64.45 ( 64.45)   Acc@5  96.09 ( 96.09)
====> Acc@1 59.460 Acc@5 96.060
====> 2.21 seconds to evaluate this model

====> total time: 0h 0m 6.03s

=> creating model 'mobilenetv2'
==> Load data..
Files already downloaded and verified
Files already downloaded and verified

==> mobilenetv2/cifar10 training
==> Epoch: 0, lr = 0.1
===> [ Training ]
Traceback (most recent call last):
  File "/home2/coremax/Documents/BoxMix/main.py", line 257, in <module>
    main()
  File "/home2/coremax/Documents/BoxMix/main.py", line 117, in main
    acc1_train, acc5_train = train(train_loader,
  File "/home2/coremax/Documents/BoxMix/main.py", line 187, in train
    output = model(input)
  File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
    output.reraise()
  File "/home2/coremax/anaconda3/lib/python3.9/site-packages/torch/_utils.py", line 543, in reraise
    raise exception
torch.cuda.OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
t40tm48m

t40tm48m1#

我正在两块GPU（TESLA V100 16GB）上训练mobilenet，批量大小为6096。这个批量非常大，但我仍然可以顺利训练单个模型。当我依次训练多个模型（如mobilenet和mobilenetv2）时，在训练mobilenetv2时得到了replica错误。我尝试了gc.collect()和torch.cuda.empty_cache()，但对我无效。

我通过将批量从6096显著减少到256解决了上述问题

相关问题