PyTorch: validation loss increases after a few epochs when fine-tuning a pretrained ViT

Asked by 7cwmlq89 on 2023-05-17

I am fine-tuning the vit_base_patch16_384 model from the timm library on a Tesla V100 GPU. I currently have 3000 training images split across 15 classes. Accuracy is stuck at 46%, and I suspect something is wrong with my hyperparameter choices. How can I find good hyperparameters?

Graphs

[Training/validation curves not reproduced.]

Code

import copy
import multiprocessing
import time

import timm
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils import data
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms

# Batch size
bs = 32
# Number of epochs
num_epochs = 100
# Number of classes
num_classes = 15
# Number of data-loading workers
num_cpu = multiprocessing.cpu_count()
# Size of image (`args` comes from the script's argument parser, not shown)
imsize = int(args.size)

# List pretrained models available in timm (filtered to 'resnet*' here)
print(timm.list_models('resnet*', pretrained=True))

# Fine-tune with vit_384
if args.net == "vit_timm_pretrained":
    size = 384

# Applying transforms to the data
# (note: the train pipeline crops to 32x32 before resizing back up to `size`)
image_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(size=32, scale=(0.8, 1.0)),
        transforms.Resize(size),
        transforms.RandomRotation(degrees=15),
        transforms.RandomHorizontalFlip(),
        transforms.CenterCrop(size=size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ]),
    'valid': transforms.Compose([
        transforms.Resize(size=size),
        transforms.CenterCrop(size=size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])
}

# Load data from folders (train_directory / valid_directory are set elsewhere)
dataset = {
    'train': datasets.ImageFolder(root=train_directory, transform=image_transforms['train']),
    'valid': datasets.ImageFolder(root=valid_directory, transform=image_transforms['valid'])
}

# Size of train and validation data
dataset_sizes = {
    'train': len(dataset['train']),
    'valid': len(dataset['valid'])
}

# Create iterators for data loading
dataloaders = {
    'train': data.DataLoader(dataset['train'], batch_size=bs, shuffle=True,
                             num_workers=num_cpu, pin_memory=True, drop_last=True),
    'valid': data.DataLoader(dataset['valid'], batch_size=bs, shuffle=True,
                             num_workers=num_cpu, pin_memory=True, drop_last=True)
}

# Class names or target labels
class_names = dataset['train'].classes
print("Classes:", class_names)

# Print the train and validation data sizes
print("Training-set size:", dataset_sizes['train'],
      "\nValidation-set size:", dataset_sizes['valid'])

# Set default device as gpu, if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Pre-trained vit_base_patch16_384, vit_large_patch32_384, vit_huge_patch14_224
if args.net == "vit_timm_pretrained":
    model_ft = timm.create_model("vit_base_patch16_384", pretrained=True)
    # Replace the classification head with a fresh 15-way linear layer
    model_ft.head = nn.Linear(model_ft.head.in_features, num_classes)

# Utilize multiple GPUs
if 'cuda' in device:
    print(device)
    print("using data parallel")
    model_ft = torch.nn.DataParallel(model_ft)  # make parallel
    cudnn.benchmark = True

# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Learning rate decay
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

# Model training routine
print("\nTraining:-\n")

model_ft.cuda()

def train_model(model, criterion, optimizer, scheduler, num_epochs=30):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # Tensorboard summary
    writer = SummaryWriter()

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Record loss and accuracy for each phase
            if phase == 'train':
                writer.add_scalar('Train/Loss', epoch_loss, epoch)
                writer.add_scalar('Train/Accuracy', epoch_acc, epoch)
            else:
                writer.add_scalar('Valid/Loss', epoch_loss, epoch)
                writer.add_scalar('Valid/Accuracy', epoch_acc, epoch)
            writer.flush()

            # deep copy the model
            if phase == 'valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

# Train the model
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=num_epochs)

Tip #2

Optimizer

optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Learning rate decay
exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer_ft, 'min', patience=5, factor=0.5)

if phase == 'train':
    # scheduler.step()
    scheduler.step(loss)  # note: `loss` here is the last mini-batch's loss
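
One detail worth flagging in this change: at that point in the loop, loss is just the loss of the last training mini-batch, which is a noisy signal for plateau detection. A minimal alternative sketch, assuming the train_model loop above (step once per epoch, in the validation phase, after epoch_loss has been computed):

# inside train_model, after epoch_loss is computed for the current phase
if phase == 'valid':
    scheduler.step(epoch_loss)  # epoch-level validation loss, not a single batch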

Results

[Updated training curves not reproduced.]

Answer 1 (by 0md85ypi)

I think the problem may be that your learning-rate schedule is lowering the learning rate too quickly (every 7 epochs). I would suggest trying ReduceLROnPlateau, which waits until the monitored metric has stopped improving for a few epochs before reducing the learning rate.
If I were you, I would try replacing

lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

with

lr_scheduler.ReduceLROnPlateau(optimizer_ft, 'min', patience=5, factor=0.5)

Also, you will need to replace scheduler.step() with scheduler.step(loss). This is because ReduceLROnPlateau needs to know the current loss in order to decide whether the learning rate should be reduced.
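
For illustration, here is a minimal, self-contained sketch of how ReduceLROnPlateau is typically driven. The tiny model and random data are stand-ins so the snippet runs on its own, not the asker's setup; the key point is that the scheduler is stepped once per epoch with the epoch-level validation loss:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

# Toy stand-ins so the sketch is runnable
model = nn.Linear(10, 2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

for epoch in range(20):
    # --- training phase (one fake batch) ---
    model.train()
    inputs, labels = torch.randn(32, 10), torch.randint(0, 2, (32,))
    optimizer.zero_grad()
    loss = criterion(model(inputs), labels)
    loss.backward()
    optimizer.step()

    # --- validation phase (one fake batch) ---
    model.eval()
    with torch.no_grad():
        val_inputs, val_labels = torch.randn(32, 10), torch.randint(0, 2, (32,))
        val_loss = criterion(model(val_inputs), val_labels).item()

    # Step once per epoch on the monitored metric; after `patience`
    # epochs without improvement, the learning rate is multiplied by `factor`.
    scheduler.step(val_loss)
    print(epoch, optimizer.param_groups[0]['lr'], val_loss)

Unlike StepLR, ReduceLROnPlateau takes the monitored value as an argument to step(), which is why the call has to change; in the asker's loop, the validation phase's epoch_loss is the natural value to pass.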
