I am fine-tuning the vit_base_patch16_384 model from the timm library on a Tesla V100 GPU. I currently have 3,000 training images split across 15 classes, and accuracy is stuck at 46%, which I suspect comes down to poor hyperparameter choices. How can I find good hyperparameters?
(Graph from the original post not reproduced.)
Code
# Imports assumed by the snippet below; `args` (args.size, args.net) comes from
# an argparse parser defined elsewhere in the script.
import copy
import multiprocessing
import time

import timm
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils import data
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms

# Batch size
bs = 32
# Number of epochs
num_epochs = 100
# Number of classes
num_classes = 15
# Number of workers
num_cpu = multiprocessing.cpu_count()
# Size of image
imsize = int(args.size)
# timm model list
print(timm.list_models('resnet*', pretrained=True))
# finetune with vit_384
if args.net == "vit_timm_pretrained":
    size = 384
# Applying transforms to the data
image_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(size=32, scale=(0.8, 1.0)),
        transforms.Resize(size),
        transforms.RandomRotation(degrees=15),
        transforms.RandomHorizontalFlip(),
        transforms.CenterCrop(size=size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ]),
    'valid': transforms.Compose([
        transforms.Resize(size=size),
        transforms.CenterCrop(size=size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])
}
# Load data from folders
dataset = {
    'train': datasets.ImageFolder(root=train_directory, transform=image_transforms['train']),
    'valid': datasets.ImageFolder(root=valid_directory, transform=image_transforms['valid'])
}
# Size of train and validation data
dataset_sizes = {
    'train': len(dataset['train']),
    'valid': len(dataset['valid'])
}
# Create iterators for data loading
dataloaders = {
    'train': data.DataLoader(dataset['train'], batch_size=bs, shuffle=True,
                             num_workers=num_cpu, pin_memory=True, drop_last=True),
    'valid': data.DataLoader(dataset['valid'], batch_size=bs, shuffle=True,
                             num_workers=num_cpu, pin_memory=True, drop_last=True)
}
# Class names or target labels
class_names = dataset['train'].classes
print("Classes:", class_names)
# Print the train and validation data sizes
print("Training-set size:", dataset_sizes['train'],
"\nValidation-set size:", dataset_sizes['valid'])
# Set default device as gpu, if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Pre-trained vit_base_patch16_384, vit_large_patch32_384, vit_huge_patch14_224
if args.net == "vit_timm_pretrained":
    model_ft = timm.create_model("vit_base_patch16_384", pretrained=True)
    model_ft.head = nn.Linear(model_ft.head.in_features, num_classes)
# Utilize multiple GPUS
if 'cuda' in device:
    print(device)
    print("using data parallel")
    model_ft = torch.nn.DataParallel(model_ft)  # make parallel
    cudnn.benchmark = True
# Loss function
criterion = nn.CrossEntropyLoss()
# Optimizer
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Learning rate decay
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
# Model training routine
print("\nTraining:-\n")
model_ft.cuda()
def train_model(model, criterion, optimizer, scheduler, num_epochs=30):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    # Tensorboard summary
    writer = SummaryWriter()
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()   # Set model to training mode
            else:
                model.eval()    # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward; track history only if in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            # Record loss and accuracy for each phase
            if phase == 'train':
                writer.add_scalar('Train/Loss', epoch_loss, epoch)
                writer.add_scalar('Train/Accuracy', epoch_acc, epoch)
                writer.flush()
            else:
                writer.add_scalar('Valid/Loss', epoch_loss, epoch)
                writer.add_scalar('Valid/Accuracy', epoch_acc, epoch)
                writer.flush()
            # deep copy the model
            if phase == 'valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
# Train the model
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=num_epochs)
Update #2
# Optimizer
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Learning rate decay
exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer_ft, 'min', patience=5, factor=0.5)
# Changed inside train_model:
if phase == 'train':
    # scheduler.step()
    scheduler.step(loss)
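Note that at the point where scheduler.step(loss) is called above, loss is still the loss of the last training batch. The usage pattern in the PyTorch docs steps ReduceLROnPlateau once per epoch with the validation loss. A minimal, self-contained sketch of that pattern (stand-in linear model and random data, not the question's actual setup):

import torch
from torch import nn, optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

model = nn.Linear(8, 2)                       # stand-in for model_ft
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)

# Random stand-in data so the sketch runs on its own.
x_train, y_train = torch.randn(64, 8), torch.randint(0, 2, (64,))
x_valid, y_valid = torch.randn(32, 8), torch.randint(0, 2, (32,))

for epoch in range(20):
    # training phase
    model.train()
    optimizer.zero_grad()
    loss = criterion(model(x_train), y_train)
    loss.backward()
    optimizer.step()

    # validation phase
    model.eval()
    with torch.no_grad():
        val_loss = criterion(model(x_valid), y_valid).item()

    # Step the plateau scheduler once per epoch with the validation metric.
    scheduler.step(val_loss)
    print(epoch, val_loss, optimizer.param_groups[0]['lr'])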
Result
1 Answer
I think the problem may be your learning-rate schedule lowering the learning rate too quickly (every 7 epochs). I would suggest trying ReduceLROnPlateau, which waits until you have stopped making progress for a few epochs before reducing the learning rate; if I were you, I would try it in place of the StepLR scheduler. You will also need to replace scheduler.step() with scheduler.step(loss), because the scheduler needs to know the current loss in order to decide whether the learning rate should be reduced.
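To make the suggested behavior concrete, here is a small runnable sketch (a toy parameter and simulated losses, not the question's training loop): with ReduceLROnPlateau(patience=5, factor=0.5) the learning rate is held at 1e-3 while the monitored loss keeps improving and is halved only after it has stopped improving for more than `patience` epochs, unlike StepLR(step_size=7, gamma=0.1), which decays it every 7 epochs regardless of progress.

import torch
from torch import nn, optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# A single toy parameter just to give the optimizer a learning rate to manage.
params = [nn.Parameter(torch.zeros(1))]
optimizer = optim.SGD(params, lr=0.001, momentum=0.9)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)

# Simulated validation losses: improving for a few epochs, then flat.
val_losses = [1.0, 0.8, 0.7, 0.65] + [0.65] * 10

for epoch, val_loss in enumerate(val_losses):
    scheduler.step(val_loss)  # pass the metric, as the answer suggests
    # The printed learning rate stays at 0.001 while the loss improves and
    # drops to 0.0005 only after more than `patience` epochs without improvement.
    print(epoch, val_loss, optimizer.param_groups[0]['lr'])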