I built a 2D segmentation model with pytorch_lightning and the monai package (BasicUNet), and it works fine.
Now I have added a CRF (conditional random field) inside my model to refine the results. However, when I run the training, the program seems to get stuck on the line where I compute the CRF loss: the cursor just keeps blinking and no new output appears.
Please help me!!
Here is the model I want to train:
import torch
import pytorch_lightning
from torchcrf import CRF  # CRF from the pytorch-crf package (assumed; matches the num_tags/decode API used below)
from monai.networks.nets import BasicUNet
from monai.losses import DiceCELoss
from monai.metrics import DiceMetric
from monai.transforms import (
Compose, EnsureType, AsDiscrete, FillHoles, AddChanneld, Orientationd,
NormalizeIntensityd, RandGaussianNoised, RandRotated, RandGaussianSmoothd,
RandFlipd, RandKSpaceSpikeNoised, RandCoarseDropoutd, RandZoomd, ToTensord,
)
from monai.data import CacheDataset, DataLoader, list_data_collate, decollate_batch
from monai.inferers import sliding_window_inference
from monai.utils import set_determinism

class ShoulderNetBis(pytorch_lightning.LightningModule):
def __init__(self, images, labels):
super().__init__()
self.images = images
self.labels = labels
#self.save_hyperparameters()
# Basic Unet for our model
self._model = BasicUNet(
spatial_dims=2,
in_channels=1,
out_channels=2, # 2 classes : bones & background
)
self.CRF = CRF(num_tags=2)
''' Former non-functional version; it worked once I changed UNet to BasicUNet
self._model = UNet(
spatial_dims = 2,
in_channels=1,
out_channels=2,
channels = (16, 32, 64, 128, 256),
strides = (1,1,1,1),
num_res_units = 2,
norm = Norm.BATCH,
act='softmax'
)
'''
'''
We use a DiceCELoss which is a combination of a Dice Loss and Cross Entropy Loss
'''
self.loss_function = DiceCELoss(include_background=True, to_onehot_y=True, sigmoid=True)
self.post_pred = Compose([EnsureType('tensor', device='cpu'), AsDiscrete(argmax=True, to_onehot=2), FillHoles()]) # Standardisation of the prediction
self.post_label = Compose([EnsureType('tensor', device='cpu'), AsDiscrete(to_onehot=2)]) # Standardisation of the labels
self.dice_metric = DiceMetric(include_background=True, reduction='mean', get_not_nans=False) # Our metric for computing the performance of our segmentation
self.best_val_dice = 0
self.best_val_epoch = 0
def load_weight(self):
self._model.load_state_dict(torch.load(r'C:\Users\Demo 1A\Documents\fichier vs codes\Bone_segmentation_3axes.m\Data\weights\state_dict__3axes__model.pth'))
def forward(self, x): # Forward Loop
logits = self._model(x)
probs = torch.sigmoid(logits)
if self.training:
return probs
else :
preds = self.CRF.decode(probs)
return preds
def prepare_data(self):
data_dicts = [
{'image': image_name, 'label': label_name}
for image_name, label_name in zip(self.images, self.labels)
]
train_files, val_files = data_dicts[:-20], data_dicts[-20:] # keep the last 20 cases for validation
set_determinism(seed=0)
train_transforms = Compose([
AddChanneld(keys=['image', 'label']), # we need to add a channel for training
Orientationd(keys=['image', 'label'], axcodes='RAS'),
NormalizeIntensityd(keys=['image']),
# Applying some data augmentation
RandGaussianNoised(keys=["image"], prob=0.5, mean=0.5, std=0.3),
RandRotated(keys=["image", "label"], prob=0.5),
RandGaussianSmoothd(keys=["image"], prob=0.5),
RandFlipd(keys=["image", "label"], prob=0.5),
RandKSpaceSpikeNoised(keys=["image"], prob=0.5),
RandCoarseDropoutd(keys=["image"], prob=0.5, holes=50, spatial_size=20),
RandZoomd(keys=["image", "label"], prob=0.2, min_zoom=0.5, max_zoom= 1.5),
ToTensord(keys=["image", "label"], dtype=torch.float)
])
val_transforms = Compose([
AddChanneld(keys=["image", "label"]),
Orientationd(keys=["image", "label"], axcodes="RAS"),
NormalizeIntensityd(keys=['image']),
ToTensord(keys=["image", "label"], dtype=torch.float)
])
# Loading data in dataset module
self.train_ds = CacheDataset(
data=train_files, transform=train_transforms,
num_workers=4, cache_rate=1.0
)
self.val_ds = CacheDataset(
data=val_files, transform=val_transforms,
num_workers=4, cache_rate=1.0
)
def train_dataloader(self):
train_loader = DataLoader(
self.train_ds, shuffle=True,
num_workers=4, collate_fn = list_data_collate
)
return train_loader
def val_dataloader(self):
val_loader = DataLoader(
self.val_ds, num_workers=4)
return val_loader
def configure_optimizers(self):
optimizer = torch.optim.Adam(
[{'params' : self._model.parameters()} , {'params' : self.CRF.parameters(), 'lr' : 1e-3}]
)
return optimizer
def on_train_epoch_start(self):
self.CRF.eval()
def on_train_epoch_end(self):
self.CRF.train()
def permute_dimentions(self, x, y, mode):
if mode == 'crf_loss':
x = x.permute(0, 2, 3, 1).contiguous().view(-1, 1, 2) # Flatten to (seq_len, batch, num_tags) for the CRF
tags = y.contiguous().view(-1, 1).long() # labels flattened to (seq_len, batch)
return x, tags
elif mode == 'unet_out':
out = x.view(1, 512, 512, 2).permute(0, 3, 1, 2).contiguous() # Convert back to [batch_size, num_classes, height, width]
tags = y.view(1, 1, out.shape[2], out.shape[3])
return out, tags
def training_step(self, batch, batch_idx):
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
images, labels = batch['image'].to(device), batch['label'].to(device)
output = self.forward(images)
print(" output shape of the forward method", output.shape)
print(" labels shape of the forward method", labels.shape)
loss = self.loss_function(output, labels)
output, labels =self.permute_dimentions(output,labels, mode='crf_loss')
print(" output shape of the forward method", output.shape, output.dtype)
print(" labels shape of the forward method", labels.shape, labels.dtype)
crf_loss = self.CRF(output, labels)
output, labels =self.permute_dimentions(output, labels, mode='unet_out')
print(" output shape of the forward method", output.shape, output.dtype)
print(" labels shape of the forward method", labels.shape, labels.dtype)
loss += crf_loss
tensorboard_logs = {'train_loss' : loss.item()}
self.log('train_loss', loss.item())
return {'loss':loss, 'log': tensorboard_logs}
def validation_step(self, batch, batch_idx):
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
images, labels = batch['image'].to(device), batch['label'].to(device)
roi_size = (512,512)
sw_batch_size = 4
outputs = sliding_window_inference(
images, roi_size, sw_batch_size, self.forward)
loss = self.loss_function(outputs, labels)
outputs, labels = self.permute_dimentions(outputs, labels, mode='crf_loss')
crf_loss = self.CRF(outputs, labels)
outputs, labels = self.permute_dimentions(outputs, labels, mode='unet_out')
loss += crf_loss
outputs = [self.post_pred(i) for i in decollate_batch(outputs)]
labels = [self.post_label(i) for i in decollate_batch(labels)]
self.dice_metric(y_pred=outputs, y=labels)
return {"val_loss": loss, "val_number": len(outputs)}
def validation_epoch_end(self, outputs):
val_loss, num_items = 0, 0
for output in outputs:
val_loss += output['val_loss'].sum().item()
num_items += output['val_number']
mean_val_dice = self.dice_metric.aggregate().item()
self.dice_metric.reset()
mean_val_loss = torch.tensor(val_loss / num_items)
tensorboard_logs = {
'val_dice': mean_val_dice,
'val_loss': mean_val_loss,
}
self.log('val_dice', mean_val_dice)
self.log('val_loss', mean_val_loss)
if mean_val_dice > self.best_val_dice:
self.best_val_dice = mean_val_dice
self.best_val_epoch = self.current_epoch
print(
f'current epoch: {self.current_epoch}'
f"current mean dice: {mean_val_dice:.4f}"
f"\nbest mean dice: {self.best_val_dice:.4f} "
f"at epoch: {self.best_val_epoch}"
)
return {"log": tensorboard_logs} # You can visualize our results on TensorBoard
I expect it to complete the training, but it gets stuck here:
crf_loss = self.CRF(outputs, labels)
Here is the output:
C:\Users\Demo 1A\AppData\Roaming\Python\Python39\site-packages\pytorch_lightning\trainer\connectors\accelerator_connector.py:466: LightningDeprecationWarning: Setting `Trainer(gpus=1)` is deprecated in v1.7 and will be removed in
v2.0. Please use `Trainer(accelerator='gpu', devices=1)` instead.
rank_zero_deprecation(
C:\Users\Demo 1A\AppData\Roaming\Python\Python39\site-packages\pytorch_lightning\trainer\connectors\accelerator_connector.py:486: UserWarning: The flag `devices=1` will be ignored, instead the device specific number 1 will be used
rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
C:\Users\Demo 1A\AppData\Roaming\Python\Python39\site-packages\monai\utils\deprecate_utils.py:107: FutureWarning: <class 'monai.transforms.utility.array.AddChannel'>: Class `AddChannel` has been deprecated since version 0.8. please use MetaTensor data type and monai.transforms.EnsureChannelFirst instead.
warn_deprecated(obj, msg, warning_category)
Loading dataset: 0%| | 0/304 [00:00<?, ?it/s]C:\Users\Demo 1A\AppData\Roaming\Python\Python39\site-packages\monai\transforms\spatial\array.py:704: UserWarning: axcodes ('RAS') length is smaller than the number of input spatial dimensions D=2.
Orientation: input spatial shape is torch.Size([512, 512]), num. channels is 1,please make sure the input is in the channel-first format.
warnings.warn(
Loading dataset: 100%|█████████████████████████████████████████████████████████| 304/304 [00:02<00:00, 133.93it/s]
Loading dataset: 100%|█████████████████████████████████████████████████████████| 304/304 [00:02<00:00, 137.12it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
---------------------------------------------
0 | _model | BasicUNet | 2.0 M
1 | CRF | CRF | 8
2 | loss_function | DiceCELoss | 0
---------------------------------------------
2.0 M Trainable params
0 Non-trainable params
2.0 M Total params
7.914 Total estimated model params size (MB)
Epoch 0: 0%| | 0/608 [00:00<?, ?it/s]
output shape of the forward method torch.Size([1, 2, 512, 512])
labels shape of the forward method torch.Size([1, 1, 512, 512])
output shape of the forward method torch.Size([262144, 1, 2]) torch.float32
labels shape of the forward method torch.Size([262144, 1]) torch.int64
1 Answer (py49o6xq1):
It is probably running on the CPU. Try moving the model to cuda:
self.CRF = CRF(num_tags=2).to(device)
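A minimal sketch of that suggestion, assuming the CRF here is torchcrf.CRF from the pytorch-crf package (the num_tags argument, the decode method and the 8 parameters in the model summary point that way) and keeping the (seq_len, batch, num_tags) layout printed in the question:

import torch
from torchcrf import CRF  # pytorch-crf package (an assumption, see above)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
crf = CRF(num_tags=2).to(device)  # move the CRF parameters onto the GPU

# Dummy emissions/tags in the layout the question uses: (seq_len, batch, num_tags) and (seq_len, batch).
# seq_len is kept small here; the question's 512x512 image flattens to seq_len = 262144.
emissions = torch.randn(1024, 1, 2, device=device)
tags = torch.randint(0, 2, (1024, 1), device=device)

crf_loss = -crf(emissions, tags)  # torchcrf returns a log-likelihood, so negate it to use it as a loss
print(crf_loss.item())

Inside the LightningModule, self.device can replace the hard-coded cuda:0, and the emissions/tags can be moved with .to(self.device) just before the CRF call. Even with everything on the GPU, the call may still appear stuck for a while, because flattening a 512x512 image gives the CRF a 262,144-step sequence to process.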