pytorch 我的程序还在运行,控制台上的光标在 Flink ,训练还没有开始

34gzjxbg  于 2023-03-02  发布在  Flink
关注(0)|答案(1)|浏览(165)

我使用pytorch_lightning和monai包(basic unet)构建了一个2D分割模型,它们工作正常。
现在,我添加了一个CRF(条件随机场)内我的模型,以完善的结果。然而,当我运行的训练,它似乎是该程序被卡住运行我计算的CRF-损失的行,你会看到光标 Flink ,将不会有新的输出。
求你救救我!!
"这是我想要训练的模特"

class ShoulderNetBis(pytorch_lightning.LightningModule):
    def __init__(self, images, labels):
        super().__init__()
        
        self.images = images
        self.labels = labels
        #self.save_hyperparameters()
        # Basic Unet for our model
        self._model = BasicUNet(
            spatial_dims=2,
            in_channels=1,
            out_channels=2, # 2 classes : bones & background
        )
        
        self.CRF = CRF(num_tags=2)
        ''' Former non functional version, it works since I change UNet in BasicUNet

        self._model = UNet(
            spatial_dims = 2,
            in_channels=1,
            out_channels=2,
            channels = (16, 32, 64, 128, 256),
            strides = (1,1,1,1),
            num_res_units = 2,
            norm = Norm.BATCH,
            act='softmax'
        )
        '''

        ''' 
        We use a DiceCELoss which is a combination of a Dice Loss and Cross Entropy Loss
        '''

        self.loss_function = DiceCELoss(include_background=True, to_onehot_y=True, sigmoid=True) 
        self.post_pred = Compose([EnsureType('tensor', device='cpu'), AsDiscrete(argmax=True, to_onehot=2), FillHoles()]) # Standardisation of the prediction
        self.post_label = Compose([EnsureType('tensor', device='cpu'), AsDiscrete(to_onehot=2)]) # Standardisation of the labels
        self.dice_metric = DiceMetric(include_background=True, reduction='mean', get_not_nans=False) # Our metric for computing the performance of our segmentation
        
        self.best_val_dice = 0
        self.best_val_epoch = 0

    def load_weight(self):
        self._model.load_state_dict(r'C:\Users\Demo 1A\Documents\fichier vs codes\Bone_segmentation_3axes.m\Data\weights\state_dict__3axes__model.pth')

    def forward(self, x): # Forward Loop
        logits = self._model(x)
        probs = torch.sigmoid(logits)
        if self.training:
          return probs
        else :
          preds = self.crf.decode(probs)
          return preds

    def prepare_data(self):
        data_dicts = [
            {'image': image_name, 'label': label_name} 
            for image_name, label_name in zip(self.images, self.labels) 
        ]
        train_files, val_files = data_dicts[:-20], data_dicts[20:] 
        set_determinism(seed=0)
        
        train_transforms = Compose([
            AddChanneld(keys=['image', 'label']), # we need to add a channel fo the training
            Orientationd(keys=['image', 'label'], axcodes='RAS'),
            NormalizeIntensityd(keys=['image']),

            # Applying some data augmentation
            RandGaussianNoised(keys=["image"], prob=0.5, mean=0.5, std=0.3),
            RandRotated(keys=["image", "label"], prob=0.5),
            RandGaussianSmoothd(keys=["image"], prob=0.5),
            RandFlipd(keys=["image", "label"], prob=0.5),
            RandKSpaceSpikeNoised(keys=["image"], prob=0.5),
            RandCoarseDropoutd(keys=["image"], prob=0.5, holes=50, spatial_size=20),
            RandZoomd(keys=["image", "label"], prob=0.2, min_zoom=0.5, max_zoom= 1.5),


            ToTensord(keys=["image", "label"], dtype=torch.float)
        ])

        val_transforms = Compose([
            AddChanneld(keys=["image", "label"]),
            Orientationd(keys=["image", "label"], axcodes="RAS"),
            NormalizeIntensityd(keys=['image']),
            ToTensord(keys=["image", "label"], dtype=torch.float)
        ])
        # Loading data in dataset module
        self.train_ds = CacheDataset(
            data=train_files, transform=train_transforms,
            num_workers=4, cache_rate=1.0
        )
        
        self.val_ds = CacheDataset(
            data=val_files, transform=val_transforms,
            num_workers=4, cache_rate=1.0
        )
    
    def train_dataloader(self):
        train_loader = DataLoader(
            self.train_ds, shuffle=True,
            num_workers=4, collate_fn = list_data_collate
        )
        return train_loader

    def val_dataloader(self):  
        val_loader = DataLoader(
            self.val_ds, num_workers=4)
        return val_loader

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            [{'params' : self._model.parameters()} , {'params' : self.CRF.parameters(), 'lr' : 1e-3}]
            ) 
        return optimizer

    def on_train_epoch_start(self):
        self.CRF.eval()

    def on_train_epoch_end(self):
        self.CRF.train()
        
    def permute_dimentions(self, x,y, mode):
        if mode == 'crf_loss':
            x = x.permute(0, 2, 3, 1).contiguous().view(-1,1, 2)# Convert to [batch_size, height, width, num_classes]
            tags = y.contiguous().view(-1, 1).long()# labels
           
            return x, tags
        elif mode=='unet_out' :
            out = out.view(1, 2, 512, 512).contiguous()  # Convert back to [batch_size, num_classes, height, width]
            tags = y.view(1, 1, out.shape[2], out.shape[3])
            return out
        

    def training_step(self, batch, batch_idx):
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            images, labels = batch['image'].to(device), batch['label'].to(device)
            output = self.forward(images)
            print(" output shape of the forward method", output.shape)
            print(" labels shape of the forward method", labels.shape)
            loss = self.loss_function(output, labels)
            output, labels =self.permute_dimentions(output,labels, mode='crf_loss')
            print(" output shape of the forward method", output.shape, output.dtype)
            print(" labels shape of the forward method", labels.shape, labels.dtype)
            crf_loss = self.CRF(output, labels)
            output, labels =self.permute_dimentions(output, labels, mode='unet_out')
            print(" output shape of the forward method", output.shape, output.dtype)
            print(" labels shape of the forward method", labels.shape, labels.dtype)
            loss += crf_loss
            tensorboard_logs = {'train_loss' : loss.item()}
            self.log('train_loss', loss.item())
            return {'loss':loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        images, labels = batch['image'].to(device), batch['label'].to(device)
        roi_size = (512,512)
        sw_batch_size = 4
        outputs = sliding_window_inference(
            images, roi_size, sw_batch_size, self.forward)
        loss = self.loss_function(outputs, labels)
        output, labels =self.permute_dimentions(output,labels,  mode='crf_loss')
        crf_loss = self.CRF(outputs, labels)
        output, labels =self.permute_dimentions(output,labels,  mode='unet_out')
        loss += crf_loss
        outputs = [self.post_pred(i) for i in decollate_batch(outputs)]
        labels = [self.post_label(i) for i in decollate_batch(labels)]
        self.dice_metric(y_pred=outputs, y=labels)
        return {"val_loss": loss, "val_number": len(outputs)} 

    def validation_epoch_end(self, outputs):
        val_loss, num_items = 0, 0
        for output in outputs:
            val_loss += output['val_loss'].sum().item()
            num_items += output['val_number']
        mean_val_dice = self.dice_metric.aggregate().item()
        self.dice_metric.reset()
        mean_val_loss = torch.tensor(val_loss / num_items)
        tensorboard_logs = {
            'val_dice': mean_val_dice,
            'val_loss': mean_val_loss,
        }
        self.log('val_dice', mean_val_dice)
        self.log('val_loss', mean_val_loss)
        if mean_val_dice > self.best_val_dice:
            self.best_val_dice = mean_val_dice
            self.best_val_epoch = self.current_epoch
        print(
            f'current epoch: {self.current_epoch}'
            f"current mean dice: {mean_val_dice:.4f}"
            f"\nbest mean dice: {self.best_val_dice:.4f} "
            f"at epoch: {self.best_val_epoch}"
        )
        return {"log": tensorboard_logs} # You can visualize our results on TensorBoard

我希望它能完成训练,但它被困在这里

crf_loss = self.CRF(outputs, labels)

此处为输出

C:\Users\Demo 1A\AppData\Roaming\Python\Python39\site-packages\pytorch_lightning\trainer\connectors\accelerator_connector.py:466: LightningDeprecationWarning: Setting `Trainer(gpus=1)` is deprecated in v1.7 and will be removed in 
v2.0. Please use `Trainer(accelerator='gpu', devices=1)` instead.
  rank_zero_deprecation(
C:\Users\Demo 1A\AppData\Roaming\Python\Python39\site-packages\pytorch_lightning\trainer\connectors\accelerator_connector.py:486: UserWarning: The flag `devices=1` will be ignored, instead the device specific number 1 will be used  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
C:\Users\Demo 1A\AppData\Roaming\Python\Python39\site-packages\monai\utils\deprecate_utils.py:107: FutureWarning: <class 'monai.transforms.utility.array.AddChannel'>: Class `AddChannel` has been deprecated since version 0.8. please use MetaTensor data type and monai.transforms.EnsureChannelFirst instead.
  warn_deprecated(obj, msg, warning_category)
Loading dataset:   0%|                                                                    | 0/304 [00:00<?, ?it/s]C:\Users\Demo 1A\AppData\Roaming\Python\Python39\site-packages\monai\transforms\spatial\array.py:704: UserWarning: axcodes ('RAS') length is smaller than the number of input spatial dimensions D=2.
Orientation: input spatial shape is torch.Size([512, 512]), num. channels is 1,please make sure the input is in the channel-first format.
  warnings.warn(
Loading dataset: 100%|█████████████████████████████████████████████████████████| 304/304 [00:02<00:00, 133.93it/s]
Loading dataset: 100%|█████████████████████████████████████████████████████████| 304/304 [00:02<00:00, 137.12it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type       | Params
---------------------------------------------   
0 | _model        | BasicUNet  | 2.0 M
1 | CRF           | CRF        | 8
2 | loss_function | DiceCELoss | 0
---------------------------------------------   
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.914     Total estimated model params size (MB)
Epoch 0:   0%|                                                                            | 0/608 [00:00<?, ?it/s] 
output shape of the forward method torch.Size([1, 2, 512, 512])
 labels shape of the forward method torch.Size([1, 1, 512, 512])
 output shape of the forward method torch.Size([262144, 1, 2]) torch.float32
 labels shape of the forward method torch.Size([262144, 1]) torch.int64
py49o6xq

py49o6xq1#

可能是在CPU上运行。请尝试将模型移动到cuda:
self.CRF = CRF(num_tags=2).to(device)

相关问题