PyTorch: resuming from a checkpoint with the Hugging Face Transformers Trainer gives a device error

toe95027 · asked 12 months ago

I have pretrained a BERT model with the Hugging Face transformers library. It was working as expected, but I stopped training partway through, and now I am having trouble resuming it.
For training I used:

from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast(tokenizer_file="tokenizer.json",max_len=512)

from transformers import BertConfig
from transformers import BertForMaskedLM    
config = BertConfig(vocab_size=50_000)
model = BertForMaskedLM(config=config)
model.num_parameters()

from transformers import DataCollatorForWholeWordMask

data_collator_wwm = DataCollatorForWholeWordMask(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./mybert",
    overwrite_output_dir=False,
    num_train_epochs=6,
    per_gpu_train_batch_size=32,

    logging_steps = 0.001,
    save_strategy = 'steps',
    save_steps= 0.05,
    save_total_limit=20,
    evaluation_strategy='steps',
    eval_steps = 0.05,

    tf32 = True,
    optim = "adamw_torch_fused",
    group_by_length = True,

    prediction_loss_only=True,
    #resume_from_checkpoint=True,

    hub_model_id = 'mybert',
    push_to_hub = True,
    hub_strategy = 'every_save',
    hub_private_repo = True,
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator_wwm,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test']
    
)

trainer.train()
# trainer.train(resume_from_checkpoint=True)
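
For reference, train() also accepts an explicit checkpoint directory instead of True; the folder name below is just one of the checkpoints saved under output_dir:

# resume from a specific checkpoint directory instead of the most recent one
trainer.train(resume_from_checkpoint="./mybert/checkpoint-5410440")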

Now, to resume training, I swapped the two lines at the end so that trainer.train(resume_from_checkpoint=True) is called instead, but I get a device error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[11], line 45
     35 trainer = Trainer(
     36     model=model,
     37     args=training_args,
   (...)
     41     
     42 )
     44 # trainer.train()
---> 45 trainer.train(resume_from_checkpoint=True)

File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\transformers\trainer.py:1539, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1534     self.model_wrapped = self.model
   1536 inner_training_loop = find_executable_batch_size(
   1537     self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
   1538 )
-> 1539 return inner_training_loop(
   1540     args=args,
   1541     resume_from_checkpoint=resume_from_checkpoint,
   1542     trial=trial,
   1543     ignore_keys_for_eval=ignore_keys_for_eval,
   1544 )

File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\transformers\trainer.py:1888, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1886     optimizer_was_run = scale_before <= scale_after
   1887 else:
-> 1888     self.optimizer.step()
   1889     optimizer_was_run = not self.accelerator.optimizer_step_was_skipped
   1891 if optimizer_was_run:
   1892     # Delay optimizer scheduling until metrics are generated

File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\accelerate\optimizer.py:142, in AcceleratedOptimizer.step(self, closure)
    139     self._last_scale = scale_after
    141 else:
--> 142     self.optimizer.step(closure)

File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\torch\optim\lr_scheduler.py:69, in LRScheduler.__init__.<locals>.with_counter.<locals>.wrapper(*args, **kwargs)
     67 instance._step_count += 1
     68 wrapped = func.__get__(instance, cls)
---> 69 return wrapped(*args, **kwargs)

File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\torch\optim\optimizer.py:280, in Optimizer.profile_hook_step.<locals>.wrapper(*args, **kwargs)
    276         else:
    277             raise RuntimeError(f"{func} must return None or a tuple of (new_args, new_kwargs),"
    278                                f"but got {result}.")
--> 280 out = func(*args, **kwargs)
    281 self._optimizer_step_code()
    283 # call optimizer step post hooks

File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\torch\optim\optimizer.py:33, in _use_grad_for_differentiable.<locals>._use_grad(self, *args, **kwargs)
     31 try:
     32     torch.set_grad_enabled(self.defaults['differentiable'])
---> 33     ret = func(self, *args, **kwargs)
     34 finally:
     35     torch.set_grad_enabled(prev_grad)

File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\torch\optim\adamw.py:171, in AdamW.step(self, closure)
    158     beta1, beta2 = group["betas"]
    160     self._init_group(
    161         group,
    162         params_with_grad,
   (...)
    168         state_steps,
    169     )
--> 171     adamw(
    172         params_with_grad,
    173         grads,
    174         exp_avgs,
    175         exp_avg_sqs,
    176         max_exp_avg_sqs,
    177         state_steps,
    178         amsgrad=amsgrad,
    179         beta1=beta1,
    180         beta2=beta2,
    181         lr=group["lr"],
    182         weight_decay=group["weight_decay"],
    183         eps=group["eps"],
    184         maximize=group["maximize"],
    185         foreach=group["foreach"],
    186         capturable=group["capturable"],
    187         differentiable=group["differentiable"],
    188         fused=group["fused"],
    189         grad_scale=getattr(self, "grad_scale", None),
    190         found_inf=getattr(self, "found_inf", None),
    191     )
    193 return loss

File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\torch\optim\adamw.py:321, in adamw(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, foreach, capturable, differentiable, fused, grad_scale, found_inf, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize)
    318 else:
    319     func = _single_tensor_adamw
--> 321 func(
    322     params,
    323     grads,
    324     exp_avgs,
    325     exp_avg_sqs,
    326     max_exp_avg_sqs,
    327     state_steps,
    328     amsgrad=amsgrad,
    329     beta1=beta1,
    330     beta2=beta2,
    331     lr=lr,
    332     weight_decay=weight_decay,
    333     eps=eps,
    334     maximize=maximize,
    335     capturable=capturable,
    336     differentiable=differentiable,
    337     grad_scale=grad_scale,
    338     found_inf=found_inf,
    339 )

File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\torch\optim\adamw.py:615, in _fused_adamw(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, grad_scale, found_inf, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize, capturable, differentiable)
    613     device_found_inf = found_inf_dict[device]
    614 torch._foreach_add_(device_state_steps, 1)
--> 615 torch._fused_adamw_(
    616     device_params,
    617     device_grads,
    618     device_exp_avgs,
    619     device_exp_avg_sqs,
    620     device_max_exp_avg_sqs,
    621     device_state_steps,
    622     amsgrad=amsgrad,
    623     lr=lr,
    624     beta1=beta1,
    625     beta2=beta2,
    626     weight_decay=weight_decay,
    627     eps=eps,
    628     maximize=maximize,
    629     grad_scale=device_grad_scale,
    630     found_inf=device_found_inf,
    631 )
    632 if device_found_inf is not None:
    633     torch._foreach_sub_(device_state_steps, [device_found_inf] * len(device_state_steps))

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument state_steps in method wrapper_CUDA___fused_adamw_)

Things I have tried:
1. Loading the model from the checkpoint explicitly (even though the last checkpoint should be picked up automatically): model = BertForMaskedLM.from_pretrained('mybert\checkpoint-5410440')
2. Sending the model to the CUDA device with model = model.to("cuda").
3. Removing resume_from_checkpoint from the args.

Edit: 4. I also tried upgrading transformers from 4.31.0 to 4.33.1 (not ideal, since training was done with 4.31.0), but got a somewhat similar error:

`fused=True` requires all the params to be CUDA, floating point Tensor
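
Both tracebacks are raised inside the fused AdamW path that optim = "adamw_torch_fused" selects, so one idea I have not tried yet is to resume with the non-fused optimizer, which at least avoids the fused kernel's same-device check (the other training arguments are omitted here for brevity):

from transformers import TrainingArguments

# untested idea: switch from "adamw_torch_fused" to the plain PyTorch AdamW,
# since both errors come from the fused kernel's device/dtype checks
training_args = TrainingArguments(
    output_dir="./mybert",
    optim="adamw_torch",
)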
Answer 1 (vyswwuz2):

As pointed out on the Hugging Face GitHub, this is a known issue with how the Trainer loads the optimizer state from a checkpoint.
Changing line 2542 of trainer.py from

map_location = self.args.device if self.args.world_size > 1 else "cpu"

to

map_location = self.args.device

fixes the problem. (Hugging Face Transformers v4.30.1, PyTorch v2.0.1)
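
If you prefer not to edit the installed package, a minimal sketch of an equivalent workaround (assuming your version still restores optimizer.pt inside Trainer._load_optimizer_and_scheduler) is to subclass Trainer and move the loaded optimizer state onto the training device yourself:

import torch
from transformers import Trainer

class DeviceFixTrainer(Trainer):
    # Sketch of a workaround: after the parent Trainer has restored optimizer.pt
    # from the checkpoint, move every optimizer state tensor (including "step",
    # which is the one the fused AdamW kernel complains about) onto args.device.
    def _load_optimizer_and_scheduler(self, checkpoint):
        super()._load_optimizer_and_scheduler(checkpoint)
        if checkpoint is None or self.optimizer is None:
            return
        for state in self.optimizer.state.values():
            for key, value in state.items():
                if torch.is_tensor(value):
                    state[key] = value.to(self.args.device)

Use DeviceFixTrainer in place of Trainer in the question's code and call trainer.train(resume_from_checkpoint=True) as before.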
