I pretrained a BERT model with the Hugging Face transformers library. It was working as expected, but I stopped training partway through, and now I'm having trouble resuming it.
For training I used:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast(tokenizer_file="tokenizer.json", model_max_length=512)  # max_len is a deprecated alias

from transformers import BertConfig
from transformers import BertForMaskedLM
config = BertConfig(vocab_size=50_000)
model = BertForMaskedLM(config=config)
model.num_parameters()

from transformers import DataCollatorForWholeWordMask
data_collator_wwm = DataCollatorForWholeWordMask(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./mybert",
    overwrite_output_dir=False,
    num_train_epochs=6,
    per_device_train_batch_size=32,  # per_gpu_train_batch_size is deprecated
    logging_steps=0.001,             # floats in (0, 1) are treated as a ratio of total steps
    save_strategy="steps",
    save_steps=0.05,
    save_total_limit=20,
    evaluation_strategy="steps",
    eval_steps=0.05,
    tf32=True,
    optim="adamw_torch_fused",
    group_by_length=True,
    prediction_loss_only=True,
    # resume_from_checkpoint=True,
    hub_model_id="mybert",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_private_repo=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator_wwm,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

trainer.train()
# trainer.train(resume_from_checkpoint=True)
Now, to resume training, I swapped the commented line so that resume_from_checkpoint is enabled, but I get a device error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[11], line 45
35 trainer = Trainer(
36 model=model,
37 args=training_args,
(...)
41
42 )
44 # trainer.train()
---> 45 trainer.train(resume_from_checkpoint=True)
File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\transformers\trainer.py:1539, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1534 self.model_wrapped = self.model
1536 inner_training_loop = find_executable_batch_size(
1537 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1538 )
-> 1539 return inner_training_loop(
1540 args=args,
1541 resume_from_checkpoint=resume_from_checkpoint,
1542 trial=trial,
1543 ignore_keys_for_eval=ignore_keys_for_eval,
1544 )
File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\transformers\trainer.py:1888, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1886 optimizer_was_run = scale_before <= scale_after
1887 else:
-> 1888 self.optimizer.step()
1889 optimizer_was_run = not self.accelerator.optimizer_step_was_skipped
1891 if optimizer_was_run:
1892 # Delay optimizer scheduling until metrics are generated
File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\accelerate\optimizer.py:142, in AcceleratedOptimizer.step(self, closure)
139 self._last_scale = scale_after
141 else:
--> 142 self.optimizer.step(closure)
File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\torch\optim\lr_scheduler.py:69, in LRScheduler.__init__.<locals>.with_counter.<locals>.wrapper(*args, **kwargs)
67 instance._step_count += 1
68 wrapped = func.__get__(instance, cls)
---> 69 return wrapped(*args, **kwargs)
File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\torch\optim\optimizer.py:280, in Optimizer.profile_hook_step.<locals>.wrapper(*args, **kwargs)
276 else:
277 raise RuntimeError(f"{func} must return None or a tuple of (new_args, new_kwargs),"
278 f"but got {result}.")
--> 280 out = func(*args, **kwargs)
281 self._optimizer_step_code()
283 # call optimizer step post hooks
File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\torch\optim\optimizer.py:33, in _use_grad_for_differentiable.<locals>._use_grad(self, *args, **kwargs)
31 try:
32 torch.set_grad_enabled(self.defaults['differentiable'])
---> 33 ret = func(self, *args, **kwargs)
34 finally:
35 torch.set_grad_enabled(prev_grad)
File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\torch\optim\adamw.py:171, in AdamW.step(self, closure)
158 beta1, beta2 = group["betas"]
160 self._init_group(
161 group,
162 params_with_grad,
(...)
168 state_steps,
169 )
--> 171 adamw(
172 params_with_grad,
173 grads,
174 exp_avgs,
175 exp_avg_sqs,
176 max_exp_avg_sqs,
177 state_steps,
178 amsgrad=amsgrad,
179 beta1=beta1,
180 beta2=beta2,
181 lr=group["lr"],
182 weight_decay=group["weight_decay"],
183 eps=group["eps"],
184 maximize=group["maximize"],
185 foreach=group["foreach"],
186 capturable=group["capturable"],
187 differentiable=group["differentiable"],
188 fused=group["fused"],
189 grad_scale=getattr(self, "grad_scale", None),
190 found_inf=getattr(self, "found_inf", None),
191 )
193 return loss
File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\torch\optim\adamw.py:321, in adamw(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, foreach, capturable, differentiable, fused, grad_scale, found_inf, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize)
318 else:
319 func = _single_tensor_adamw
--> 321 func(
322 params,
323 grads,
324 exp_avgs,
325 exp_avg_sqs,
326 max_exp_avg_sqs,
327 state_steps,
328 amsgrad=amsgrad,
329 beta1=beta1,
330 beta2=beta2,
331 lr=lr,
332 weight_decay=weight_decay,
333 eps=eps,
334 maximize=maximize,
335 capturable=capturable,
336 differentiable=differentiable,
337 grad_scale=grad_scale,
338 found_inf=found_inf,
339 )
File c:\Users\user01\Documents\mybert\myvenv\lib\site-packages\torch\optim\adamw.py:615, in _fused_adamw(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, grad_scale, found_inf, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize, capturable, differentiable)
613 device_found_inf = found_inf_dict[device]
614 torch._foreach_add_(device_state_steps, 1)
--> 615 torch._fused_adamw_(
616 device_params,
617 device_grads,
618 device_exp_avgs,
619 device_exp_avg_sqs,
620 device_max_exp_avg_sqs,
621 device_state_steps,
622 amsgrad=amsgrad,
623 lr=lr,
624 beta1=beta1,
625 beta2=beta2,
626 weight_decay=weight_decay,
627 eps=eps,
628 maximize=maximize,
629 grad_scale=device_grad_scale,
630 found_inf=device_found_inf,
631 )
632 if device_found_inf is not None:
633 torch._foreach_sub_(device_state_steps, [device_found_inf] * len(device_state_steps))
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument state_steps in method wrapper_CUDA___fused_adamw_)
Things I tried (a combined sketch follows this list):
1. Loading the model from the checkpoint, even though it should pick up the last checkpoint automatically: model = BertForMaskedLM.from_pretrained('mybert\checkpoint-5410440')
2. Sending the model to the CUDA device with model = model.to("cuda")
3. Removing resume_from_checkpoint from the args
Edit: 4. I tried upgrading transformers from 4.31.0 to 4.33.1 (not ideal, since the training was done with 4.31.0), but got a somewhat similar error:
`fused=True` requires all the params to be CUDA, floating point Tensor
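For reference, attempts 1 and 2 amount to the minimal sketch below; note that resume_from_checkpoint also accepts an explicit checkpoint path instead of True (shown here for completeness):

from transformers import BertForMaskedLM

# attempt 1: load the weights explicitly from the checkpoint folder
model = BertForMaskedLM.from_pretrained("mybert/checkpoint-5410440")
# attempt 2: move the model to the GPU before handing it to the Trainer
model = model.to("cuda")

# rebuild the Trainer with this model, then resume from the explicit path
trainer.train(resume_from_checkpoint="mybert/checkpoint-5410440")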
1 Answer
As pointed out on the Hugging Face GitHub, this is a known issue with how the Trainer reloads the optimizer state. Changing line 2542 in trainer.py, the torch.load call that restores the optimizer state, so that the state is mapped onto the training device instead of the CPU solves the problem. (Hugging Face Transformers v4.30.1, PyTorch v2.0.1)
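For concreteness, here is a sketch of the kind of edit described above, assuming the load in Trainer._load_optimizer_and_scheduler looks roughly like this (the exact line number varies between versions):

# before: on a single-GPU run the optimizer state is restored to the CPU,
# so the fused AdamW later sees state_steps on cpu while params are on cuda:0
map_location = self.args.device if self.args.world_size > 1 else "cpu"
self.optimizer.load_state_dict(
    torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
)

# after: always restore the optimizer state onto the training device
self.optimizer.load_state_dict(
    torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=self.args.device)
)

Patching a file inside site-packages is brittle, so it's worth checking whether a newer release already ships the upstream fix before editing locally.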