Official code test:

```
(python3.8) [baichuan@localhost baichuan-7B]$ python3 generate.py
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
登鹳雀楼 -> 王之涣
夜雨寄北 -> 李商隐
过零丁洋 -> 文天祥
己亥杂诗(其五) -> 龚自珍
```
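For reference, that warning is normally emitted by accelerate when a device map is inferred before the embedding weights are tied. Below is a minimal sketch of how it can usually be silenced; the local model path and the use of accelerate's `init_empty_weights` / `infer_auto_device_map` are assumptions for illustration, not part of the original run.

```python
# Sketch: tie weights before inferring a device map (assumes `accelerate` is installed;
# MODEL_PATH is a hypothetical local checkpoint directory).
import torch
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

MODEL_PATH = "../model/"

config = AutoConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
with init_empty_weights():
    # Build the model structure without allocating real weights.
    empty_model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
empty_model.tie_weights()  # tie input/output embeddings first, as the warning asks
device_map = infer_auto_device_map(empty_model, dtype=torch.float16)
print(device_map)
```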
j8ag8udp1#
Sample code to work around running out of 24 GB of GPU memory:
```python
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

PRE_TRAINED_MODEL_PATH = "../model/"


# program entry point
def main():
    os.environ["CUDA_VISIBLE_DEVICES"] = "2"
    tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_PATH, trust_remote_code=True)
    # set the pad token as the <unk> token
    tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id
    if tokenizer.pad_token_id == 64000:
        tokenizer.pad_token_id = 0  # for baichuan model (need fix)
    config = AutoConfig.from_pretrained(PRE_TRAINED_MODEL_PATH, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        PRE_TRAINED_MODEL_PATH,
        config=config,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        device_map="auto",
        low_cpu_mem_usage=True,
    )
    with torch.autocast("cuda"):
        while True:
            try:
                input_txt = input("user:")
                inputs = tokenizer(input_txt, return_tensors='pt')
                inputs = inputs.to("cuda:0")
                response = model.generate(**inputs, max_new_tokens=64, repetition_penalty=1.1)
                response = tokenizer.decode(response.cpu()[0], skip_special_tokens=True)
                print("bot:", response)
                torch.cuda.empty_cache()
            except Exception as e:
                print(e)
                break


if __name__ == '__main__':
    main()
```
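If a single 24 GB card is still not enough (for example after a library upgrade), a commonly used variant is to cap the GPU allocation and let `device_map="auto"` offload the remaining layers to CPU RAM. This is a sketch, not a verified fix: the `"20GiB"` / `"48GiB"` limits are assumptions and should be tuned to the actual hardware.

```python
# Sketch: cap GPU memory and offload overflow layers to CPU RAM.
# The "20GiB" / "48GiB" limits are assumptions; adjust them for your machine.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

PRE_TRAINED_MODEL_PATH = "../model/"

config = AutoConfig.from_pretrained(PRE_TRAINED_MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    PRE_TRAINED_MODEL_PATH,
    config=config,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",          # let transformers/accelerate place layers automatically
    low_cpu_mem_usage=True,
    max_memory={0: "20GiB", "cpu": "48GiB"},  # GPU 0 capped, remainder offloaded to CPU
)
```

Offloaded layers run on the CPU, so generation will be slower, but it avoids the out-of-memory error.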
y53ybaqx2#
Hi, the workaround proposed above no longer works. Is there a newer approach?