Here are my environment versions:
torch: 2.2.1
transformers: 4.39.0.dev0
vllm: custom build from master@24aecf421a4ad5989697010963074904fead9a1b
I quantized my llama-7B model with SqueezeLLM and want to load it with vLLM. Below are my code and the traceback.
#git clone https://github.com/SqueezeAILab/SqueezeLLM.git
#git clone https://github.com/kssteven418/SqueezeLLM-gradients.git
#conda create -n sqllm-grad python=3.9 -y
#conda activate sqllm-grad
#cd SqueezeLLM-gradients
#pip install -e .
#pip install -r requirements.txt  # modified to allow torch>=2.2.1
### Compute gradients
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=16 python run.py --output_dir [gradients_path] --model_name_or_path [model_path]
#cd SqueezeLLM/
#pip install -e .
#cd squeezellm
python setup_cuda.py install
#cd ../quantization
### Chunk model weights and gradients
python chunk_models.py --model [model_path] --output [model_chunk_path] --model_type llama
python chunk_models.py --model [gradients_path] --output [gradients_chunk_path] --model_type llama
### (Optional for D+S quantization) Outlier configuration generation
python generate_outlier_config.py --model [model_chunk_path] --range 1.8 --output [outlier_config]
### K-means clustering
python nuq.py --bit 4 --model_type llama --model [model_chunk_path] --gradient [gradient_chunk_path] --output [lut_path] --outlier_config [outlier_config]/outlier_config_o0.45.json --sensitivity 0.05
### Packing
python pack.py --model [model_path] --wbits 4 --folder [lut_path] --save [pack_path] --include_sparse --balance
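Before handing the output of pack.py to any loader, it can help to list what the packed checkpoint actually contains. This is a minimal inspection sketch, not part of the SqueezeLLM tooling; it assumes the packed file is named pack_model.pt inside the output directory, matching the loading code further below (replace the path with your own [pack_path]):

import torch

# Hypothetical path: replace with your actual [pack_path] from pack.py.
pack_path = "/root/ckpt161_quantization_w4_s0.45"

# Load the packed state dict on CPU and group parameter names by their final
# suffix, to get an overview of what pack.py produced.
state_dict = torch.load(f"{pack_path}/pack_model.pt", map_location="cpu")

by_suffix = {}
for name in state_dict:
    suffix = name.rsplit(".", 1)[-1]
    by_suffix.setdefault(suffix, []).append(name)

for suffix, names in sorted(by_suffix.items()):
    print(f"{suffix}: {len(names)} entries, e.g. {names[0]}")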
AutoModelForCausalLM can load the SqueezeLLM model successfully.
# load_quant adapted from https://github.com/SqueezeAILab/SqueezeLLM/blob/main/llama.py#L136
import os

import torch
from transformers import AutoConfig, AutoModelForCausalLM

from squeezellm.modelutils import *
from squeezellm.quant import *


def load_quant(model, checkpoint, wbits, include_sparse, topX):
    """
    topX is num_dense_channels:
    the number of dense channels used by the hybrid kernel.
    """
    # Build the model skeleton from the base model's config, then swap in LUT layers.
    config = AutoConfig.from_pretrained(model)
    model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)
    model = model.eval()
    layers = find_layers(model)
    state_dict = torch.load(os.path.join(checkpoint, "pack_model.pt"))

    # load sparse thresholds from the checkpoint
    if include_sparse:
        num_vals = {}
        for k, v in state_dict.items():
            if "sparse_threshold." in k:
                key = k.replace("sparse_threshold.", "")
                num_vals[key] = v
        for k, v in num_vals.items():
            del state_dict["sparse_threshold." + k]
    else:
        num_vals = None

    # replace layers with quantized LUT layers (lm_head stays in full precision)
    for name in ["lm_head"]:
        if name in layers:
            del layers[name]
    make_quant_lut(
        model, layers, wbits, include_sparse=include_sparse, numvals=num_vals, topX=topX
    )
    del layers

    print("Loading model ...")
    state_dict = torch.load(os.path.join(checkpoint, "pack_model.pt"))
    model.load_state_dict(state_dict, strict=False)
    model.seqlen = 2048
    print("Done.")
    return model


model = load_quant("llama-2", adapter_path, 4, include_sparse=True, topX=10)
model = model.to(DEV)
model.eval()
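For completeness, this is the kind of quick generation smoke test that can be run after load_quant returns to confirm the transformers path works end to end. It is only an illustrative sketch: it assumes the tokenizer can be loaded from the same base-model path passed to load_quant, and it reuses the model and DEV from the snippet above.

from transformers import AutoTokenizer

# Tokenizer from the unquantized base model; "llama-2" mirrors the load_quant call above.
tokenizer = AutoTokenizer.from_pretrained("llama-2")

inputs = tokenizer("Hello, my name is", return_tensors="pt").to(DEV)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=32, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))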
However, vLLM fails to load it and raises the following error.
from vllm import LLM, SamplingParams
import torch

model_path = '/root/ckpt161_quantization_w4_s0.45'

if __name__ == '__main__':
    llm = LLM(model=model_path, quantization="squeezellm", dtype=torch.float16)
    prompts = [
        "Hello, my name is"
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
Stack trace:
Traceback (most recent call last):
File "/root/python/dictionary/train/testbatchvllm.py", line 58, in <module>
llm = LLM(model=model_path, quantization="squeezellm", dtype=torch.float16)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/python/github.com/vllm/vllm/entrypoints/llm.py", line 109, in __init__
self.llm_engine = LLMEngine.from_engine_args(engine_args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/python/github.com/vllm/vllm/engine/llm_engine.py", line 412, in from_engine_args
engine = cls(*engine_configs,
^^^^^^^^^^^^^^^^^^^^
File "/root/python/github.com/vllm/vllm/engine/llm_engine.py", line 142, in __init__
self._init_workers()
File "/root/python/github.com/vllm/vllm/engine/llm_engine.py", line 200, in _init_workers
self._run_workers("load_model")
File "/root/python/github.com/vllm/vllm/engine/llm_engine.py", line 1086, in _run_workers
driver_worker_output = getattr(self.driver_worker,
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/python/github.com/vllm/vllm/worker/worker.py", line 99, in load_model
self.model_runner.load_model()
File "/root/python/github.com/vllm/vllm/worker/model_runner.py", line 88, in load_model
self.model = get_model(self.model_config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/python/github.com/vllm/vllm/model_executor/utils.py", line 52, in get_model
return get_model_fn(model_config, device_config, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/python/github.com/vllm/vllm/model_executor/model_loader.py", line 86, in get_model
model.load_weights(model_config.model, model_config.download_dir,
File "/root/python/github.com/vllm/vllm/model_executor/models/llama.py", line 388, in load_weights
param = params_dict[name]
~~~~~~~~~~~^^^^^^
KeyError: 'model.layers.0.self_attn.qkv_proj.rows'
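The failing key is produced by the stacked-parameter renaming in vLLM's load_weights: q_proj/k_proj/v_proj checkpoint entries are folded into the stacked qkv_proj parameter before the params_dict lookup. The sketch below replays an approximation of that renaming on the packed checkpoint so you can see every entry that ends up targeting qkv_proj or gate_up_proj; the path, file name, and renaming table are assumptions based on the scripts and traceback above, not vLLM's exact code.

import torch

# Replay an approximation of vLLM's stacked-parameter renaming on the packed
# checkpoint, to see which entries end up pointing at qkv_proj / gate_up_proj.
# Path and file name are assumptions taken from the earlier scripts.
model_path = "/root/ckpt161_quantization_w4_s0.45"
state_dict = torch.load(f"{model_path}/pack_model.pt", map_location="cpu")

rename = {
    "q_proj": "qkv_proj",
    "k_proj": "qkv_proj",
    "v_proj": "qkv_proj",
    "gate_proj": "gate_up_proj",
    "up_proj": "gate_up_proj",
}

for name in state_dict:
    if "layers.0." not in name:  # limit the output to the first decoder layer
        continue
    mapped = name
    for old, new in rename.items():
        mapped = mapped.replace(f".{old}.", f".{new}.")
    if mapped != name:
        print(f"{name} -> {mapped}")

Entries whose suffix is rows, cols, or vals come from packing with --include_sparse; if vLLM's SqueezeLLM layer only registers the dense parameters (which the KeyError on qkv_proj.rows suggests), those entries have nothing to load into, so re-packing without --include_sparse is one thing worth trying.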
5 answers

qhhrdooz 1#
@chooper1, could you take a look at this? Thanks. @SoleMY has shown that the SqueezeLLM-quantized model can be loaded and run with the modified SqueezeLLM transformers code, but vLLM fails at the loading stage.
weylhg0b 2#
I also see the same error with one of my models: KeyError: 'model.layers.0.self_attn.qkv_proj.rows'
yqyhoc1h 3#
It is actually located here: 'model.layers.0.self_attn.qkv_proj.qweight'
7cwmlq89 4#
Did you get any model working with SqueezeLLM/vLLM? I believe this feature was never tested after the merge into vLLM and should be removed.
lb3vh1jj 5#
This error is caused by an index going out of range while the model weights are being loaded. To fix it, you need to inspect model.layers.0.self_attn.qkv_proj.qweight and make sure start_idx and output_dim are within the correct range. You can try the following:
1. Check the value of model.layers.0.self_attn.qkv_proj.qweight and make sure it is a one-dimensional tensor.
2. Make sure the values of start_idx and output_dim fall within the [-1, 0] range.
3. Adjust start_idx and output_dim so that they are within the correct range.
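If you want to act on that suggestion, here is a minimal sketch (again assuming the pack_model.pt layout used earlier in this thread) that just reports the shape and dimensionality of the layer-0 attention qweight tensors, so you can compare them against what your loader expects:

import torch

# Report shape/dtype/dimensionality of the packed qweight tensors for layer 0.
# Path and file name are assumptions taken from the earlier scripts in this thread.
model_path = "/root/ckpt161_quantization_w4_s0.45"
state_dict = torch.load(f"{model_path}/pack_model.pt", map_location="cpu")

for name, value in state_dict.items():
    if "layers.0.self_attn" in name and name.endswith("qweight"):
        print(name, tuple(value.shape), value.dtype, f"{value.dim()}-D")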