当前环境
我使用的是通过pip安装的vLLM 0.4.3版本,CUDA版本为12.0,GPU为A100
运行时错误:t == DeviceType::CUDA INTERNAL ASSERT FAILED
🐛 描述bug
INFO 06-02 03:21:56 model_runner.py:173] Loading model weights took 12.1389 GB
Traceback (most recent call last):
File "/export/aiops-data/yuhui/wanda/benchmark_throughput_xgen.py", line 402, in <module>
main(args)
File "/export/aiops-data/yuhui/wanda/benchmark_throughput_xgen.py", line 221, in main
elapsed_time = run_vllm(
File "/export/aiops-data/yuhui/wanda/benchmark_throughput_xgen.py", line 85, in run_vllm
llm = LLM(
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/entrypoints/llm.py", line 118, in __init__
self.llm_engine = LLMEngine.from_engine_args(
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/engine/llm_engine.py", line 277, in from_engine_args
engine = cls(
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/engine/llm_engine.py", line 160, in __init__
self._initialize_kv_caches()
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/engine/llm_engine.py", line 236, in _initialize_kv_caches
self.model_executor.determine_num_available_blocks())
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/executor/gpu_executor.py", line 111, in determine_num_available_blocks
return self.driver_worker.determine_num_available_blocks()
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/worker/worker.py", line 138, in determine_num_available_blocks
self.model_runner.profile_run()
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/worker/model_runner.py", line 927, in profile_run
self.execute_model(seqs, kv_caches)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/worker/model_runner.py", line 848, in execute_model
hidden_states = model_executable(**execute_model_kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/model_executor/models/llama.py", line 360, in forward
hidden_states = self.model(input_ids, positions, kv_caches,
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/model_executor/models/llama.py", line 286, in forward
hidden_states, residual = layer(
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/model_executor/models/llama.py", line 228, in forward
hidden_states = self.self_attn(
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/model_executor/models/llama.py", line 165, in forward
q, k = self.rotary_emb(positions, q, k)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/model_executor/layers/rotary_embedding.py", line 155, in forward
ops.rotary_embedding(positions, query, key, self.head_size,
File "/miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/_custom_ops.py", line 89, in rotary_embedding
vllm_ops.rotary_embedding(positions, query, key, head_size, cos_sin_cache,
RuntimeError: t == DeviceType::CUDA INTERNAL ASSERT FAILED at "/opt/hostedtoolcache/Python/3.9.19/x64/lib/python3.9/site-packages/torch/include/c10/cuda/impl/CUDAGuardImpl.h":25, please report a bug to PyTorch.
Exception raised from CUDAGuardImpl at /opt/hostedtoolcache/Python/3.9.19/x64/lib/python3.9/site-packages/torch/include/c10/cuda/impl/CUDAGuardImpl.h:25 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7d84d8d81d87 in /miniconda/envs/sqllm/lib/python3.9/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, char const*) + 0x68 (0x7d84d8d32828 in /miniconda/envs/sqllm/lib/python3.9/site-packages/torch/lib/libc10.so)
frame #2: rotary_embedding(at::Tensor&, at::Tensor&, at::Tensor&, int, at::Tensor&, bool) + 0xdf5 (0x7d837cd36db5 in /miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/_C.cpython-39-x86_64-linux-gnu.so)
frame #3: <unknown function> + 0x9b06a (0x7d837cd5b06a in /miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/_C.cpython-39-x86_64-linux-gnu.so)
frame #4: <unknown function> + 0x98263 (0x7d837cd58263 in /miniconda/envs/sqllm/lib/python3.9/site-packages/vllm/_C.cpython-39-x86_64-linux-gnu.so)
frame #5: python() [0x507397]
<omitting python frames>
frame #8: python() [0x4f80b3]
frame #10: python() [0x4e69da]
frame #12: python() [0x505131]
frame #14: python() [0x4e69da]
frame #16: python() [0x505131]
frame #18: python() [0x4e69da]
frame #21: python() [0x5cb113]
frame #24: python() [0x4e69da]
frame #25: python() [0x50509d]
frame #28: python() [0x4e69da]
frame #29: python() [0x50509d]
frame #32: python() [0x4e69da]
frame #36: python() [0x5cb113]
frame #39: python() [0x4f80b3]
frame #40: python() [0x505131]
frame #42: python() [0x4e69da]
frame #44: python() [0x505131]
frame #46: python() [0x4e69da]
frame #49: python() [0x5cb113]
frame #52: python() [0x4e69da]
frame #54: python() [0x505131]
frame #56: python() [0x4e69da]
frame #58: python() [0x505131]
frame #60: python() [0x4e69da]
frame #63: python() [0x5cb113]
1条答案
排序方式:按热度 / 按时间 · 回答ID:0x6upsns1#
请yuhuixu1993同学分享一份完整的最小复现脚本。