Unable to load model onto multiple GPUs

#2
by bprice9 - opened

The model loads onto a single GPU with no issues. However, it fails when I attempt to distribute it across two GPUs. The earlier Meta-Llama-3-8B-Instruct-quantized.w8a16 model loads onto two GPUs without any problem.

Both this model and Meta-Llama-3.1-70B-Instruct-quantized.w8a16 fail to load onto multiple GPUs.

Hardware: 2x RTX A6000
vLLM version: 0.5.3.post1
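For reference, a minimal sketch of the loading code that triggers the failure, reconstructed from the traceback below; the exact max_model_len and model ID are assumptions based on the models mentioned above.

from vllm import LLM

# Reconstructed from the failing notebook cell; values are assumptions.
model_id = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16"
number_gpus = 2        # two RTX A6000s
max_model_len = 4096   # placeholder; the original value is not shown

# Works with tensor_parallel_size=1; raises the CUDA illegal-memory-access
# error below when tensor_parallel_size=2.
llm = LLM(model=model_id, tensor_parallel_size=number_gpus, max_model_len=max_model_len)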


RuntimeError Traceback (most recent call last)
Cell In [6], line 1
----> 1 llm = LLM(model=model_id, tensor_parallel_size=number_gpus, max_model_len=max_model_len)

File /opt/conda/lib/python3.10/site-packages/vllm/entrypoints/llm.py:155, in LLM.__init__(self, model, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, cpu_offload_gb, enforce_eager, max_context_len_to_capture, max_seq_len_to_capture, disable_custom_all_reduce, **kwargs)
132 raise TypeError(
133 "There is no need to pass vision-related arguments anymore.")
134 engine_args = EngineArgs(
135 model=model,
136 tokenizer=tokenizer,
(...)
153 **kwargs,
154 )
--> 155 self.llm_engine = LLMEngine.from_engine_args(
156 engine_args, usage_context=UsageContext.LLM_CLASS)
157 self.request_counter = Counter()

File /opt/conda/lib/python3.10/site-packages/vllm/engine/llm_engine.py:441, in LLMEngine.from_engine_args(cls, engine_args, usage_context, stat_loggers)
439 executor_class = cls._get_executor_cls(engine_config)
440 # Create the LLM engine.
--> 441 engine = cls(
442 **engine_config.to_dict(),
443 executor_class=executor_class,
444 log_stats=not engine_args.disable_log_stats,
445 usage_context=usage_context,
446 stat_loggers=stat_loggers,
447 )
449 return engine

File /opt/conda/lib/python3.10/site-packages/vllm/engine/llm_engine.py:265, in LLMEngine.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, multimodal_config, speculative_config, decoding_config, observability_config, prompt_adapter_config, executor_class, log_stats, usage_context, stat_loggers)
251 self.model_executor = executor_class(
252 model_config=model_config,
253 cache_config=cache_config,
(...)
261 prompt_adapter_config=prompt_adapter_config,
262 )
264 if not self.model_config.embedding_mode:
--> 265 self._initialize_kv_caches()
267 # If usage stat is enabled, collect relevant info.
268 if is_usage_stats_enabled():

File /opt/conda/lib/python3.10/site-packages/vllm/engine/llm_engine.py:364, in LLMEngine._initialize_kv_caches(self)
357 def _initialize_kv_caches(self) -> None:
358 """Initialize the KV cache in the worker(s).
359
360 The workers will determine the number of blocks in both the GPU cache
361 and the swap CPU cache.
362 """
363 num_gpu_blocks, num_cpu_blocks = (
--> 364 self.model_executor.determine_num_available_blocks())
366 if self.cache_config.num_gpu_blocks_override is not None:
367 num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override

File /opt/conda/lib/python3.10/site-packages/vllm/executor/distributed_gpu_executor.py:38, in DistributedGPUExecutor.determine_num_available_blocks(self)
28 """Determine the number of available KV blocks.
29
30 This invokes determine_num_available_blocks on each worker and takes
(...)
35 - tuple[num_gpu_blocks, num_cpu_blocks]
36 """
37 # Get the maximum number of blocks that can be allocated on GPU and CPU.
---> 38 num_blocks = self._run_workers("determine_num_available_blocks", )
39 print(f"num_blocks: {num_blocks}")
40 # Since we use a shared centralized controller, we take the minimum
41 # number of blocks across all workers to make sure all the memory
42 # operators can be applied to all workers.

File /opt/conda/lib/python3.10/site-packages/vllm/executor/multiproc_gpu_executor.py:178, in MultiprocessingGPUExecutor._run_workers(self, method, async_run_tensor_parallel_workers_only, max_concurrent_workers, *args, **kwargs)
172 worker_outputs = [
173 worker.execute_method(method, *args, **kwargs)
174 for worker in self.workers
175 ]
177 driver_worker_method = getattr(self.driver_worker, method)
--> 178 driver_worker_output = driver_worker_method(*args, **kwargs)
180 # Get the results of the workers.
181 return [driver_worker_output
182 ] + [output.get() for output in worker_outputs]

File /opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/vllm/worker/worker.py:179, in Worker.determine_num_available_blocks(self)
175 torch.cuda.empty_cache()
177 # Execute a forward pass with dummy inputs to profile the memory usage
178 # of the model.
--> 179 self.model_runner.profile_run()
181 # Calculate the number of blocks that can be allocated with the
182 # profiled peak memory.
183 torch.cuda.synchronize()

File /opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/vllm/worker/model_runner.py:896, in GPUModelRunnerBase.profile_run(self)
891 if not get_pp_group().is_first_rank:
892 intermediate_tensors = self.model.make_empty_intermediate_tensors(
893 batch_size=batch_size,
894 dtype=self.model_config.dtype,
895 device=self.device)
--> 896 self.execute_model(model_input, kv_caches, intermediate_tensors)
897 torch.cuda.synchronize()
898 return

File /opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/vllm/worker/model_runner.py:1314, in ModelRunner.execute_model(self, model_input, kv_caches, intermediate_tensors, num_steps)
1309 multi_modal_kwargs = model_input.multi_modal_kwargs or {}
1310 seqlen_agnostic_kwargs = {
1311 "finished_requests_ids": model_input.finished_requests_ids,
1312 "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids,
1313 } if self.has_seqlen_agnostic else {}
-> 1314 hidden_or_intermediate_states = model_executable(
1315 input_ids=model_input.input_tokens,
1316 positions=model_input.input_positions,
1317 kv_caches=kv_caches,
1318 attn_metadata=model_input.attn_metadata,
1319 intermediate_tensors=intermediate_tensors,
1320 **multi_modal_kwargs,
1321 **seqlen_agnostic_kwargs)
1323 # Compute the logits in the last pipeline stage.
1324 if not get_pp_group().is_last_rank:

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File /opt/conda/lib/python3.10/site-packages/vllm/model_executor/models/llama.py:422, in LlamaForCausalLM.forward(self, input_ids, positions, kv_caches, attn_metadata, intermediate_tensors)
414 def forward(
415 self,
416 input_ids: torch.Tensor,
(...)
420 intermediate_tensors: Optional[IntermediateTensors] = None,
421 ) -> Union[torch.Tensor, IntermediateTensors]:
--> 422 model_output = self.model(input_ids, positions, kv_caches,
423 attn_metadata, intermediate_tensors)
424 return model_output

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File /opt/conda/lib/python3.10/site-packages/vllm/model_executor/models/llama.py:322, in LlamaModel.forward(self, input_ids, positions, kv_caches, attn_metadata, intermediate_tensors, inputs_embeds)
320 for i in range(self.start_layer, self.end_layer):
321 layer = self.layers[i]
--> 322 hidden_states, residual = layer(
323 positions,
324 hidden_states,
325 kv_caches[i - self.start_layer],
326 attn_metadata,
327 residual,
328 )
330 if not get_pp_group().is_last_rank:
331 return IntermediateTensors({
332 "hidden_states": hidden_states,
333 "residual": residual
334 })

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File /opt/conda/lib/python3.10/site-packages/vllm/model_executor/models/llama.py:245, in LlamaDecoderLayer.forward(self, positions, hidden_states, kv_cache, attn_metadata, residual)
242 else:
243 hidden_states, residual = self.input_layernorm(
244 hidden_states, residual)
--> 245 hidden_states = self.self_attn(
246 positions=positions,
247 hidden_states=hidden_states,
248 kv_cache=kv_cache,
249 attn_metadata=attn_metadata,
250 )
252 # Fully Connected
253 hidden_states, residual = self.post_attention_layernorm(
254 hidden_states, residual)

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File /opt/conda/lib/python3.10/site-packages/vllm/model_executor/models/llama.py:176, in LlamaAttention.forward(self, positions, hidden_states, kv_cache, attn_metadata)
174 q, k = self.rotary_emb(positions, q, k)
175 attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
--> 176 output, _ = self.o_proj(attn_output)
177 return output

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File /opt/conda/lib/python3.10/site-packages/vllm/model_executor/layers/linear.py:787, in RowParallelLinear.forward(self, input_)
783 output_parallel = self.quant_method.apply(self,
784 input_parallel,
785 bias=bias_)
786 if self.reduce_results and self.tp_size > 1:
--> 787 output = tensor_model_parallel_all_reduce(output_parallel)
788 else:
789 output = output_parallel

File /opt/conda/lib/python3.10/site-packages/vllm/distributed/communication_op.py:11, in tensor_model_parallel_all_reduce(input_)
9 def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
10 """All-reduce the input tensor across model parallel group."""
---> 11 return get_tp_group().all_reduce(input_)

File /opt/conda/lib/python3.10/site-packages/vllm/distributed/parallel_state.py:293, in GroupCoordinator.all_reduce(self, input_)
291 pynccl_comm.all_reduce(input_)
292 else:
--> 293 torch.distributed.all_reduce(input_, group=self.device_group)
294 return input_

RuntimeError: CUDA error: an illegal memory access was encountered

Neural Magic org
edited Aug 1

This is related to a known issue with that release that we have fixed on main. Please install a recent commit from the nightly builds. I tested that this one works:

pip uninstall vllm
pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/c8a7e93273ff4338d6f89f8a63ff16426ac240b8/vllm-0.5.3.post1-cp38-abi3-manylinux1_x86_64.whl
vllm serve neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16 -tp 2
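As a quick sanity check after the upgrade, the server started by vllm serve exposes an OpenAI-compatible API; a minimal client sketch, assuming the default local endpoint (http://localhost:8000/v1) and no API key:

from openai import OpenAI

# Point the client at the locally running vLLM server (assumed defaults).
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a16",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(resp.choices[0].message.content)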

Yep, that fixed the issue. Thanks!

bprice9 changed discussion status to closed
