OpenPipe/ART

Training crashes after some time

Opened this issue · 1 comment

ERROR:asyncio:Exception in callback _log_task_completion(error_callback=>)(<Task finishe...sertions.\n')>) at /home/ubuntu/.venv/lib/python3.12/site-packages/vllm/engine/async_llm_engine.py:46
handle: <Handle _log_task_completion(error_callback=>)(<Task finishe...sertions.\n')>) at /home/ubuntu/.venv/lib/python3.12/site-packages/vllm/engine/async_llm_engine.py:46>
Traceback (most recent call last):
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/engine/async_llm_engine.py", line 56, in _log_task_completion
return_value = task.result()
^^^^^^^^^^^^^
File "/home/ubuntu/.local/share/uv/python/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/asyncio/futures.py", line 202, in result
raise self._exception.with_traceback(self._exception_tb)
File "/home/ubuntu/.local/share/uv/python/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/asyncio/tasks.py", line 314, in __step_run_and_handle_result
result = coro.send(None)
^^^^^^^^^^^^^^^
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/engine/async_llm_engine.py", line 832, in run_engine_loop
result = task.result()
^^^^^^^^^^^^^
File "/home/ubuntu/.local/share/uv/python/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/asyncio/futures.py", line 202, in result
raise self._exception.with_traceback(self._exception_tb)
File "/home/ubuntu/.local/share/uv/python/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/asyncio/tasks.py", line 316, in __step_run_and_handle_result
result = coro.throw(exc)
^^^^^^^^^^^^^^^
File "/home/ubuntu/.venv/lib/python3.12/site-packages/art/vllm/engine.py", line 76, in engine_step
return await _engine_step(virtual_engine)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/engine/async_llm_engine.py", line 755, in engine_step
request_outputs = await self.engine.step_async(virtual_engine)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/engine/async_llm_engine.py", line 354, in step_async
outputs = await self.model_executor.execute_model_async(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 242, in execute_model_async
output = await make_async(self.execute_model)(execute_model_req)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.local/share/uv/python/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/asyncio/futures.py", line 289, in await
yield self # This tells Task to wait for completion.
^^^^^^^^^^
File "/home/ubuntu/.local/share/uv/python/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/asyncio/tasks.py", line 385, in __wakeup
future.result()
File "/home/ubuntu/.local/share/uv/python/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/asyncio/futures.py", line 202, in result
raise self._exception.with_traceback(self._exception_tb)
File "/home/ubuntu/.local/share/uv/python/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/concurrent/futures/thread.py", line 59, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 146, in execute_model
output = self.collective_rpc("execute_model",
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 58, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/utils/init.py", line 2985, in run_method
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 417, in execute_model
output = self.model_runner.execute_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/worker/multi_step_model_runner.py", line 590, in execute_model
outputs = self._final_process_outputs(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/worker/multi_step_model_runner.py", line 434, in _final_process_outputs
output.pythonize(model_input, self._copy_stream,
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/worker/multi_step_model_runner.py", line 101, in pythonize
self._pythonize_sampler_output(input_metadata, copy_stream,
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/worker/multi_step_model_runner.py", line 129, in _pythonize_sampler_output
self.sampler_output_ready_event.synchronize()
File "/home/ubuntu/.venv/lib/python3.12/site-packages/torch/cuda/streams.py", line 227, in synchronize
super().synchronize()
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "/home/ubuntu/.local/share/uv/python/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/asyncio/events.py", line 88, in _run
self._context.run(self._callback, *self._args)
File "/home/ubuntu/.venv/lib/python3.12/site-packages/vllm/engine/async_llm_engine.py", line 68, in _log_task_completion
raise AsyncEngineDeadError(
vllm.engine.async_llm_engine.AsyncEngineDeadError: Task finished unexpectedly. This should never happen! Please open an issue on GitHub. See stack trace above for the actual cause.
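
For what it's worth, the AsyncEngineDeadError at the bottom is just vLLM's engine loop dying; the root cause is the CUDA illegal memory access above it. Because CUDA errors are reported asynchronously, the frame the trace points at (the sampler-output event sync in multi_step_model_runner.py) may not be where the fault actually occurred. A minimal sketch of how to get a more precise trace, following the error message's own suggestion; this assumes training is launched from a Python entry point, and the environment variables must be set before anything initializes CUDA:

    import os

    # CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous, so the
    # traceback points at the kernel that actually faulted rather than a
    # later, unrelated call (such as the event sync in the trace above).
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

    # TORCH_USE_CUDA_DSA only takes effect in PyTorch builds compiled with
    # device-side assertions; on stock wheels this line is a no-op.
    os.environ["TORCH_USE_CUDA_DSA"] = "1"

    # Anything that initializes CUDA must be imported after the env vars
    # are set.
    import torch  # noqa: E402

Equivalently, export both variables in the shell before launching the run. Note that CUDA_LAUNCH_BLOCKING=1 serializes kernel launches and slows training considerably, so treat it as a debugging setting only, and that device-side assertions require a PyTorch build compiled with TORCH_USE_CUDA_DSA, as the error message notes.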

Hey, @aravindc26! Can you provide more details on how you're running this? Please share your setup.