🚀 The feature, motivation and pitch

Support for running Phi-4 in the GGUF model format, to enable execution on devices with lower VRAM.
Alternatives
No response
Additional context
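For context, a minimal reproduction sketch. The GGUF path matches the log below; the explicit tokenizer argument is an assumption for illustration, since a GGUF file served through vLLM typically needs its Hugging Face tokenizer specified separately:

```python
# Hypothetical reproduction sketch, not a confirmed-working invocation.
from vllm import LLM

# Load the quantized GGUF file directly; vLLM infers the architecture
# from the GGUF metadata. The tokenizer ID is an illustrative assumption.
llm = LLM(
    model="./phi-4-q4_k_m.gguf",
    tokenizer="microsoft/phi-4",
)
```

This currently fails during weight loading with the AssertionError shown in the log below: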
INFO 01-12 13:10:44 model_runner.py:1094] Starting to load model ./phi-4-q4_k_m.gguf...
ERROR 01-12 13:10:54 engine.py:366]
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/engine.py", line 357, in run_mp_engine
engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/engine.py", line 119, in from_engine_args
return cls(ipc_path=ipc_path,
^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/engine.py", line 71, in __init__
self.engine = LLMEngine(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/llm_engine.py", line 273, in __init__
self.model_executor = executor_class(vllm_config=vllm_config, )
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/executor/executor_base.py", line 36, in __init__
self._init_executor()
File "/usr/local/lib/python3.11/dist-packages/vllm/executor/gpu_executor.py", line 35, in _init_executor
self.driver_worker.load_model()
File "/usr/local/lib/python3.11/dist-packages/vllm/worker/worker.py", line 155, in load_model
self.model_runner.load_model()
File "/usr/local/lib/python3.11/dist-packages/vllm/worker/model_runner.py", line 1096, in load_model
self.model = get_model(vllm_config=self.vllm_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/model_loader/__init__.py", line 12, in get_model
return loader.load_model(vllm_config=vllm_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/model_loader/loader.py", line 1234, in load_model
model.load_weights(
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/llama.py", line 594, in load_weights
return loader.load_weights(
^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/utils.py", line 237, in load_weights
autoloaded_weights = set(self._load_module("", self.module, weights))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/utils.py", line 198, in _load_module
yield from self._load_module(prefix,
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/utils.py", line 175, in _load_module
loaded_params = module_load_weights(weights)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/llama.py", line 432, in load_weights
weight_loader(param, loaded_weight)
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/layers/linear.py", line 852, in weight_loader
assert param_data.shape == loaded_weight.shape
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
Process SpawnProcess-1:
Traceback (most recent call last):
File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/usr/lib/python3.11/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/engine.py", line 368, in run_mp_engine
raise e
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/engine.py", line 357, in run_mp_engine
engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/engine.py", line 119, in from_engine_args
return cls(ipc_path=ipc_path,
^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/engine.py", line 71, in __init__
self.engine = LLMEngine(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/llm_engine.py", line 273, in __init__
self.model_executor = executor_class(vllm_config=vllm_config, )
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/executor/executor_base.py", line 36, in __init__
self._init_executor()
File "/usr/local/lib/python3.11/dist-packages/vllm/executor/gpu_executor.py", line 35, in _init_executor
self.driver_worker.load_model()
File "/usr/local/lib/python3.11/dist-packages/vllm/worker/worker.py", line 155, in load_model
self.model_runner.load_model()
File "/usr/local/lib/python3.11/dist-packages/vllm/worker/model_runner.py", line 1096, in load_model
self.model = get_model(vllm_config=self.vllm_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/model_loader/__init__.py", line 12, in get_model
return loader.load_model(vllm_config=vllm_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/model_loader/loader.py", line 1234, in load_model
model.load_weights(
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/llama.py", line 594, in load_weights
return loader.load_weights(
^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/utils.py", line 237, in load_weights
autoloaded_weights = set(self._load_module("", self.module, weights))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/utils.py", line 198, in _load_module
yield from self._load_module(prefix,
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/utils.py", line 175, in _load_module
loaded_params = module_load_weights(weights)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/llama.py", line 432, in load_weights
weight_loader(param, loaded_weight)
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/layers/linear.py", line 852, in weight_loader
assert param_data.shape == loaded_weight.shape
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
[rank0]:[W112 13:10:54.224924824 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
Task exception was never retrieved
future: <Task finished name='Task-2' coro=<MQLLMEngineClient.run_output_handler_loop() done, defined at /usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/client.py:178> exception=ZMQError('Operation not supported')>
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/client.py", line 184, in run_output_handler_loop
while await self.output_socket.poll(timeout=VLLM_RPC_TIMEOUT
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/zmq/_future.py", line 400, in poll
raise _zmq.ZMQError(_zmq.ENOTSUP)
zmq.error.ZMQError: Operation not supported
Task exception was never retrieved
future: <Task finished name='Task-3' coro=<MQLLMEngineClient.run_output_handler_loop() done, defined at /usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/client.py:178> exception=ZMQError('Operation not supported')>
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/client.py", line 184, in run_output_handler_loop
while await self.output_socket.poll(timeout=VLLM_RPC_TIMEOUT
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/zmq/_future.py", line 400, in poll
raise _zmq.ZMQError(_zmq.ENOTSUP)
zmq.error.ZMQError: Operation not supported
Traceback (most recent call last):
File "/usr/local/bin/vllm", line 8, in <module>
sys.exit(main())
^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/scripts.py", line 201, in main
args.dispatch_function(args)
File "/usr/local/lib/python3.11/dist-packages/vllm/scripts.py", line 42, in serve
uvloop.run(run_server(args))
File "/usr/local/lib/python3.11/dist-packages/uvloop/__init__.py", line 105, in run
return runner.run(wrapper())
^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
File "/usr/local/lib/python3.11/dist-packages/uvloop/__init__.py", line 61, in wrapper
return await main
^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/entrypoints/openai/api_server.py", line 740, in run_server
async with build_async_engine_client(args) as engine_client:
File "/usr/lib/python3.11/contextlib.py", line 204, in __aenter__
return await anext(self.gen)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/entrypoints/openai/api_server.py", line 118, in build_async_engine_client
async with build_async_engine_client_from_engine_args(
File "/usr/lib/python3.11/contextlib.py", line 204, in __aenter__
return await anext(self.gen)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/entrypoints/openai/api_server.py", line 223, in build_async_engine_client_from_engine_args
raise RuntimeError(
RuntimeError: Engine process failed to start. See stack trace for the root cause.
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/llama.py", line 594, in load_weights
return loader.load_weights(
^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/utils.py", line 237, in load_weights
autoloaded_weights = set(self._load_module("", self.module, weights))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/utils.py", line 198, in _load_module
yield from self._load_module(prefix,
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/utils.py", line 175, in _load_module
loaded_params = module_load_weights(weights)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/llama.py", line 432, in load_weights
weight_loader(param, loaded_weight)
File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/layers/linear.py", line 852, in weight_loader
assert param_data.shape == loaded_weight.shape
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
[rank0]:[W112 13:10:54.224924824 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
Task exception was never retrieved
future: <Task finished name='Task-2' coro=<MQLLMEngineClient.run_output_handler_loop() done, defined at /usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/client.py:178> exception=ZMQError('Operation not supported')>
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/client.py", line 184, in run_output_handler_loop
while await self.output_socket.poll(timeout=VLLM_RPC_TIMEOUT
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/zmq/_future.py", line 400, in poll
raise _zmq.ZMQError(_zmq.ENOTSUP)
zmq.error.ZMQError: Operation not supported
Task exception was never retrieved
future: <Task finished name='Task-3' coro=<MQLLMEngineClient.run_output_handler_loop() done, defined at /usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/client.py:178> exception=ZMQError('Operation not supported')>
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/vllm/engine/multiprocessing/client.py", line 184, in run_output_handler_loop
while await self.output_socket.poll(timeout=VLLM_RPC_TIMEOUT
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/zmq/_future.py", line 400, in poll
raise _zmq.ZMQError(_zmq.ENOTSUP)
zmq.error.ZMQError: Operation not supported
Traceback (most recent call last):
File "/usr/local/bin/vllm", line 8, in <module>
sys.exit(main())
^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/scripts.py", line 201, in main
args.dispatch_function(args)
File "/usr/local/lib/python3.11/dist-packages/vllm/scripts.py", line 42, in serve
uvloop.run(run_server(args))
File "/usr/local/lib/python3.11/dist-packages/uvloop/__init__.py", line 105, in run
return runner.run(wrapper())
^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
File "/usr/local/lib/python3.11/dist-packages/uvloop/__init__.py", line 61, in wrapper
return await main
^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/entrypoints/openai/api_server.py", line 740, in run_server
async with build_async_engine_client(args) as engine_client:
File "/usr/lib/python3.11/contextlib.py", line 204, in __aenter__
return await anext(self.gen)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/entrypoints/openai/api_server.py", line 118, in build_async_engine_client
async with build_async_engine_client_from_engine_args(
File "/usr/lib/python3.11/contextlib.py", line 204, in __aenter__
return await anext(self.gen)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/vllm/entrypoints/openai/api_server.py", line 223, in build_async_engine_client_from_engine_args
raise RuntimeError(
RuntimeError: Engine process failed to start. See stack trace for the root cause.
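The AssertionError is raised in vLLM's linear-layer weight loader: a tensor read from the GGUF file has a different shape than the parameter vLLM allocated for the architecture it mapped Phi-4 to (the traceback goes through models/llama.py). As a diagnostic sketch, the tensor names and shapes recorded in the GGUF file can be dumped with the gguf-py reader (PyPI package `gguf`) and compared against what vLLM expects:

```python
# Diagnostic sketch: list tensor names, logical shapes, and quantization
# types stored in the GGUF metadata, for comparison with the parameter
# shapes vLLM allocates for the mapped architecture.
from gguf import GGUFReader

reader = GGUFReader("./phi-4-q4_k_m.gguf")
for tensor in reader.tensors:
    print(tensor.name, list(tensor.shape), tensor.tensor_type.name)
```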
Before submitting a new issue...
Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.