From 6b1b8047de1da136793b62eef0e8056a758a9903 Mon Sep 17 00:00:00 2001 From: Haiyang Shi Date: Mon, 25 Nov 2024 20:22:17 -0800 Subject: [PATCH] Fix workflow optimization w/ kv cache Fixes 28ef00e04a152ee2f94e6e3b65e120d107b305f2 ("Optimize workflow w/ kv cache") Signed-off-by: Haiyang Shi --- vllm/worker/model_runner.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ac919bfc0d8f5..a5b49c2ebc9ea 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1564,6 +1564,13 @@ def execute_model( # set to None. Thus, if we encounter None for input_tokens, we just # skip sampling and return an empty outputs. if model_input.input_tokens is None: + if not self.is_driver_worker: + return [] + + # Invoke the async callback if any. + if model_input.async_callback is not None: + model_input.async_callback() + outputs=[ CompletionSequenceGroupOutput( samples=[],