diff --git a/vllm/config.py b/vllm/config.py index 2ac31657979f2..63a5acc50b943 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -322,8 +322,9 @@ def verify_with_parallel_config( "BitAndBytes quantization with TP or PP is not supported yet.") if self.quantization == "bitsandbytes" and self.enforce_eager is False: - raise ValueError( - "BitAndBytes with enforce_eager = False is not supported yet.") + logger.warning("CUDA graph is not supported on BitAndBytes yet, " + "fallback to the eager mode.") + self.enforce_eager = True def get_hf_config_sliding_window(self) -> Optional[int]: """Get the sliding window size, or None if disabled."""