From 33c9e25c61e140a803a4acbcaddb520a1eeba84e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 07:53:30 +0000 Subject: [PATCH 01/30] Introduce `is_list_of` --- vllm/inputs/data.py | 20 ++++++++++---------- vllm/multimodal/image.py | 6 ++++-- vllm/utils.py | 25 ++++++++++++++++++++++--- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 86c2901dc4c80..9df0bd2041d91 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,8 +1,10 @@ from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence, - TypedDict, Union, cast, overload) + TypedDict, Union, overload) from typing_extensions import NotRequired +from vllm.utils import is_list_of + if TYPE_CHECKING: from vllm.multimodal import MultiModalDataDict @@ -41,25 +43,23 @@ def parse_and_batch_prompt( if len(prompt) == 0: raise ValueError("please provide at least one prompt") - if isinstance(prompt[0], str): + if is_list_of(prompt, str): # case 2: array of strings return [ - ParsedText(content=elem, is_tokens=False) - for elem in cast(List[str], prompt) + ParsedText(content=elem, is_tokens=False) for elem in prompt ] - if isinstance(prompt[0], int): + if is_list_of(prompt, int): # case 3: array of tokens - elem = cast(List[int], prompt) - return [ParsedTokens(content=elem, is_tokens=True)] - if isinstance(prompt[0], list): + return [ParsedTokens(content=prompt, is_tokens=True)] + if is_list_of(prompt, list): if len(prompt[0]) == 0: raise ValueError("please provide at least one prompt") - if isinstance(prompt[0][0], int): + if is_list_of(prompt[0], int): # case 4: array of token arrays return [ ParsedTokens(content=elem, is_tokens=True) - for elem in cast(List[List[int]], prompt) + for elem in prompt ] raise ValueError("prompt must be a string, array of strings, " diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index b6a3909e95632..db50229bda319 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -10,6 +10,7 @@ from vllm.logger import init_logger from vllm.transformers_utils.image_processor import get_image_processor from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.utils import is_list_of from .base import MultiModalInputs, MultiModalPlugin @@ -113,7 +114,8 @@ def _get_hf_image_processor(self, model_config: ModelConfig): def _default_input_mapper(self, ctx: InputContext, data: object) -> MultiModalInputs: model_config = ctx.model_config - if isinstance(data, (Image.Image, list)): + + if isinstance(data, Image.Image) or is_list_of(data, Image.Image): image_processor = self._get_hf_image_processor(model_config) if image_processor is None: raise RuntimeError("No HuggingFace processor is available " @@ -127,7 +129,7 @@ def _default_input_mapper(self, ctx: InputContext, raise return MultiModalInputs(batch_data) - elif isinstance(data, torch.Tensor): + elif isinstance(data, torch.Tensor) or is_list_of(data, torch.Tensor): raise NotImplementedError("Embeddings input is not supported yet") raise TypeError(f"Invalid image type: {type(data)}") diff --git a/vllm/utils.py b/vllm/utils.py index 61e3bb0bfc333..413f6ce62276f 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -17,15 +17,15 @@ from functools import lru_cache, partial, wraps from platform import uname from typing import (Any, AsyncGenerator, Awaitable, Callable, Dict, Generic, - Hashable, List, Optional, OrderedDict, Set, Tuple, TypeVar, - Union, overload) + Hashable, List, Literal, Optional, OrderedDict, Set, Tuple, + Type, TypeVar, Union, 
overload) import numpy as np import numpy.typing as npt import psutil import torch import torch.types -from typing_extensions import ParamSpec +from typing_extensions import ParamSpec, TypeGuard, assert_never import vllm.envs as envs from vllm import _custom_ops as ops @@ -807,6 +807,24 @@ def get_dtype_size(dtype: torch.dtype) -> int: return torch.tensor([], dtype=dtype).element_size() +# `collections` helpers +def is_list_of( + value: object, + typ: Type[T], + *, + check: Literal["first", "all"] = "first", +) -> TypeGuard[List[T]]: + if not isinstance(value, list): + return False + + if check == "first": + return len(value) == 0 or isinstance(value[0], typ) + elif check == "all": + return all(isinstance(v, typ) for v in value) + + assert_never(check) + + def merge_dicts(dict1: Dict[K, List[T]], dict2: Dict[K, List[T]]) -> Dict[K, List[T]]: """Merge 2 dicts that have key -> List of items. @@ -954,6 +972,7 @@ def enable_trace_function_call_for_thread() -> None: enable_trace_function_call(log_path) +# `functools` helpers def identity(value: T) -> T: return value From e6dd6f5b6ea5f2eb7febf631d86983a9f439120d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 07:56:30 +0000 Subject: [PATCH 02/30] Avoid circular imports --- examples/offline_inference_encoder_decoder.py | 4 +- tests/conftest.py | 6 +- tests/test_inputs.py | 2 +- vllm/engine/llm_engine.py | 4 +- vllm/entrypoints/llm.py | 4 +- vllm/entrypoints/openai/serving_engine.py | 2 +- vllm/inputs/__init__.py | 20 ++- vllm/inputs/data.py | 145 +++--------------- vllm/inputs/parse.py | 125 +++++++++++++++ vllm/sequence.py | 2 +- vllm/utils.py | 29 ---- 11 files changed, 171 insertions(+), 172 deletions(-) create mode 100644 vllm/inputs/parse.py diff --git a/examples/offline_inference_encoder_decoder.py b/examples/offline_inference_encoder_decoder.py index 79b284554f172..c05e8e8bb6f11 100644 --- a/examples/offline_inference_encoder_decoder.py +++ b/examples/offline_inference_encoder_decoder.py @@ -4,8 +4,8 @@ ''' from vllm import LLM, SamplingParams -from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt -from vllm.utils import zip_enc_dec_prompt_lists +from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, + TokensPrompt, zip_enc_dec_prompt_lists) dtype = "float" diff --git a/tests/conftest.py b/tests/conftest.py index c0bf9897c97f2..b0adfc58bcda1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,13 +21,13 @@ from vllm.connections import global_http_connection from vllm.distributed import (destroy_distributed_environment, destroy_model_parallel) -from vllm.inputs import TextPrompt +from vllm.inputs import (TextPrompt, to_enc_dec_tuple_list, + zip_enc_dec_prompt_lists) from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sequence import SampleLogprobs from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, - is_cpu, to_enc_dec_tuple_list, - zip_enc_dec_prompt_lists) + is_cpu) logger = init_logger(__name__) diff --git a/tests/test_inputs.py b/tests/test_inputs.py index 887c7101decda..3725d8687f255 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -2,7 +2,7 @@ import pytest -from vllm.inputs import parse_and_batch_prompt +from vllm.inputs.parse import parse_and_batch_prompt STRING_INPUTS = [ '', diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 75c6d7e6c9b21..10913efbd8890 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -22,8 +22,8 @@ from 
vllm.engine.output_processor.util import create_output_by_sequence_group from vllm.executor.executor_base import ExecutorBase from vllm.executor.ray_utils import initialize_ray_cluster -from vllm.inputs import (INPUT_REGISTRY, LLMInputs, PromptInputs, - get_prompt_type) +from vllm.inputs import INPUT_REGISTRY, LLMInputs, PromptInputs +from vllm.inputs.parse import get_prompt_type from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import (EmbeddingRequestOutput, RequestOutput, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index eaa1572094936..175f418a1294f 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -6,8 +6,8 @@ from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine -from vllm.inputs import (PromptInputs, TextPrompt, TokensPrompt, - parse_and_batch_prompt) +from vllm.inputs import PromptInputs, TextPrompt, TokensPrompt +from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding import ( diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index df4932d8fe185..8d8b5ea4bdf5d 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -22,7 +22,7 @@ TokenizeCompletionRequest, TokenizeRequest) # yapf: enable -from vllm.inputs import parse_and_batch_prompt +from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding import ( diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index e22b88f2fc38a..1dcd1ad343b3f 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,7 +1,7 @@ -from .data import (ExplicitEncoderDecoderPrompt, LLMInputs, ParsedText, - ParsedTokens, PromptInputs, SingletonPromptInputs, - TextPrompt, TokensPrompt, get_prompt_type, - is_valid_encoder_decoder_llm_inputs, parse_and_batch_prompt) +from .data import (ExplicitEncoderDecoderPrompt, LLMInputs, PromptInputs, + SingletonPromptInputs, TextPrompt, TokensPrompt, + build_explicit_enc_dec_prompt, to_enc_dec_tuple_list, + zip_enc_dec_prompt_lists) from .registry import InputContext, InputRegistry INPUT_REGISTRY = InputRegistry() @@ -14,18 +14,16 @@ """ __all__ = [ - "ParsedText", - "ParsedTokens", - "parse_and_batch_prompt", "TextPrompt", "TokensPrompt", "PromptInputs", + "ExplicitEncoderDecoderPrompt", + "SingletonPromptInputs", "LLMInputs", + "build_explicit_enc_dec_prompt", + "to_enc_dec_tuple_list", + "zip_enc_dec_prompt_lists", "INPUT_REGISTRY", "InputContext", "InputRegistry", - "get_prompt_type", - "is_valid_encoder_decoder_llm_inputs", - "ExplicitEncoderDecoderPrompt", - "SingletonPromptInputs", ] diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 9df0bd2041d91..4cee911b43984 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,71 +1,11 @@ -from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence, - TypedDict, Union, overload) +from typing import TYPE_CHECKING, List, Optional, Tuple, TypedDict, Union from typing_extensions import NotRequired -from vllm.utils import is_list_of - if TYPE_CHECKING: from vllm.multimodal import MultiModalDataDict -class ParsedText(TypedDict): - content: str - is_tokens: Literal[False] - - -class ParsedTokens(TypedDict): - content: List[int] - is_tokens: Literal[True] - - -# 
https://github.com/vllm-project/vllm/pull/4028 -@overload -def parse_and_batch_prompt( - prompt: Union[str, List[str]]) -> Sequence[ParsedText]: - ... - - -@overload -def parse_and_batch_prompt( - prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]: - ... - - -def parse_and_batch_prompt( - prompt: Union[str, List[str], List[int], List[List[int]]], -) -> Union[Sequence[ParsedText], Sequence[ParsedTokens]]: - if isinstance(prompt, str): - # case 1: a string - return [ParsedText(content=prompt, is_tokens=False)] - - if isinstance(prompt, list): - if len(prompt) == 0: - raise ValueError("please provide at least one prompt") - - if is_list_of(prompt, str): - # case 2: array of strings - return [ - ParsedText(content=elem, is_tokens=False) for elem in prompt - ] - if is_list_of(prompt, int): - # case 3: array of tokens - return [ParsedTokens(content=prompt, is_tokens=True)] - if is_list_of(prompt, list): - if len(prompt[0]) == 0: - raise ValueError("please provide at least one prompt") - - if is_list_of(prompt[0], int): - # case 4: array of token arrays - return [ - ParsedTokens(content=elem, is_tokens=True) - for elem in prompt - ] - - raise ValueError("prompt must be a string, array of strings, " - "array of tokens, or array of token arrays") - - class TextPrompt(TypedDict): """Schema for a text prompt.""" @@ -150,56 +90,6 @@ class ExplicitEncoderDecoderPrompt(TypedDict): """ -def _has_required_keys( - d: dict, - required_keys: set, -) -> bool: - return required_keys.issubset(d.keys()) - - -def get_prompt_type(prompt: Optional[PromptInputs]) -> Optional[str]: - """ - Get the type-name of the prompt argument instance, given that - isinstance() cannot apply to TypedDict subclasses directly. - If the prompt is None, return 'None' as the type name. - - Arguments: - - * prompt: LLM input prompt or None - - Returns: - - * String representation of prompt type - """ - - if prompt is None: - return 'None' - - required_keys_dict = { - 'TextPrompt': {'prompt'}, - 'TokensPrompt': {'prompt_token_ids'}, - 'ExplicitEncoderDecoder': {'encoder_prompt', 'decoder_prompt'}, - } - - if isinstance(prompt, dict): - for (ptype, required_keys) in required_keys_dict.items(): - # Ignore type checking in the conditional below because type - # checker does not understand that is_dict(prompt) narrows - # down the possible types - if _has_required_keys( - prompt, # type: ignore - required_keys): - return ptype - - raise ValueError(f"Invalid prompt {prompt}, valid types are " - "required_keys_dict={required_keys_dict}") - - if isinstance(prompt, str): - return "str" - - raise ValueError(f"Invalid prompt {prompt}") - - class LLMInputs(TypedDict): """ The inputs in :class:`~vllm.LLMEngine` before they are @@ -229,13 +119,28 @@ class LLMInputs(TypedDict): """ -def is_valid_encoder_decoder_llm_inputs(inputs: LLMInputs) -> bool: - """ - Return True if the LLMInputs instance has the correct configuration - for encoder/decoder. 
- """ +def build_explicit_enc_dec_prompt( + encoder_prompt: SingletonPromptInputs, + decoder_prompt: SingletonPromptInputs, +) -> ExplicitEncoderDecoderPrompt: + return ExplicitEncoderDecoderPrompt(encoder_prompt=encoder_prompt, + decoder_prompt=decoder_prompt) + + +def zip_enc_dec_prompt_lists( + enc_prompt_list: List[SingletonPromptInputs], + dec_prompt_list: List[SingletonPromptInputs], +) -> List[ExplicitEncoderDecoderPrompt]: + return [ + build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt) + for (encoder_prompt, + decoder_prompt) in zip(enc_prompt_list, dec_prompt_list) + ] + - # True if encoder prompt token ids field exists & - # is not None - return ('encoder_prompt_token_ids' in inputs - and inputs['encoder_prompt_token_ids'] is not None) +def to_enc_dec_tuple_list( + enc_dec_prompts: List[ExplicitEncoderDecoderPrompt], +) -> List[Tuple[PromptInputs, PromptInputs]]: + return [(enc_dec_prompt['encoder_prompt'], + enc_dec_prompt['decoder_prompt']) + for enc_dec_prompt in enc_dec_prompts] diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py new file mode 100644 index 0000000000000..42bd9858bcbe1 --- /dev/null +++ b/vllm/inputs/parse.py @@ -0,0 +1,125 @@ +from typing import (List, Literal, Optional, Sequence, TypedDict, Union, + overload) + +from vllm.utils import is_list_of + +from .data import LLMInputs, PromptInputs + + +class ParsedText(TypedDict): + content: str + is_tokens: Literal[False] + + +class ParsedTokens(TypedDict): + content: List[int] + is_tokens: Literal[True] + + +# https://github.com/vllm-project/vllm/pull/4028 +@overload +def parse_and_batch_prompt( + prompt: Union[str, List[str]]) -> Sequence[ParsedText]: + ... + + +@overload +def parse_and_batch_prompt( + prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]: + ... + + +def parse_and_batch_prompt( + prompt: Union[str, List[str], List[int], List[List[int]]], +) -> Union[Sequence[ParsedText], Sequence[ParsedTokens]]: + if isinstance(prompt, str): + # case 1: a string + return [ParsedText(content=prompt, is_tokens=False)] + + if isinstance(prompt, list): + if len(prompt) == 0: + raise ValueError("please provide at least one prompt") + + if is_list_of(prompt, str): + # case 2: array of strings + return [ + ParsedText(content=elem, is_tokens=False) for elem in prompt + ] + if is_list_of(prompt, int): + # case 3: array of tokens + return [ParsedTokens(content=prompt, is_tokens=True)] + if is_list_of(prompt, list): + if len(prompt[0]) == 0: + raise ValueError("please provide at least one prompt") + + if is_list_of(prompt[0], int): + # case 4: array of token arrays + return [ + ParsedTokens(content=elem, is_tokens=True) + for elem in prompt + ] + + raise ValueError("prompt must be a string, array of strings, " + "array of tokens, or array of token arrays") + + +def _has_required_keys( + d: dict, + required_keys: set, +) -> bool: + return required_keys.issubset(d.keys()) + + +def get_prompt_type(prompt: Optional[PromptInputs]) -> Optional[str]: + """ + Get the type-name of the prompt argument instance, given that + isinstance() cannot apply to TypedDict subclasses directly. + If the prompt is None, return 'None' as the type name. 
+ + Arguments: + + * prompt: LLM input prompt or None + + Returns: + + * String representation of prompt type + """ + + if prompt is None: + return 'None' + + required_keys_dict = { + 'TextPrompt': {'prompt'}, + 'TokensPrompt': {'prompt_token_ids'}, + 'ExplicitEncoderDecoder': {'encoder_prompt', 'decoder_prompt'}, + } + + if isinstance(prompt, dict): + for (ptype, required_keys) in required_keys_dict.items(): + # Ignore type checking in the conditional below because type + # checker does not understand that is_dict(prompt) narrows + # down the possible types + if _has_required_keys( + prompt, # type: ignore + required_keys): + return ptype + + raise ValueError(f"Invalid prompt {prompt}, valid types are " + f"required_keys_dict={required_keys_dict}") + + if isinstance(prompt, str): + return "str" + + raise ValueError(f"Invalid prompt {prompt}") + + +def is_valid_encoder_decoder_llm_inputs(inputs: LLMInputs) -> bool: + """ + Return True if the LLMInputs instance has the correct configuration + for encoder/decoder. + """ + + # True if encoder prompt token ids field exists & + # is not None + return ('encoder_prompt_token_ids' in inputs + and inputs['encoder_prompt_token_ids'] is not None) diff --git a/vllm/sequence.py b/vllm/sequence.py index 6347855333822..fbd148001cc7e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -11,7 +11,7 @@ import torch -from vllm.inputs import is_valid_encoder_decoder_llm_inputs +from vllm.inputs.parse import is_valid_encoder_decoder_llm_inputs from vllm.lora.request import LoRARequest from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest diff --git a/vllm/utils.py b/vllm/utils.py index 413f6ce62276f..eb88fce4af0ce 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -29,8 +29,6 @@ import vllm.envs as envs from vllm import _custom_ops as ops -from vllm.inputs import (ExplicitEncoderDecoderPrompt, PromptInputs, - SingletonPromptInputs) from vllm.logger import enable_trace_function_call, init_logger logger = init_logger(__name__) @@ -1164,30 +1162,3 @@ def is_embedding_model_config(model_config) -> bool: ''' return model_config is not None and \ model_config.embedding_mode - - -def build_explicit_enc_dec_prompt( - encoder_prompt: SingletonPromptInputs, - decoder_prompt: SingletonPromptInputs, -) -> ExplicitEncoderDecoderPrompt: - return ExplicitEncoderDecoderPrompt(encoder_prompt=encoder_prompt, - decoder_prompt=decoder_prompt) - - -def zip_enc_dec_prompt_lists( - enc_prompt_list: List[SingletonPromptInputs], - dec_prompt_list: List[SingletonPromptInputs], -) -> List[ExplicitEncoderDecoderPrompt]: - return [ - build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt) - for (encoder_prompt, - decoder_prompt) in zip(enc_prompt_list, dec_prompt_list) - ] - - -def to_enc_dec_tuple_list( - enc_dec_prompts: List[ExplicitEncoderDecoderPrompt], -) -> List[Tuple[PromptInputs, PromptInputs]]: - return [(enc_dec_prompt['encoder_prompt'], - enc_dec_prompt['decoder_prompt']) - for enc_dec_prompt in enc_dec_prompts] From f938c8690274bc61f20720ff8b227beb9e980b11 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 09:09:20 +0000 Subject: [PATCH 03/30] Refactor prompt parsing and extend this to async engine --- .github/workflows/mypy.yaml | 2 +- requirements-common.txt | 2 +- requirements-lint.txt | 2 +- requirements-openvino.txt | 2 +- vllm/engine/async_llm_engine.py | 155 ++++++++++++--- vllm/engine/llm_engine.py | 196 ++++++++----------- vllm/entrypoints/openai/logits_processors.py | 8 +- 
vllm/inputs/__init__.py | 9 +- vllm/inputs/data.py | 16 +- vllm/inputs/parse.py | 68 ++----- 10 files changed, 252 insertions(+), 208 deletions(-) diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 8d423657630c2..f7b84eebc8b6a 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -25,7 +25,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install mypy==1.9.0 + pip install mypy==1.11.1 pip install types-setuptools pip install types-PyYAML pip install types-requests diff --git a/requirements-common.txt b/requirements-common.txt index d8c95bf772405..ebd0fca519198 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,7 +19,7 @@ prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer == 0.10.3 outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0 -typing_extensions +typing_extensions >= 4.10 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 pyzmq gguf == 0.9.1 diff --git a/requirements-lint.txt b/requirements-lint.txt index bd34227d3e824..d0b2fef6deaef 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -8,7 +8,7 @@ isort==5.13.2 clang-format==18.1.5 # type checking -mypy==1.9.0 +mypy==1.11.1 types-PyYAML types-requests types-setuptools diff --git a/requirements-openvino.txt b/requirements-openvino.txt index 2dd971d6400be..dc0ae55c92539 100644 --- a/requirements-openvino.txt +++ b/requirements-openvino.txt @@ -22,7 +22,7 @@ prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer == 0.10.3 outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0 -typing_extensions +typing_extensions >= 4.10 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 pyzmq gguf == 0.9.1 diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index b4a9520e623ea..2200003e4b841 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -5,6 +5,7 @@ Optional, Set, Tuple, Type, Union) from transformers import PreTrainedTokenizer +from typing_extensions import assert_never import vllm.envs as envs from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig, @@ -16,9 +17,12 @@ from vllm.engine.metrics import StatLoggerBase from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.ray_utils import initialize_ray_cluster, ray -from vllm.inputs import LLMInputs, PromptInputs +from vllm.inputs import (EncoderDecoderLLMInputs, LLMInputs, PromptInputs, + SingletonPromptInputs) +from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalDataDict from vllm.outputs import EmbeddingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest @@ -291,38 +295,140 @@ async def stop_remote_worker_execution_loop_async(self) -> None: """Stop the remote worker execution loop.""" await self.model_executor.stop_remote_worker_execution_loop_async() - async def process_model_inputs_async( + async def _tokenize_prompt_async( self, + prompt: str, request_id: str, - inputs: PromptInputs, lora_request: Optional[LoRARequest] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> LLMInputs: - if isinstance(inputs, str): - inputs = {"prompt": inputs} + ) -> List[int]: + ''' + 
Wrapper around application of the model's + tokenizer. + + Arguments: + + * prompt + * request_id + * lora_request + + Returns: + + * prompt token ids + ''' + + tokenizer = self.get_tokenizer_group("prompts must be None if " + "skip_tokenizer_init is True") + + return await tokenizer.encode_async(request_id=request_id, + prompt=prompt, + lora_request=lora_request) + + async def _extract_prompt_components_async( + self, + inputs: SingletonPromptInputs, + request_id: str, + lora_request: Optional[LoRARequest] = None, + ) -> Tuple[Optional[str], List[int], Optional[MultiModalDataDict]]: + ''' + Extract the components of any single encoder or decoder input prompt. - if "prompt_token_ids" not in inputs: - tokenizer = self.get_tokenizer_group("prompts must be None if " - "skip_tokenizer_init is True") + Arguments: - prompt_token_ids = await tokenizer.encode_async( + * request_id + * inputs: single encoder or decoder input prompt + * lora_request: this is only valid for decoder prompts + + Returns: + + * prompt + * prompt_token_ids + * multi_modal_data + ''' + + if isinstance(inputs, str): + prompt = inputs + prompt_token_ids = await self._tokenize_prompt_async( + prompt, request_id=request_id, - prompt=inputs["prompt"], - lora_request=lora_request) + lora_request=lora_request, + ) + multi_modal_data = None + elif isinstance(inputs, dict): + if "prompt_token_ids" in inputs: + prompt = None + prompt_token_ids = inputs["prompt_token_ids"] + else: + # NOTE: This extra assignment is required to pass mypy + prompt = parsed_prompt = inputs["prompt"] + prompt_token_ids = await self._tokenize_prompt_async( + parsed_prompt, + request_id=request_id, + lora_request=lora_request, + ) + + multi_modal_data = inputs.get("multi_modal_data") else: - prompt_token_ids = inputs["prompt_token_ids"] + assert_never(inputs) + + return prompt, prompt_token_ids, multi_modal_data + + async def _process_decoder_only_prompt_async( + self, + inputs: SingletonPromptInputs, + request_id: str, + lora_request: Optional[LoRARequest] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> LLMInputs: + + ( + prompt, + prompt_token_ids, + multi_modal_data, + ) = await self._extract_prompt_components_async( + inputs, + request_id=request_id, + lora_request=lora_request, + ) if prompt_adapter_request: - prompt_token_ids = [ - 0 - ] * prompt_adapter_request.prompt_adapter_num_virtual_tokens + \ - prompt_token_ids + prompt_token_ids = ( + [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens + + prompt_token_ids) + + return LLMInputs(prompt_token_ids=prompt_token_ids, + prompt=prompt, + multi_modal_data=multi_modal_data) + + async def process_model_inputs_async( + self, + inputs: PromptInputs, + request_id: str, + lora_request: Optional[LoRARequest] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> Union[LLMInputs, EncoderDecoderLLMInputs]: + if self.is_encoder_decoder_model(): + # TODO: Make this async + # Encoder-decoder model requires special mapping of + # input prompts to encoder & decoder - llm_inputs = LLMInputs(prompt_token_ids=prompt_token_ids, - prompt=inputs.get("prompt"), - multi_modal_data=inputs.get("multi_modal_data")) + model_inputs = self._process_encoder_decoder_prompt( + inputs, + request_id=request_id, + ) + else: + if is_explicit_encoder_decoder_prompt(inputs): + raise ValueError("Cannot pass encoder-decoder prompt " + "to decoder-only models") + + # Decoder-only operation + model_inputs = await self._process_decoder_only_prompt_async( + inputs, + 
request_id=request_id, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, + ) - return self.input_processor(llm_inputs) + return self.input_processor(model_inputs) async def add_request_async( self, @@ -341,10 +447,11 @@ async def add_request_async( arrival_time = time.time() processed_inputs = await self.process_model_inputs_async( + inputs, request_id=request_id, - inputs=inputs, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + prompt_adapter_request=prompt_adapter_request, + ) self._add_processed_request( request_id=request_id, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 10913efbd8890..5044ea8d620c6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -5,6 +5,8 @@ from typing import Sequence as GenericSequence from typing import Set, Tuple, Type, TypeVar, Union +from typing_extensions import assert_never + import vllm.envs as envs from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoadConfig, LoRAConfig, ModelConfig, @@ -22,10 +24,12 @@ from vllm.engine.output_processor.util import create_output_by_sequence_group from vllm.executor.executor_base import ExecutorBase from vllm.executor.ray_utils import initialize_ray_cluster -from vllm.inputs import INPUT_REGISTRY, LLMInputs, PromptInputs -from vllm.inputs.parse import get_prompt_type +from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, LLMInputs, + PromptInputs, SingletonPromptInputs) +from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalDataDict from vllm.outputs import (EmbeddingRequestOutput, RequestOutput, RequestOutputFactory) from vllm.pooling_params import PoolingParams @@ -553,7 +557,7 @@ def _get_decoder_start_token_id(self, ) -> Optional[int]: def _add_processed_request( self, request_id: str, - processed_inputs: LLMInputs, + processed_inputs: Union[LLMInputs, EncoderDecoderLLMInputs], params: Union[SamplingParams, PoolingParams], arrival_time: float, lora_request: Optional[LoRARequest], @@ -613,7 +617,7 @@ def _add_processed_request( def stop_remote_worker_execution_loop(self) -> None: self.model_executor.stop_remote_worker_execution_loop() - _LLMInputComponentsType = Tuple[str, List[int], ] + _LLMInputComponentsType = Tuple[str, List[int]] def _prepare_decoder_input_ids_for_generation( self, @@ -646,7 +650,7 @@ def _prepare_decoder_input_ids_for_generation( if decoder_input_ids is None: # no decoder prompt input -> # use decoder_start_token_id as decoder_input_ids - (decoder_input_ids) = self._get_default_enc_dec_decoder_prompt() + decoder_input_ids = self._get_default_enc_dec_decoder_prompt() if (len(decoder_input_ids) == 0 or decoder_input_ids[0] != decoder_start_token_id): @@ -657,8 +661,8 @@ def _prepare_decoder_input_ids_for_generation( def _tokenize_prompt( self, prompt: str, - request_id: Optional[str] = None, - lora_request: Optional[str] = None, + request_id: str, + lora_request: Optional[LoRARequest] = None, ) -> List[int]: ''' Wrapper around application of the model's @@ -678,87 +682,60 @@ def _tokenize_prompt( tokenizer = self.get_tokenizer_group("prompts must be None if " "skip_tokenizer_init is True") - prompt_token_ids = tokenizer.encode(request_id=request_id, - prompt=prompt, - lora_request=lora_request) - - return prompt_token_ids + return tokenizer.encode(request_id=request_id, + prompt=prompt, + lora_request=lora_request) - def 
_extract_single_prompt_for_enc_dec_input( + def _extract_prompt_components( self, - inputs: Optional[PromptInputs], - request_id: Optional[str] = None, - ptype: Optional[str] = None, - is_encoder_prompt: bool = False, - ) -> Tuple[Optional[str], List[int]]: + inputs: SingletonPromptInputs, + request_id: str, + lora_request: Optional[LoRARequest] = None, + ) -> Tuple[Optional[str], List[int], Optional[MultiModalDataDict]]: ''' - Only for encoder/decoder models: - Extract prompt & prompt_token_ids from any single - encoder or decoder input prompt. For encoder input prompts - in particular, also extract multi-modal data. - - This function handles the following scenarios: - 1. The user supplied a singleton encoder prompt - & the prompt/prompt-token-ids must be extracted. - 2. The user supplied an explicit encoder/decoder - prompt & the prompt/prompt-token-ids must be - extracted from either the encoder and decoder prompts. - - For decoder prompts in particular (scenario 2), special - processing is applied to the returned decoder token ids. + Extract the components of any single encoder or decoder input prompt. Arguments: * request_id - * ptype: str representation of the input prompt type. - If `ptype` is `None`, assume that the prompt - type is unknown and must be inferred. This is the - case for ExplicitEncoderDecoder sub-prompts. * inputs: single encoder or decoder input prompt - * is_encoder_prompt: True if encoder input prompt. - If False, decoder prompt tokens - are preprocessed. + * lora_request: this is only valid for decoder prompts Returns: * prompt * prompt_token_ids + * multi_modal_data ''' - prompt_token_ids = None - ptype = (get_prompt_type(inputs) if ptype is None else ptype) - if inputs is None: - prompt = None - elif ptype == 'str': + if isinstance(inputs, str): prompt = inputs prompt_token_ids = self._tokenize_prompt( prompt, request_id=request_id, + lora_request=lora_request, ) - elif ptype == 'TokensPrompt': - prompt = None - prompt_token_ids = inputs['prompt_token_ids'] + multi_modal_data = None + elif isinstance(inputs, dict): + if "prompt_token_ids" in inputs: + prompt = None + prompt_token_ids = inputs["prompt_token_ids"] + else: + # NOTE: This extra assignment is required to pass mypy + prompt = parsed_prompt = inputs["prompt"] + prompt_token_ids = self._tokenize_prompt( + parsed_prompt, + request_id=request_id, + lora_request=lora_request, + ) + + multi_modal_data = inputs.get("multi_modal_data") else: - prompt = inputs['prompt'] - prompt_token_ids = self._tokenize_prompt( - prompt, - request_id=request_id, - ) - - if not is_encoder_prompt: - # Apply special pre-processing to - # decoder prompts - prompt_token_ids = (self._prepare_decoder_input_ids_for_generation( - prompt_token_ids, )) - - assert prompt_token_ids is not None + assert_never(inputs) - return ( - prompt, - prompt_token_ids, - ) + return prompt, prompt_token_ids, multi_modal_data - def _get_default_enc_dec_decoder_prompt(self, ) -> List[int]: + def _get_default_enc_dec_decoder_prompt(self) -> List[int]: ''' Specifically for encoder/decoder models: generate a default decoder prompt for when @@ -798,8 +775,8 @@ def _get_default_enc_dec_decoder_prompt(self, ) -> List[int]: def _process_encoder_decoder_prompt( self, inputs: PromptInputs, - request_id: Optional[str] = None, - ) -> LLMInputs: + request_id: str, + ) -> EncoderDecoderLLMInputs: ''' For encoder/decoder models only: Process an input prompt @@ -830,20 +807,17 @@ def _process_encoder_decoder_prompt( Returns: - * `LLMInputs` instance + * 
`EncoderDecoderLLMInputs` instance ''' - ptype = get_prompt_type(inputs) - # Obtain encoder and decoder prompt tokens. Note # that, no matter what, the decoder # prompt type is unknown. - if ptype == "ExplicitEncoderDecoder": + if is_explicit_encoder_decoder_prompt(inputs): # If input is explicit encoder/decoder prompt, # then it remains to be determined what type # of encoder prompt we have extracted_encoder_prompt = inputs.get('encoder_prompt') - encoder_ptype = None # Extract decoder prompt from explicit # encoder/decoder prompt extracted_decoder_prompt = inputs.get('decoder_prompt') @@ -851,7 +825,6 @@ def _process_encoder_decoder_prompt( # If input is singleton encoder prompt, then # we know the encoder prompt type extracted_encoder_prompt = inputs - encoder_ptype = ptype # Decoder prompt is always unknown if # encoder/decoder prompt is not explicit extracted_decoder_prompt = None @@ -865,32 +838,35 @@ def _process_encoder_decoder_prompt( ( encoder_prompt, encoder_prompt_token_ids, - ) = self._extract_single_prompt_for_enc_dec_input( + _, + ) = self._extract_prompt_components( extracted_encoder_prompt, request_id=request_id, - ptype=encoder_ptype, - is_encoder_prompt=True, ) # Invoke helper method to obtain # decoder prompt and prompt token ids. # - # The helper method will detect the decoder - # prompt type. - # # Helper method will also apply special # preprocessing unique to decoder prompts. - ( - decoder_prompt, - decoder_prompt_token_ids, - ) = self._extract_single_prompt_for_enc_dec_input( - extracted_decoder_prompt, - request_id=request_id, - ptype=None, - is_encoder_prompt=False, - ) + if extracted_decoder_prompt is None: + decoder_prompt_token_ids = encoder_prompt_token_ids + decoder_prompt = encoder_prompt + else: + ( + decoder_prompt, + decoder_prompt_token_ids, + _, + ) = self._extract_prompt_components( + extracted_decoder_prompt, + request_id=request_id, + ) - return LLMInputs( + decoder_prompt_token_ids = ( + self._prepare_decoder_input_ids_for_generation( + decoder_prompt_token_ids)) + + return EncoderDecoderLLMInputs( prompt_token_ids=decoder_prompt_token_ids, prompt=decoder_prompt, encoder_prompt_token_ids=encoder_prompt_token_ids, @@ -899,9 +875,9 @@ def _process_encoder_decoder_prompt( def _process_decoder_only_prompt( self, - inputs: PromptInputs, + inputs: SingletonPromptInputs, + request_id: str, lora_request: Optional[LoRARequest] = None, - request_id: Optional[str] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> LLMInputs: ''' @@ -912,8 +888,8 @@ def _process_decoder_only_prompt( Arguments: * inputs: input prompt - * lora_request * request_id + * lora_request * prompt_adapter_request Returns: @@ -921,18 +897,15 @@ def _process_decoder_only_prompt( * `LLMInputs` instance ''' - if isinstance(inputs, str): - inputs = {"prompt": inputs} - prompt = inputs.get("prompt") - - if "prompt_token_ids" not in inputs: - prompt_token_ids = self._tokenize_prompt( - prompt, - request_id=request_id, - lora_request=lora_request, - ) - else: - prompt_token_ids = inputs["prompt_token_ids"] + ( + prompt, + prompt_token_ids, + multi_modal_data, + ) = self._extract_prompt_components( + inputs, + request_id=request_id, + lora_request=lora_request, + ) if prompt_adapter_request: prompt_token_ids = ( @@ -941,15 +914,15 @@ def _process_decoder_only_prompt( return LLMInputs(prompt_token_ids=prompt_token_ids, prompt=prompt, - multi_modal_data=inputs.get("multi_modal_data")) + multi_modal_data=multi_modal_data) def process_model_inputs( self, - request_id: str, 
inputs: PromptInputs, + request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> LLMInputs: + ) -> Union[LLMInputs, EncoderDecoderLLMInputs]: if self.is_encoder_decoder_model(): # Encoder-decoder model requires special mapping of @@ -960,6 +933,10 @@ def process_model_inputs( request_id=request_id, ) else: + if is_explicit_encoder_decoder_prompt(inputs): + raise ValueError("Cannot pass encoder-decoder prompt " + "to decoder-only models") + # Decoder-only operation model_inputs = self._process_decoder_only_prompt( inputs, @@ -1029,10 +1006,11 @@ def add_request( arrival_time = time.time() processed_inputs = self.process_model_inputs( + inputs, request_id=request_id, - inputs=inputs, lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + prompt_adapter_request=prompt_adapter_request, + ) self._add_processed_request( request_id=request_id, diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py index 84871fc83ef5f..c0cd820e30c0d 100644 --- a/vllm/entrypoints/openai/logits_processors.py +++ b/vllm/entrypoints/openai/logits_processors.py @@ -40,9 +40,11 @@ def _get_allowed_token_ids_logits_processor( return AllowedTokenIdsLogitsProcessor(allowed_token_ids) -def logit_bias_logits_processor(logit_bias: Dict[str, - float], token_ids: List[int], - logits: torch.Tensor) -> torch.Tensor: +def logit_bias_logits_processor( + logit_bias: Dict[int, float], + token_ids: List[int], + logits: torch.Tensor, +) -> torch.Tensor: for token_id, bias in logit_bias.items(): logits[token_id] += bias return logits diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 1dcd1ad343b3f..0e1e7c828a71d 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,7 +1,7 @@ -from .data import (ExplicitEncoderDecoderPrompt, LLMInputs, PromptInputs, - SingletonPromptInputs, TextPrompt, TokensPrompt, - build_explicit_enc_dec_prompt, to_enc_dec_tuple_list, - zip_enc_dec_prompt_lists) +from .data import (EncoderDecoderLLMInputs, ExplicitEncoderDecoderPrompt, + LLMInputs, PromptInputs, SingletonPromptInputs, TextPrompt, + TokensPrompt, build_explicit_enc_dec_prompt, + to_enc_dec_tuple_list, zip_enc_dec_prompt_lists) from .registry import InputContext, InputRegistry INPUT_REGISTRY = InputRegistry() @@ -20,6 +20,7 @@ "ExplicitEncoderDecoderPrompt", "SingletonPromptInputs", "LLMInputs", + "EncoderDecoderLLMInputs", "build_explicit_enc_dec_prompt", "to_enc_dec_tuple_list", "zip_enc_dec_prompt_lists", diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 4cee911b43984..8732aea3a557d 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -103,7 +103,15 @@ class LLMInputs(TypedDict): The original prompt text corresponding to the token IDs, if available. """ - encoder_prompt_token_ids: NotRequired[List[int]] + multi_modal_data: NotRequired[Optional["MultiModalDataDict"]] + """ + Optional multi-modal data to pass to the model, + if the model supports it. + """ + + +class EncoderDecoderLLMInputs(LLMInputs): + encoder_prompt_token_ids: List[int] """The token IDs of the encoder prompt.""" encoder_prompt: NotRequired[Optional[str]] @@ -112,12 +120,6 @@ class LLMInputs(TypedDict): available. """ - multi_modal_data: NotRequired[Optional["MultiModalDataDict"]] - """ - Optional multi-modal data to pass to the model, - if the model supports it. 
- """ - def build_explicit_enc_dec_prompt( encoder_prompt: SingletonPromptInputs, diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index 42bd9858bcbe1..984140f3651ce 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -1,9 +1,11 @@ -from typing import (List, Literal, Optional, Sequence, TypedDict, Union, - overload) +from typing import List, Literal, Sequence, TypedDict, Union, overload + +from typing_extensions import TypeIs from vllm.utils import is_list_of -from .data import LLMInputs, PromptInputs +from .data import (EncoderDecoderLLMInputs, ExplicitEncoderDecoderPrompt, + LLMInputs, PromptInputs) class ParsedText(TypedDict): @@ -63,63 +65,15 @@ def parse_and_batch_prompt( "array of tokens, or array of token arrays") -def _has_required_keys( - d: dict, - required_keys: set, -) -> bool: - return required_keys.issubset(d.keys()) +def is_explicit_encoder_decoder_prompt( + inputs: PromptInputs) -> TypeIs[ExplicitEncoderDecoderPrompt]: + return isinstance(inputs, dict) and "encoder_prompt" in inputs -def get_prompt_type(prompt: Optional[PromptInputs]) -> Optional[str]: - """ - Get the type-name of the prompt argument instance, given that - isinstance() cannot apply to TypedDict subclasses directly. - If the prompt is None, return 'None' as the type name. - - Arguments: - - * prompt: LLM input prompt or None - - Returns: - - * String representation of prompt type - """ - - if prompt is None: - return 'None' - - required_keys_dict = { - 'TextPrompt': {'prompt'}, - 'TokensPrompt': {'prompt_token_ids'}, - 'ExplicitEncoderDecoder': {'encoder_prompt', 'decoder_prompt'}, - } - - if isinstance(prompt, dict): - for (ptype, required_keys) in required_keys_dict.items(): - # Ignore type checking in the conditional below because type - # checker does not understand that is_dict(prompt) narrows - # down the possible types - if _has_required_keys( - prompt, # type: ignore - required_keys): - return ptype - - raise ValueError(f"Invalid prompt {prompt}, valid types are " - f"required_keys_dict={required_keys_dict}") - - if isinstance(prompt, str): - return "str" - - raise ValueError(f"Invalid prompt {prompt}") - - -def is_valid_encoder_decoder_llm_inputs(inputs: LLMInputs) -> bool: +def is_valid_encoder_decoder_llm_inputs( + inputs: LLMInputs) -> TypeIs[EncoderDecoderLLMInputs]: """ Return True if the LLMInputs instance has the correct configuration for encoder/decoder. 
""" - - # True if encoder prompt token ids field exists & - # is not None - return ('encoder_prompt_token_ids' in inputs - and inputs['encoder_prompt_token_ids'] is not None) + return "encoder_prompt_token_ids" in inputs From 6332d1eb9ffa14039ac55e084bc1e0b2d291a367 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 09:16:09 +0000 Subject: [PATCH 04/30] Remove unnecessary comments --- vllm/engine/llm_engine.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5044ea8d620c6..f380b6e817893 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -829,12 +829,6 @@ def _process_encoder_decoder_prompt( # encoder/decoder prompt is not explicit extracted_decoder_prompt = None - # Invoke helper function to obtain encoder - # prompt and prompt token ids, either from - # singleton encoder prompt or from the - # encoder sub-prompt of an explicit - # encoder/decode scenario 2), special - # processing is applied to the returned decoder token ids ( encoder_prompt, encoder_prompt_token_ids, @@ -844,11 +838,6 @@ def _process_encoder_decoder_prompt( request_id=request_id, ) - # Invoke helper method to obtain - # decoder prompt and prompt token ids. - # - # Helper method will also apply special - # preprocessing unique to decoder prompts. if extracted_decoder_prompt is None: decoder_prompt_token_ids = encoder_prompt_token_ids decoder_prompt = encoder_prompt From 07b4d211ecb080a4a2aa9c455823a37d83e46c1b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 09:31:19 +0000 Subject: [PATCH 05/30] Enable full async --- vllm/engine/async_llm_engine.py | 92 +++++++++++++++++++-------------- vllm/engine/llm_engine.py | 68 +++++++++++++----------- 2 files changed, 91 insertions(+), 69 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 2200003e4b841..10dba270717e7 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -301,21 +301,7 @@ async def _tokenize_prompt_async( request_id: str, lora_request: Optional[LoRARequest] = None, ) -> List[int]: - ''' - Wrapper around application of the model's - tokenizer. - - Arguments: - - * prompt - * request_id - * lora_request - - Returns: - - * prompt token ids - ''' - + """Async version of :meth:`_tokenize_prompt`.""" tokenizer = self.get_tokenizer_group("prompts must be None if " "skip_tokenizer_init is True") @@ -329,22 +315,7 @@ async def _extract_prompt_components_async( request_id: str, lora_request: Optional[LoRARequest] = None, ) -> Tuple[Optional[str], List[int], Optional[MultiModalDataDict]]: - ''' - Extract the components of any single encoder or decoder input prompt. 
- - Arguments: - - * request_id - * inputs: single encoder or decoder input prompt - * lora_request: this is only valid for decoder prompts - - Returns: - - * prompt - * prompt_token_ids - * multi_modal_data - ''' - + """Async version of :meth:`_extract_prompt_components`.""" if isinstance(inputs, str): prompt = inputs prompt_token_ids = await self._tokenize_prompt_async( @@ -372,6 +343,51 @@ async def _extract_prompt_components_async( return prompt, prompt_token_ids, multi_modal_data + async def _process_encoder_decoder_prompt_async( + self, + inputs: PromptInputs, + request_id: str, + ) -> EncoderDecoderLLMInputs: + """Async version of :meth:`_process_encoder_decoder_prompt`.""" + explicit_inputs = self._to_explicit_encoder_decoder_prompt(inputs) + extracted_encoder_prompt = explicit_inputs["encoder_prompt"] + extracted_decoder_prompt = explicit_inputs["decoder_prompt"] + + ( + encoder_prompt, + encoder_prompt_token_ids, + _, + ) = await self._extract_prompt_components_async( + extracted_encoder_prompt, + request_id=request_id, + ) + + # Avoid repeated processing if the inputs was originally in singleton + # form, see self._to_explicit_encoder_decoder_prompt + if extracted_decoder_prompt is extracted_encoder_prompt: + decoder_prompt_token_ids = encoder_prompt_token_ids + decoder_prompt = encoder_prompt + else: + ( + decoder_prompt, + decoder_prompt_token_ids, + _, + ) = await self._extract_prompt_components_async( + extracted_decoder_prompt, + request_id=request_id, + ) + + decoder_prompt_token_ids = ( + self._prepare_decoder_input_ids_for_generation( + decoder_prompt_token_ids)) + + return EncoderDecoderLLMInputs( + prompt_token_ids=decoder_prompt_token_ids, + prompt=decoder_prompt, + encoder_prompt_token_ids=encoder_prompt_token_ids, + encoder_prompt=encoder_prompt, + ) + async def _process_decoder_only_prompt_async( self, inputs: SingletonPromptInputs, @@ -379,7 +395,7 @@ async def _process_decoder_only_prompt_async( lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> LLMInputs: - + """Async version of :meth:`_process_decoder_only_prompt`.""" ( prompt, prompt_token_ids, @@ -390,10 +406,8 @@ async def _process_decoder_only_prompt_async( lora_request=lora_request, ) - if prompt_adapter_request: - prompt_token_ids = ( - [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens - + prompt_token_ids) + prompt_token_ids = self._apply_prompt_adapter( + prompt_token_ids, prompt_adapter_request=prompt_adapter_request) return LLMInputs(prompt_token_ids=prompt_token_ids, prompt=prompt, @@ -406,12 +420,11 @@ async def process_model_inputs_async( lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> Union[LLMInputs, EncoderDecoderLLMInputs]: + """Async version of :meth:`process_model_inputs`.""" if self.is_encoder_decoder_model(): - # TODO: Make this async # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder - - model_inputs = self._process_encoder_decoder_prompt( + model_inputs = await self._process_encoder_decoder_prompt_async( inputs, request_id=request_id, ) @@ -440,6 +453,7 @@ async def add_request_async( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> None: + """Async version of :meth:`add_request`.""" if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") diff --git 
a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f380b6e817893..204a5c28867e2 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -24,8 +24,9 @@ from vllm.engine.output_processor.util import create_output_by_sequence_group from vllm.executor.executor_base import ExecutorBase from vllm.executor.ray_utils import initialize_ray_cluster -from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, LLMInputs, - PromptInputs, SingletonPromptInputs) +from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, + ExplicitEncoderDecoderPrompt, LLMInputs, PromptInputs, + SingletonPromptInputs) from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -735,6 +736,18 @@ def _extract_prompt_components( return prompt, prompt_token_ids, multi_modal_data + def _apply_prompt_adapter( + self, + prompt_token_ids: List[int], + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[int]: + if prompt_adapter_request: + prompt_token_ids = ( + [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens + + prompt_token_ids) + + return prompt_token_ids + def _get_default_enc_dec_decoder_prompt(self) -> List[int]: ''' Specifically for encoder/decoder models: @@ -769,8 +782,19 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]: bos_token_id = self._get_bos_token_id() assert bos_token_id is not None - prompt_token_ids: List[int] = [bos_token_id] - return prompt_token_ids + return [bos_token_id] + + def _to_explicit_encoder_decoder_prompt( + self, + inputs: PromptInputs, + ) -> ExplicitEncoderDecoderPrompt: + if is_explicit_encoder_decoder_prompt(inputs): + return inputs + + return ExplicitEncoderDecoderPrompt( + encoder_prompt=inputs, + decoder_prompt=inputs, + ) def _process_encoder_decoder_prompt( self, @@ -779,8 +803,8 @@ def _process_encoder_decoder_prompt( ) -> EncoderDecoderLLMInputs: ''' For encoder/decoder models only: - Process an input prompt - into an `LLMInputs` instance. + Process an input prompt into an + :class:`EncoderDecoderLLMInputs` instance. There are two types of input prompts: singleton prompts which carry only the @@ -810,24 +834,9 @@ def _process_encoder_decoder_prompt( * `EncoderDecoderLLMInputs` instance ''' - # Obtain encoder and decoder prompt tokens. Note - # that, no matter what, the decoder - # prompt type is unknown. 
- if is_explicit_encoder_decoder_prompt(inputs): - # If input is explicit encoder/decoder prompt, - # then it remains to be determined what type - # of encoder prompt we have - extracted_encoder_prompt = inputs.get('encoder_prompt') - # Extract decoder prompt from explicit - # encoder/decoder prompt - extracted_decoder_prompt = inputs.get('decoder_prompt') - else: - # If input is singleton encoder prompt, then - # we know the encoder prompt type - extracted_encoder_prompt = inputs - # Decoder prompt is always unknown if - # encoder/decoder prompt is not explicit - extracted_decoder_prompt = None + explicit_inputs = self._to_explicit_encoder_decoder_prompt(inputs) + extracted_encoder_prompt = explicit_inputs["encoder_prompt"] + extracted_decoder_prompt = explicit_inputs["decoder_prompt"] ( encoder_prompt, @@ -838,7 +847,9 @@ def _process_encoder_decoder_prompt( request_id=request_id, ) - if extracted_decoder_prompt is None: + # Avoid repeated processing if the inputs was originally in singleton + # form, see self._to_explicit_encoder_decoder_prompt + if extracted_decoder_prompt is extracted_encoder_prompt: decoder_prompt_token_ids = encoder_prompt_token_ids decoder_prompt = encoder_prompt else: @@ -896,10 +907,8 @@ def _process_decoder_only_prompt( lora_request=lora_request, ) - if prompt_adapter_request: - prompt_token_ids = ( - [0] * prompt_adapter_request.prompt_adapter_num_virtual_tokens - + prompt_token_ids) + prompt_token_ids = self._apply_prompt_adapter( + prompt_token_ids, prompt_adapter_request=prompt_adapter_request) return LLMInputs(prompt_token_ids=prompt_token_ids, prompt=prompt, @@ -916,7 +925,6 @@ def process_model_inputs( if self.is_encoder_decoder_model(): # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder - model_inputs = self._process_encoder_decoder_prompt( inputs, request_id=request_id, From e29864cdcc9ff7837b9e05a250d0129b89b6f6e2 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 09:32:44 +0000 Subject: [PATCH 06/30] grammar --- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/llm_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 10dba270717e7..5626f3a2a3d8f 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -362,7 +362,7 @@ async def _process_encoder_decoder_prompt_async( request_id=request_id, ) - # Avoid repeated processing if the inputs was originally in singleton + # Avoid repeated processing if the input was originally in singleton # form, see self._to_explicit_encoder_decoder_prompt if extracted_decoder_prompt is extracted_encoder_prompt: decoder_prompt_token_ids = encoder_prompt_token_ids diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 204a5c28867e2..d9e6f6912fcb7 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -847,7 +847,7 @@ def _process_encoder_decoder_prompt( request_id=request_id, ) - # Avoid repeated processing if the inputs was originally in singleton + # Avoid repeated processing if the input was originally in singleton # form, see self._to_explicit_encoder_decoder_prompt if extracted_decoder_prompt is extracted_encoder_prompt: decoder_prompt_token_ids = encoder_prompt_token_ids From c9dfb401f7963910a1e416fc9c31344b7c31ed32 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 09:40:10 +0000 Subject: [PATCH 07/30] Add description --- vllm/inputs/data.py | 8 ++++++++ 1 file changed, 8 insertions(+) 
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 8732aea3a557d..b65e5d5f06869 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -94,6 +94,8 @@ class LLMInputs(TypedDict): """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. + + This includes the data required for decoder-only models. """ prompt_token_ids: List[int] """The token IDs of the prompt.""" @@ -111,6 +113,12 @@ class LLMInputs(TypedDict): class EncoderDecoderLLMInputs(LLMInputs): + """ + The inputs in :class:`~vllm.LLMEngine` before they are + passed to the model executor. + + This includes the required data for encoder-decoder models. + """ encoder_prompt_token_ids: List[int] """The token IDs of the encoder prompt.""" From 123319227828d1c4e1b82ffee7dc67f1012ea7aa Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 09:50:15 +0000 Subject: [PATCH 08/30] Fix wrong type annotations --- tests/conftest.py | 15 +++++++-------- vllm/inputs/data.py | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index b0adfc58bcda1..5bfb8fc132a8f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,8 +21,8 @@ from vllm.connections import global_http_connection from vllm.distributed import (destroy_distributed_environment, destroy_model_parallel) -from vllm.inputs import (TextPrompt, to_enc_dec_tuple_list, - zip_enc_dec_prompt_lists) +from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, + to_enc_dec_tuple_list, zip_enc_dec_prompt_lists) from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sequence import SampleLogprobs @@ -125,9 +125,8 @@ def example_prompts() -> List[str]: @pytest.fixture -def example_encoder_decoder_prompts() \ - -> Dict[DecoderPromptType, - Tuple[List[str], List[Optional[str]]]]: +def example_encoder_decoder_prompts( +) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]: ''' Returns an encoder prompt list and a decoder prompt list, wherein each pair of same-index entries in both lists corresponds to an (encoder prompt, @@ -444,7 +443,7 @@ def generate_greedy_logprobs_limit( def generate_encoder_decoder_greedy_logprobs_limit( self, - encoder_decoder_prompts: Tuple[List[str], List[str]], + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt], max_tokens: int, num_logprobs: int, **kwargs: Any, @@ -608,7 +607,7 @@ def generate_w_logprobs( def generate_encoder_decoder_w_logprobs( self, - encoder_decoder_prompts: Tuple[List[str], List[str]], + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt], sampling_params: SamplingParams, ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ''' @@ -653,7 +652,7 @@ def generate_greedy_logprobs( def generate_encoder_decoder_greedy_logprobs( self, - encoder_decoder_prompts: Tuple[List[str], List[str]], + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt], max_tokens: int, num_logprobs: int, ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index b65e5d5f06869..1d5b6b3fcdc09 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -150,7 +150,7 @@ def zip_enc_dec_prompt_lists( def to_enc_dec_tuple_list( enc_dec_prompts: List[ExplicitEncoderDecoderPrompt], -) -> List[Tuple[PromptInputs, PromptInputs]]: +) -> List[Tuple[SingletonPromptInputs, SingletonPromptInputs]]: return [(enc_dec_prompt['encoder_prompt'], enc_dec_prompt['decoder_prompt']) for enc_dec_prompt in enc_dec_prompts] From 
dcdebee669a85192d73c89ddb65732fc3c9594a3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 09:58:23 +0000 Subject: [PATCH 09/30] Remove redundant docs --- vllm/inputs/data.py | 4 ++-- vllm/inputs/parse.py | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 1d5b6b3fcdc09..d83297a32cbb1 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -95,7 +95,7 @@ class LLMInputs(TypedDict): The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. - This includes the data required for decoder-only models. + This specifies the data required for decoder-only models. """ prompt_token_ids: List[int] """The token IDs of the prompt.""" @@ -117,7 +117,7 @@ class EncoderDecoderLLMInputs(LLMInputs): The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. - This includes the required data for encoder-decoder models. + This specifies the required data for encoder-decoder models. """ encoder_prompt_token_ids: List[int] """The token IDs of the encoder prompt.""" diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index 984140f3651ce..840bc8a49fb38 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -72,8 +72,4 @@ def is_explicit_encoder_decoder_prompt( def is_valid_encoder_decoder_llm_inputs( inputs: LLMInputs) -> TypeIs[EncoderDecoderLLMInputs]: - """ - Return True if the LLMInputs instance has the correct configuration - for encoder/decoder. - """ return "encoder_prompt_token_ids" in inputs From 65db3f1914f0b39bf4b71eba6835f76461587db1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 09:59:37 +0000 Subject: [PATCH 10/30] Be more strict --- vllm/inputs/parse.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index 840bc8a49fb38..b55f6003d575d 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -71,5 +71,6 @@ def is_explicit_encoder_decoder_prompt( def is_valid_encoder_decoder_llm_inputs( - inputs: LLMInputs) -> TypeIs[EncoderDecoderLLMInputs]: + inputs: Union[LLMInputs, EncoderDecoderLLMInputs], +) -> TypeIs[EncoderDecoderLLMInputs]: return "encoder_prompt_token_ids" in inputs From 9ffeb222f84d0ac8139e550b1cbbf5ecefc7e484 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 10:03:28 +0000 Subject: [PATCH 11/30] Fix docs --- vllm/inputs/__init__.py | 2 +- vllm/inputs/data.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 0e1e7c828a71d..e8f8a40fbd184 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -17,8 +17,8 @@ "TextPrompt", "TokensPrompt", "PromptInputs", - "ExplicitEncoderDecoderPrompt", "SingletonPromptInputs", + "ExplicitEncoderDecoderPrompt", "LLMInputs", "EncoderDecoderLLMInputs", "build_explicit_enc_dec_prompt", diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index d83297a32cbb1..57f3af9d54209 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -43,14 +43,14 @@ class TokensPrompt(TypedDict): which encapsulates multiple prompts, i.e. of the sort which may be utilized for encoder/decoder models when the user desires to express both the encoder & decoder -prompts explicitly, i.e. ExplicitEncoderDecoderPrompt +prompts explicitly, i.e. 
:class:`ExplicitEncoderDecoderPrompt` -A prompt of type SingletonPromptInputs may be employed +A prompt of type :class:`SingletonPromptInputs` may be employed as (1) input to a decoder-only model, (2) input to the encoder of an encoder/decoder model, in the scenario where the decoder-prompt is not specified explicitly, or (3) as a member of a larger data structure encapsulating -more than one prompt, i.e. ExplicitEncoderDecoderPrompt +more than one prompt, i.e. :class:`ExplicitEncoderDecoderPrompt` """ @@ -61,7 +61,7 @@ class ExplicitEncoderDecoderPrompt(TypedDict): The encoder and decoder prompts, respectively, may formatted according to any of the - SingletonPromptInputs schemas, and are not + :class:`SingletonPromptInputs` schemas, and are not required to have the same schema. Only the encoder prompt may have multi-modal data. @@ -69,8 +69,8 @@ class ExplicitEncoderDecoderPrompt(TypedDict): Note that an ExplicitEncoderDecoderPrompt may not be used as an input to a decoder-only model, and that the `encoder_prompt` and `decoder_prompt` - fields of this data structure may not themselves - must be SingletonPromptInputs instances. + fields of this data structure themselves must be + :class:`SingletonPromptInputs` instances. """ encoder_prompt: SingletonPromptInputs From c9e0b081561f6a2299027721302f8b99c1f99174 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 10:05:38 +0000 Subject: [PATCH 12/30] Fix 2 --- vllm/engine/llm_engine.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d9e6f6912fcb7..54d81ea5587a9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -666,8 +666,7 @@ def _tokenize_prompt( lora_request: Optional[LoRARequest] = None, ) -> List[int]: ''' - Wrapper around application of the model's - tokenizer. + Wrapper around application of the model's tokenizer. Arguments: @@ -831,7 +830,7 @@ def _process_encoder_decoder_prompt( Returns: - * `EncoderDecoderLLMInputs` instance + * :class:`EncoderDecoderLLMInputs` instance ''' explicit_inputs = self._to_explicit_encoder_decoder_prompt(inputs) @@ -882,8 +881,7 @@ def _process_decoder_only_prompt( ) -> LLMInputs: ''' For decoder-only models: - Process an input prompt - into an `LLMInputs` instance. + Process an input prompt into an :class:`LLMInputs` instance. 
Arguments: @@ -894,7 +892,7 @@ def _process_decoder_only_prompt( Returns: - * `LLMInputs` instance + * :class:`LLMInputs` instance ''' ( From 14bca1ff1fa90f44b00aa65a5d664115fecb5c55 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 10:11:59 +0000 Subject: [PATCH 13/30] Disallow multi-modal data for enc/dec models --- vllm/engine/async_llm_engine.py | 13 +++++++++++-- vllm/engine/llm_engine.py | 13 +++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 5626f3a2a3d8f..e53ebdd35e268 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -356,27 +356,36 @@ async def _process_encoder_decoder_prompt_async( ( encoder_prompt, encoder_prompt_token_ids, - _, + encoder_multi_modal_data, ) = await self._extract_prompt_components_async( extracted_encoder_prompt, request_id=request_id, ) + if encoder_multi_modal_data is not None: + raise ValueError("Multi-modal data is not supported for " + "(language) encoder-decoder models") + # Avoid repeated processing if the input was originally in singleton # form, see self._to_explicit_encoder_decoder_prompt if extracted_decoder_prompt is extracted_encoder_prompt: decoder_prompt_token_ids = encoder_prompt_token_ids decoder_prompt = encoder_prompt + decoder_multi_modal_data = encoder_multi_modal_data else: ( decoder_prompt, decoder_prompt_token_ids, - _, + decoder_multi_modal_data, ) = await self._extract_prompt_components_async( extracted_decoder_prompt, request_id=request_id, ) + if decoder_multi_modal_data is not None: + raise ValueError("Multi-modal data is not supported for " + "(language) encoder-decoder models") + decoder_prompt_token_ids = ( self._prepare_decoder_input_ids_for_generation( decoder_prompt_token_ids)) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 54d81ea5587a9..09685b4586d25 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -840,27 +840,36 @@ def _process_encoder_decoder_prompt( ( encoder_prompt, encoder_prompt_token_ids, - _, + encoder_multi_modal_data, ) = self._extract_prompt_components( extracted_encoder_prompt, request_id=request_id, ) + if encoder_multi_modal_data is not None: + raise ValueError("Multi-modal data is not supported for " + "(language) encoder-decoder models") + # Avoid repeated processing if the input was originally in singleton # form, see self._to_explicit_encoder_decoder_prompt if extracted_decoder_prompt is extracted_encoder_prompt: decoder_prompt_token_ids = encoder_prompt_token_ids decoder_prompt = encoder_prompt + decoder_multi_modal_data = encoder_multi_modal_data else: ( decoder_prompt, decoder_prompt_token_ids, - _, + decoder_multi_modal_data, ) = self._extract_prompt_components( extracted_decoder_prompt, request_id=request_id, ) + if decoder_multi_modal_data is not None: + raise ValueError("Multi-modal data is not supported for " + "(language) encoder-decoder models") + decoder_prompt_token_ids = ( self._prepare_decoder_input_ids_for_generation( decoder_prompt_token_ids)) From 8fc7099c48935ce0bf253d1a1367f32077a7e6c5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 10:22:45 +0000 Subject: [PATCH 14/30] Improve type narrowing behavior using `TypeIs` --- vllm/model_executor/models/interfaces.py | 22 +++++++++++----------- vllm/utils.py | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 
6fdacd4469788..db0d6b429d64d 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,7 +1,7 @@ from typing import (ClassVar, Dict, List, Literal, Optional, Protocol, Type, Union, overload, runtime_checkable) -from typing_extensions import TypeGuard +from typing_extensions import TypeIs from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig from vllm.logger import init_logger @@ -37,18 +37,18 @@ def __call__(self, *, multimodal_config: MultiModalConfig) -> None: @overload -def supports_vision(model: Type[object]) -> TypeGuard[Type[SupportsVision]]: +def supports_vision(model: Type[object]) -> TypeIs[Type[SupportsVision]]: ... @overload -def supports_vision(model: object) -> TypeGuard[SupportsVision]: +def supports_vision(model: object) -> TypeIs[SupportsVision]: ... def supports_vision( model: Union[Type[object], object], -) -> Union[TypeGuard[Type[SupportsVision]], TypeGuard[SupportsVision]]: +) -> Union[TypeIs[Type[SupportsVision]], TypeIs[SupportsVision]]: if isinstance(model, type): return isinstance(model, _SupportsVisionType) @@ -94,18 +94,18 @@ def __call__(self, *, lora_config: Optional[LoRAConfig] = None) -> None: @overload -def supports_lora(model: Type[object]) -> TypeGuard[Type[SupportsLoRA]]: +def supports_lora(model: Type[object]) -> TypeIs[Type[SupportsLoRA]]: ... @overload -def supports_lora(model: object) -> TypeGuard[SupportsLoRA]: +def supports_lora(model: object) -> TypeIs[SupportsLoRA]: ... def supports_lora( model: Union[Type[object], object], -) -> Union[TypeGuard[Type[SupportsLoRA]], TypeGuard[SupportsLoRA]]: +) -> Union[TypeIs[Type[SupportsLoRA]], TypeIs[SupportsLoRA]]: result = _supports_lora(model) if not result: @@ -137,7 +137,7 @@ def supports_lora( def _supports_lora( model: Union[Type[object], object], -) -> Union[TypeGuard[Type[SupportsLoRA]], TypeGuard[SupportsLoRA]]: +) -> Union[TypeIs[Type[SupportsLoRA]], TypeIs[SupportsLoRA]]: if isinstance(model, type): return isinstance(model, _SupportsLoRAType) @@ -172,18 +172,18 @@ def __init__(self, @overload -def has_inner_state(model: object) -> TypeGuard[HasInnerState]: +def has_inner_state(model: object) -> TypeIs[HasInnerState]: ... @overload -def has_inner_state(model: Type[object]) -> TypeGuard[Type[HasInnerState]]: +def has_inner_state(model: Type[object]) -> TypeIs[Type[HasInnerState]]: ... 
def has_inner_state( model: Union[Type[object], object] -) -> Union[TypeGuard[Type[HasInnerState]], TypeGuard[HasInnerState]]: +) -> Union[TypeIs[Type[HasInnerState]], TypeIs[HasInnerState]]: if isinstance(model, type): return isinstance(model, _HasInnerStateType) diff --git a/vllm/utils.py b/vllm/utils.py index eb88fce4af0ce..fcfdfe85ed145 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -25,7 +25,7 @@ import psutil import torch import torch.types -from typing_extensions import ParamSpec, TypeGuard, assert_never +from typing_extensions import ParamSpec, TypeIs, assert_never import vllm.envs as envs from vllm import _custom_ops as ops @@ -811,7 +811,7 @@ def is_list_of( typ: Type[T], *, check: Literal["first", "all"] = "first", -) -> TypeGuard[List[T]]: +) -> TypeIs[List[T]]: if not isinstance(value, list): return False From 3a8a072d16a6ec4305498ab54abe707dcdee4483 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 11:02:58 +0000 Subject: [PATCH 15/30] Avoid sequential await --- vllm/engine/async_llm_engine.py | 48 +++++++++++------------ vllm/engine/llm_engine.py | 68 +++++++++++++-------------------- 2 files changed, 49 insertions(+), 67 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index e53ebdd35e268..8c3d591a56397 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -349,40 +349,36 @@ async def _process_encoder_decoder_prompt_async( request_id: str, ) -> EncoderDecoderLLMInputs: """Async version of :meth:`_process_encoder_decoder_prompt`.""" - explicit_inputs = self._to_explicit_encoder_decoder_prompt(inputs) - extracted_encoder_prompt = explicit_inputs["encoder_prompt"] - extracted_decoder_prompt = explicit_inputs["decoder_prompt"] - - ( - encoder_prompt, - encoder_prompt_token_ids, - encoder_multi_modal_data, - ) = await self._extract_prompt_components_async( - extracted_encoder_prompt, - request_id=request_id, - ) + if is_explicit_encoder_decoder_prompt(inputs): + encoder_task = self._extract_prompt_components_async( + inputs["encoder_prompt"], + request_id=request_id, + ) - if encoder_multi_modal_data is not None: - raise ValueError("Multi-modal data is not supported for " - "(language) encoder-decoder models") + decoder_task = self._extract_prompt_components_async( + inputs["decoder_prompt"], + request_id=request_id, + ) - # Avoid repeated processing if the input was originally in singleton - # form, see self._to_explicit_encoder_decoder_prompt - if extracted_decoder_prompt is extracted_encoder_prompt: - decoder_prompt_token_ids = encoder_prompt_token_ids - decoder_prompt = encoder_prompt - decoder_multi_modal_data = encoder_multi_modal_data + ( + (encoder_prompt, encoder_prompt_token_ids, encoder_mm_data), + (decoder_prompt, decoder_prompt_token_ids, decoder_mm_data), + ) = await asyncio.gather(encoder_task, decoder_task) else: ( - decoder_prompt, - decoder_prompt_token_ids, - decoder_multi_modal_data, + encoder_prompt, + encoder_prompt_token_ids, + encoder_mm_data, ) = await self._extract_prompt_components_async( - extracted_decoder_prompt, + inputs, request_id=request_id, ) - if decoder_multi_modal_data is not None: + decoder_prompt_token_ids = encoder_prompt_token_ids + decoder_prompt = encoder_prompt + decoder_mm_data = encoder_mm_data + + if encoder_mm_data is not None or decoder_mm_data is not None: raise ValueError("Multi-modal data is not supported for " "(language) encoder-decoder models") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 
09685b4586d25..7501327ef2712 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -24,9 +24,8 @@ from vllm.engine.output_processor.util import create_output_by_sequence_group from vllm.executor.executor_base import ExecutorBase from vllm.executor.ray_utils import initialize_ray_cluster -from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, - ExplicitEncoderDecoderPrompt, LLMInputs, PromptInputs, - SingletonPromptInputs) +from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, LLMInputs, + PromptInputs, SingletonPromptInputs) from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -783,18 +782,6 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]: assert bos_token_id is not None return [bos_token_id] - def _to_explicit_encoder_decoder_prompt( - self, - inputs: PromptInputs, - ) -> ExplicitEncoderDecoderPrompt: - if is_explicit_encoder_decoder_prompt(inputs): - return inputs - - return ExplicitEncoderDecoderPrompt( - encoder_prompt=inputs, - decoder_prompt=inputs, - ) - def _process_encoder_decoder_prompt( self, inputs: PromptInputs, @@ -833,40 +820,39 @@ def _process_encoder_decoder_prompt( * :class:`EncoderDecoderLLMInputs` instance ''' - explicit_inputs = self._to_explicit_encoder_decoder_prompt(inputs) - extracted_encoder_prompt = explicit_inputs["encoder_prompt"] - extracted_decoder_prompt = explicit_inputs["decoder_prompt"] - - ( - encoder_prompt, - encoder_prompt_token_ids, - encoder_multi_modal_data, - ) = self._extract_prompt_components( - extracted_encoder_prompt, - request_id=request_id, - ) - - if encoder_multi_modal_data is not None: - raise ValueError("Multi-modal data is not supported for " - "(language) encoder-decoder models") + if is_explicit_encoder_decoder_prompt(inputs): + ( + encoder_prompt, + encoder_prompt_token_ids, + encoder_mm_data, + ) = self._extract_prompt_components( + inputs["encoder_prompt"], + request_id=request_id, + ) - # Avoid repeated processing if the input was originally in singleton - # form, see self._to_explicit_encoder_decoder_prompt - if extracted_decoder_prompt is extracted_encoder_prompt: - decoder_prompt_token_ids = encoder_prompt_token_ids - decoder_prompt = encoder_prompt - decoder_multi_modal_data = encoder_multi_modal_data - else: ( decoder_prompt, decoder_prompt_token_ids, - decoder_multi_modal_data, + decoder_mm_data, ) = self._extract_prompt_components( - extracted_decoder_prompt, + inputs["decoder_prompt"], request_id=request_id, ) + else: + ( + encoder_prompt, + encoder_prompt_token_ids, + encoder_mm_data, + ) = self._extract_prompt_components( + inputs, + request_id=request_id, + ) + + decoder_prompt_token_ids = encoder_prompt_token_ids + decoder_prompt = encoder_prompt + decoder_mm_data = encoder_mm_data - if decoder_multi_modal_data is not None: + if encoder_mm_data is not None or decoder_mm_data is not None: raise ValueError("Multi-modal data is not supported for " "(language) encoder-decoder models") From ef5327c24506392d2c635cab445fb7614fdfba8c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 11:36:40 +0000 Subject: [PATCH 16/30] Fix type annotations based on test files --- examples/offline_inference_encoder_decoder.py | 6 +-- tests/conftest.py | 23 ++++++---- ...t_basic_distributed_correctness_enc_dec.py | 2 +- tests/models/test_bart.py | 3 +- tests/models/utils.py | 11 ----- vllm/inputs/__init__.py | 4 +- vllm/inputs/data.py | 42 +++++++++++-------- 7 files changed, 46 
insertions(+), 45 deletions(-) diff --git a/examples/offline_inference_encoder_decoder.py b/examples/offline_inference_encoder_decoder.py index c05e8e8bb6f11..0f266d7918853 100644 --- a/examples/offline_inference_encoder_decoder.py +++ b/examples/offline_inference_encoder_decoder.py @@ -5,7 +5,7 @@ from vllm import LLM, SamplingParams from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - TokensPrompt, zip_enc_dec_prompt_lists) + TokensPrompt, zip_enc_dec_prompts) dtype = "float" @@ -61,9 +61,9 @@ ) # - Finally, here's a useful helper function for zipping encoder and -# decoder prompt lists together into a list of ExplicitEncoderDecoderPrompt +# decoder prompts together into a list of ExplicitEncoderDecoderPrompt # instances -zipped_prompt_list = zip_enc_dec_prompt_lists( +zipped_prompt_list = zip_enc_dec_prompts( ['An encoder prompt', 'Another encoder prompt'], ['A decoder prompt', 'Another decoder prompt']) diff --git a/tests/conftest.py b/tests/conftest.py index 5bfb8fc132a8f..5163b5c186e72 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,7 @@ import os import sys from collections import UserList +from enum import Enum from typing import Any, Dict, List, Optional, Tuple, TypedDict, TypeVar, Union import pytest @@ -14,7 +15,6 @@ AutoModelForVision2Seq, AutoTokenizer, BatchEncoding, BatchFeature) -from tests.models.utils import DecoderPromptType from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from vllm.config import TokenizerPoolConfig @@ -22,7 +22,7 @@ from vllm.distributed import (destroy_distributed_environment, destroy_model_parallel) from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, - to_enc_dec_tuple_list, zip_enc_dec_prompt_lists) + to_enc_dec_tuple_list, zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sequence import SampleLogprobs @@ -124,6 +124,13 @@ def example_prompts() -> List[str]: return prompts +class DecoderPromptType(Enum): + """For encoder/decoder models only.""" + CUSTOM = 1 + NONE = 2 + EMPTY_STR = 3 + + @pytest.fixture def example_encoder_decoder_prompts( ) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]: @@ -149,11 +156,11 @@ def example_encoder_decoder_prompts( # NONE decoder prompt type return { DecoderPromptType.NONE: - zip_enc_dec_prompt_lists(encoder_prompts, none_decoder_prompts), + zip_enc_dec_prompts(encoder_prompts, none_decoder_prompts), DecoderPromptType.EMPTY_STR: - zip_enc_dec_prompt_lists(encoder_prompts, empty_str_decoder_prompts), + zip_enc_dec_prompts(encoder_prompts, empty_str_decoder_prompts), DecoderPromptType.CUSTOM: - zip_enc_dec_prompt_lists(encoder_prompts, custom_decoder_prompts), + zip_enc_dec_prompts(encoder_prompts, custom_decoder_prompts), } @@ -443,7 +450,7 @@ def generate_greedy_logprobs_limit( def generate_encoder_decoder_greedy_logprobs_limit( self, - encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt], + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str]], max_tokens: int, num_logprobs: int, **kwargs: Any, @@ -607,7 +614,7 @@ def generate_w_logprobs( def generate_encoder_decoder_w_logprobs( self, - encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt], + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str]], sampling_params: SamplingParams, ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ''' @@ -652,7 +659,7 @@ def generate_greedy_logprobs( def generate_encoder_decoder_greedy_logprobs( self, - encoder_decoder_prompts: 
List[ExplicitEncoderDecoderPrompt], + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str]], max_tokens: int, num_logprobs: int, ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: diff --git a/tests/distributed/test_basic_distributed_correctness_enc_dec.py b/tests/distributed/test_basic_distributed_correctness_enc_dec.py index 69eae62ca7320..9850c823ff5da 100644 --- a/tests/distributed/test_basic_distributed_correctness_enc_dec.py +++ b/tests/distributed/test_basic_distributed_correctness_enc_dec.py @@ -11,9 +11,9 @@ import pytest -from tests.models.utils import DecoderPromptType from vllm.utils import cuda_device_count_stateless +from ..conftest import DecoderPromptType from ..models.utils import check_logprobs_close from ..utils import fork_new_process_for_each_test diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index 9c26b7163ff62..becf1b5b5df9e 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -11,8 +11,7 @@ import pytest - from tests.models.utils import DecoderPromptType - + from ..conftest import DecoderPromptType from .utils import check_logprobs_close MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] diff --git a/tests/models/utils.py b/tests/models/utils.py index d96301b853c85..ff29a0ae81d6e 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -1,5 +1,4 @@ import warnings -from enum import Enum from typing import Dict, List, Optional, Sequence, Tuple, Union from vllm.sequence import SampleLogprobs @@ -136,13 +135,3 @@ def check_logprobs_close( warnings.simplefilter("always") warnings.warn(fail_msg, stacklevel=2) - - -class DecoderPromptType(Enum): - ''' - For encoder/decoder models only - - - ''' - CUSTOM = 1 - NONE = 2 - EMPTY_STR = 3 diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index e8f8a40fbd184..0b08e9691f915 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,7 +1,7 @@ from .data import (EncoderDecoderLLMInputs, ExplicitEncoderDecoderPrompt, LLMInputs, PromptInputs, SingletonPromptInputs, TextPrompt, TokensPrompt, build_explicit_enc_dec_prompt, - to_enc_dec_tuple_list, zip_enc_dec_prompt_lists) + to_enc_dec_tuple_list, zip_enc_dec_prompts) from .registry import InputContext, InputRegistry INPUT_REGISTRY = InputRegistry() @@ -23,7 +23,7 @@ "EncoderDecoderLLMInputs", "build_explicit_enc_dec_prompt", "to_enc_dec_tuple_list", - "zip_enc_dec_prompt_lists", + "zip_enc_dec_prompts", "INPUT_REGISTRY", "InputContext", "InputRegistry", diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 57f3af9d54209..0081d3c0f59bf 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,6 +1,7 @@ -from typing import TYPE_CHECKING, List, Optional, Tuple, TypedDict, Union +from typing import (TYPE_CHECKING, Generic, Iterable, List, Optional, Tuple, + Union) -from typing_extensions import NotRequired +from typing_extensions import NotRequired, TypedDict, TypeVar if TYPE_CHECKING: from vllm.multimodal import MultiModalDataDict @@ -53,8 +54,10 @@ class TokensPrompt(TypedDict): more than one prompt, i.e. :class:`ExplicitEncoderDecoderPrompt` """ +_T = TypeVar("_T", bound=SingletonPromptInputs, default=SingletonPromptInputs) -class ExplicitEncoderDecoderPrompt(TypedDict): + +class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T]): """Represents an encoder/decoder model input prompt, comprising an explicit encoder prompt and a decoder prompt. @@ -73,9 +76,9 @@ class ExplicitEncoderDecoderPrompt(TypedDict): :class:`SingletonPromptInputs` instances. 
""" - encoder_prompt: SingletonPromptInputs + encoder_prompt: _T - decoder_prompt: SingletonPromptInputs + decoder_prompt: Optional[_T] PromptInputs = Union[SingletonPromptInputs, ExplicitEncoderDecoderPrompt] @@ -130,27 +133,30 @@ class EncoderDecoderLLMInputs(LLMInputs): def build_explicit_enc_dec_prompt( - encoder_prompt: SingletonPromptInputs, - decoder_prompt: SingletonPromptInputs, -) -> ExplicitEncoderDecoderPrompt: + encoder_prompt: _T, + decoder_prompt: Optional[_T], +) -> ExplicitEncoderDecoderPrompt[_T]: return ExplicitEncoderDecoderPrompt(encoder_prompt=encoder_prompt, decoder_prompt=decoder_prompt) -def zip_enc_dec_prompt_lists( - enc_prompt_list: List[SingletonPromptInputs], - dec_prompt_list: List[SingletonPromptInputs], -) -> List[ExplicitEncoderDecoderPrompt]: +def zip_enc_dec_prompts( + enc_prompts: Iterable[_T], + dec_prompts: Iterable[Optional[_T]], +) -> List[ExplicitEncoderDecoderPrompt[_T]]: + """ + Zip encoder and decoder prompts together into a list of + :class:`ExplicitEncoderDecoderPrompt` instances. + """ return [ build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt) - for (encoder_prompt, - decoder_prompt) in zip(enc_prompt_list, dec_prompt_list) + for (encoder_prompt, decoder_prompt) in zip(enc_prompts, dec_prompts) ] def to_enc_dec_tuple_list( - enc_dec_prompts: List[ExplicitEncoderDecoderPrompt], -) -> List[Tuple[SingletonPromptInputs, SingletonPromptInputs]]: - return [(enc_dec_prompt['encoder_prompt'], - enc_dec_prompt['decoder_prompt']) + enc_dec_prompts: Iterable[ExplicitEncoderDecoderPrompt[_T]], +) -> List[Tuple[_T, Optional[_T]]]: + return [(enc_dec_prompt["encoder_prompt"], + enc_dec_prompt["decoder_prompt"]) for enc_dec_prompt in enc_dec_prompts] From 8a835cc7914235c24fdf226188397d661f8adeb4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 11:37:33 +0000 Subject: [PATCH 17/30] Properly handle `inputs["decoder_prompt"]=None` --- vllm/engine/async_llm_engine.py | 15 +++++++++++---- vllm/engine/llm_engine.py | 23 +++++++++++++++-------- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 8c3d591a56397..de85953d4e21b 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -355,10 +355,17 @@ async def _process_encoder_decoder_prompt_async( request_id=request_id, ) - decoder_task = self._extract_prompt_components_async( - inputs["decoder_prompt"], - request_id=request_id, - ) + if (decoder_input := inputs["decoder_prompt"]) is None: + + async def dummy_task(): + return None, None, None + + decoder_task = dummy_task() + else: + decoder_task = self._extract_prompt_components_async( + decoder_input, + request_id=request_id, + ) ( (encoder_prompt, encoder_prompt_token_ids, encoder_mm_data), diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7501327ef2712..67f37d6c8a656 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -830,14 +830,21 @@ def _process_encoder_decoder_prompt( request_id=request_id, ) - ( - decoder_prompt, - decoder_prompt_token_ids, - decoder_mm_data, - ) = self._extract_prompt_components( - inputs["decoder_prompt"], - request_id=request_id, - ) + if (decoder_input := inputs["decoder_prompt"]) is None: + ( + decoder_prompt, + decoder_prompt_token_ids, + decoder_mm_data, + ) = None, None, None + else: + ( + decoder_prompt, + decoder_prompt_token_ids, + decoder_mm_data, + ) = self._extract_prompt_components( + decoder_input, + request_id=request_id, + ) else: ( 
encoder_prompt, From e0024c29f4570480dc62407671ae09d6ab0826ac Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 11:47:55 +0000 Subject: [PATCH 18/30] Clean --- vllm/engine/async_llm_engine.py | 42 ++++++++++++++++++++------------- vllm/engine/llm_engine.py | 22 ++++++++--------- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index de85953d4e21b..e6bc9eef41d68 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -355,33 +355,42 @@ async def _process_encoder_decoder_prompt_async( request_id=request_id, ) - if (decoder_input := inputs["decoder_prompt"]) is None: - - async def dummy_task(): - return None, None, None - - decoder_task = dummy_task() + decoder_input = inputs["decoder_prompt"] + if decoder_input is None: + ( + encoder_prompt, + encoder_prompt_ids, + encoder_mm_data, + ) = await encoder_task + + ( + decoder_prompt, + decoder_prompt_ids, + decoder_mm_data, + ) = None, None, None else: decoder_task = self._extract_prompt_components_async( decoder_input, request_id=request_id, ) - ( - (encoder_prompt, encoder_prompt_token_ids, encoder_mm_data), - (decoder_prompt, decoder_prompt_token_ids, decoder_mm_data), - ) = await asyncio.gather(encoder_task, decoder_task) + # NOTE: mypy crashes without the intermediate assignment to + # (a, b) + ( + (encoder_prompt, encoder_prompt_ids, encoder_mm_data), + (decoder_prompt, decoder_prompt_ids, decoder_mm_data), + ) = a, b = await asyncio.gather(encoder_task, decoder_task) else: ( encoder_prompt, - encoder_prompt_token_ids, + encoder_prompt_ids, encoder_mm_data, ) = await self._extract_prompt_components_async( inputs, request_id=request_id, ) - decoder_prompt_token_ids = encoder_prompt_token_ids + decoder_prompt_ids = encoder_prompt_ids decoder_prompt = encoder_prompt decoder_mm_data = encoder_mm_data @@ -389,14 +398,13 @@ async def dummy_task(): raise ValueError("Multi-modal data is not supported for " "(language) encoder-decoder models") - decoder_prompt_token_ids = ( - self._prepare_decoder_input_ids_for_generation( - decoder_prompt_token_ids)) + decoder_prompt_ids = ( + self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids)) return EncoderDecoderLLMInputs( - prompt_token_ids=decoder_prompt_token_ids, + prompt_token_ids=decoder_prompt_ids, prompt=decoder_prompt, - encoder_prompt_token_ids=encoder_prompt_token_ids, + encoder_prompt_token_ids=encoder_prompt_ids, encoder_prompt=encoder_prompt, ) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 67f37d6c8a656..3f3720781d1f9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -823,23 +823,24 @@ def _process_encoder_decoder_prompt( if is_explicit_encoder_decoder_prompt(inputs): ( encoder_prompt, - encoder_prompt_token_ids, + encoder_prompt_ids, encoder_mm_data, ) = self._extract_prompt_components( inputs["encoder_prompt"], request_id=request_id, ) - if (decoder_input := inputs["decoder_prompt"]) is None: + decoder_input = inputs["decoder_prompt"] + if decoder_input is None: ( decoder_prompt, - decoder_prompt_token_ids, + decoder_prompt_ids, decoder_mm_data, ) = None, None, None else: ( decoder_prompt, - decoder_prompt_token_ids, + decoder_prompt_ids, decoder_mm_data, ) = self._extract_prompt_components( decoder_input, @@ -848,14 +849,14 @@ def _process_encoder_decoder_prompt( else: ( encoder_prompt, - encoder_prompt_token_ids, + encoder_prompt_ids, encoder_mm_data, ) = self._extract_prompt_components( inputs, 
request_id=request_id, ) - decoder_prompt_token_ids = encoder_prompt_token_ids + decoder_prompt_ids = encoder_prompt_ids decoder_prompt = encoder_prompt decoder_mm_data = encoder_mm_data @@ -863,14 +864,13 @@ def _process_encoder_decoder_prompt( raise ValueError("Multi-modal data is not supported for " "(language) encoder-decoder models") - decoder_prompt_token_ids = ( - self._prepare_decoder_input_ids_for_generation( - decoder_prompt_token_ids)) + decoder_prompt_ids = ( + self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids)) return EncoderDecoderLLMInputs( - prompt_token_ids=decoder_prompt_token_ids, + prompt_token_ids=decoder_prompt_ids, prompt=decoder_prompt, - encoder_prompt_token_ids=encoder_prompt_token_ids, + encoder_prompt_token_ids=encoder_prompt_ids, encoder_prompt=encoder_prompt, ) From 76af1724f5f18aa4f3a31fb7c212b9158567163e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 11:55:28 +0000 Subject: [PATCH 19/30] Clean --- vllm/engine/async_llm_engine.py | 46 ++++++++++++--------------------- vllm/engine/llm_engine.py | 44 +++++++++++++------------------ 2 files changed, 34 insertions(+), 56 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index e6bc9eef41d68..5b9d49f513a19 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -13,7 +13,8 @@ from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout -from vllm.engine.llm_engine import LLMEngine +from vllm.engine.llm_engine import (DecoderPromptComponents, LLMEngine, + PromptComponents) from vllm.engine.metrics import StatLoggerBase from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.ray_utils import initialize_ray_cluster, ray @@ -22,7 +23,6 @@ from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.multimodal import MultiModalDataDict from vllm.outputs import EmbeddingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest @@ -314,7 +314,7 @@ async def _extract_prompt_components_async( inputs: SingletonPromptInputs, request_id: str, lora_request: Optional[LoRARequest] = None, - ) -> Tuple[Optional[str], List[int], Optional[MultiModalDataDict]]: + ) -> PromptComponents: """Async version of :meth:`_extract_prompt_components`.""" if isinstance(inputs, str): prompt = inputs @@ -349,50 +349,36 @@ async def _process_encoder_decoder_prompt_async( request_id: str, ) -> EncoderDecoderLLMInputs: """Async version of :meth:`_process_encoder_decoder_prompt`.""" + encoder_comps: PromptComponents + decoder_comps: DecoderPromptComponents + if is_explicit_encoder_decoder_prompt(inputs): encoder_task = self._extract_prompt_components_async( inputs["encoder_prompt"], request_id=request_id, ) - decoder_input = inputs["decoder_prompt"] - if decoder_input is None: - ( - encoder_prompt, - encoder_prompt_ids, - encoder_mm_data, - ) = await encoder_task - - ( - decoder_prompt, - decoder_prompt_ids, - decoder_mm_data, - ) = None, None, None + if (decoder_input := inputs["decoder_prompt"]) is None: + encoder_comps = await encoder_task + decoder_comps = None, None, None else: decoder_task = self._extract_prompt_components_async( decoder_input, request_id=request_id, ) - # NOTE: mypy crashes without the intermediate assignment to - # (a, b) - ( - 
(encoder_prompt, encoder_prompt_ids, encoder_mm_data), - (decoder_prompt, decoder_prompt_ids, decoder_mm_data), - ) = a, b = await asyncio.gather(encoder_task, decoder_task) + encoder_comps, decoder_comps = await asyncio.gather( + encoder_task, decoder_task) else: - ( - encoder_prompt, - encoder_prompt_ids, - encoder_mm_data, - ) = await self._extract_prompt_components_async( + encoder_comps = await self._extract_prompt_components_async( inputs, request_id=request_id, ) - decoder_prompt_ids = encoder_prompt_ids - decoder_prompt = encoder_prompt - decoder_mm_data = encoder_mm_data + decoder_comps = encoder_comps + + encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps + decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps if encoder_mm_data is not None or decoder_mm_data is not None: raise ValueError("Multi-modal data is not supported for " diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3f3720781d1f9..66a870d99a836 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -70,6 +70,11 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: _O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput) +PromptComponents = Tuple[Optional[str], List[int], + Optional[MultiModalDataDict]] +DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]], + Optional[MultiModalDataDict]] + class LLMEngine: """An LLM engine that receives requests and generates texts. @@ -690,7 +695,7 @@ def _extract_prompt_components( inputs: SingletonPromptInputs, request_id: str, lora_request: Optional[LoRARequest] = None, - ) -> Tuple[Optional[str], List[int], Optional[MultiModalDataDict]]: + ) -> PromptComponents: ''' Extract the components of any single encoder or decoder input prompt. 
@@ -820,45 +825,32 @@ def _process_encoder_decoder_prompt( * :class:`EncoderDecoderLLMInputs` instance ''' + encoder_comps: PromptComponents + decoder_comps: DecoderPromptComponents + if is_explicit_encoder_decoder_prompt(inputs): - ( - encoder_prompt, - encoder_prompt_ids, - encoder_mm_data, - ) = self._extract_prompt_components( + encoder_comps = self._extract_prompt_components( inputs["encoder_prompt"], request_id=request_id, ) - decoder_input = inputs["decoder_prompt"] - if decoder_input is None: - ( - decoder_prompt, - decoder_prompt_ids, - decoder_mm_data, - ) = None, None, None + if (decoder_input := inputs["decoder_prompt"]) is None: + decoder_comps = None, None, None else: - ( - decoder_prompt, - decoder_prompt_ids, - decoder_mm_data, - ) = self._extract_prompt_components( + decoder_comps = self._extract_prompt_components( decoder_input, request_id=request_id, ) else: - ( - encoder_prompt, - encoder_prompt_ids, - encoder_mm_data, - ) = self._extract_prompt_components( + encoder_comps = self._extract_prompt_components( inputs, request_id=request_id, ) - decoder_prompt_ids = encoder_prompt_ids - decoder_prompt = encoder_prompt - decoder_mm_data = encoder_mm_data + decoder_comps = encoder_comps + + encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps + decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps if encoder_mm_data is not None or decoder_mm_data is not None: raise ValueError("Multi-modal data is not supported for " From 5c16f2e90f4c93c782676f4580eca2ec5f7c3c3b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 12:00:41 +0000 Subject: [PATCH 20/30] Fix incorrect decoder inputs in singleton case --- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/llm_engine.py | 2 +- vllm/inputs/data.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 5b9d49f513a19..973721c0f928f 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -375,7 +375,7 @@ async def _process_encoder_decoder_prompt_async( request_id=request_id, ) - decoder_comps = encoder_comps + decoder_comps = None, None, None encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 66a870d99a836..9b2cd3b5430db 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -847,7 +847,7 @@ def _process_encoder_decoder_prompt( request_id=request_id, ) - decoder_comps = encoder_comps + decoder_comps = None, None, None encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 0081d3c0f59bf..d7883a7a60fcd 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -69,7 +69,7 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T]): Only the encoder prompt may have multi-modal data. 
- Note that an ExplicitEncoderDecoderPrompt may not + Note that an :class:`ExplicitEncoderDecoderPrompt` may not be used as an input to a decoder-only model, and that the `encoder_prompt` and `decoder_prompt` fields of this data structure themselves must be From e239ba9deefd32697251eaa0efc51c6e07a67d16 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 12:05:24 +0000 Subject: [PATCH 21/30] Clean --- vllm/engine/llm_engine.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9b2cd3b5430db..dec326210070d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -533,7 +533,7 @@ def _get_eos_token_id(self, return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id - def _get_decoder_start_token_id(self, ) -> Optional[int]: + def _get_decoder_start_token_id(self) -> Optional[int]: ''' Obtain the decoder start token id employed by an encoder/decoder model. Returns None for non-encoder/decoder models or if the @@ -648,8 +648,7 @@ def _prepare_decoder_input_ids_for_generation( * Processed token list """ - decoder_start_token_id: Optional[int] = ( - self._get_decoder_start_token_id()) + decoder_start_token_id = self._get_decoder_start_token_id() assert decoder_start_token_id is not None if decoder_input_ids is None: From 4b0e3dff5ab0d4975fe8facbec51fd3ecd59ed69 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 12:12:43 +0000 Subject: [PATCH 22/30] Move functions to a more appropriate place --- vllm/config.py | 10 ++++++++++ vllm/engine/llm_engine.py | 7 +++---- vllm/utils.py | 20 -------------------- vllm/worker/worker.py | 6 ++---- 4 files changed, 15 insertions(+), 28 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index ec6d587e7925b..d912f17a0aa33 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -457,6 +457,16 @@ def _get_num_seqlen_agnostic_layers( if t != "attention" ]) + @property + def is_encoder_decoder_model(self) -> bool: + """Extract the HF encoder/decoder model flag.""" + return getattr(self.hf_config, "is_encoder_decoder", False) + + @property + def is_embedding_model(self) -> bool: + """Extract the embedding model flag.""" + return self.embedding_mode + class CacheConfig: """Configuration for the KV cache. 
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index dec326210070d..6edc002457fe9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -47,8 +47,7 @@ AnyTokenizer, BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) -from vllm.utils import (Counter, is_embedding_model_config, - is_encoder_decoder_model_config) +from vllm.utils import Counter from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -1563,7 +1562,7 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: seq_span.set_attribute(SpanAttributes.LLM_LATENCY_E2E, e2e_time) def is_encoder_decoder_model(self): - return is_encoder_decoder_model_config(self.model_config) + return self.model_config.is_encoder_decoder_model def is_embedding_model(self): - return is_embedding_model_config(self.model_config) + return self.model_config.is_embedding_model diff --git a/vllm/utils.py b/vllm/utils.py index fcfdfe85ed145..782b13920e915 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1142,23 +1142,3 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, """Utility function to run async task in a lock""" async with lock: return await task(*args, **kwargs) - - -def is_encoder_decoder_model_config(model_config) -> bool: - ''' - Extract the HF encoder/decoder model flag from the ModelConfig instance. - Return False if model_config is None. - ''' - return model_config is not None and \ - getattr(model_config.hf_config, - "is_encoder_decoder", - False) - - -def is_embedding_model_config(model_config) -> bool: - ''' - Extract the embedding model flag from the ModelConfig instance. - Return False if model_config is None. - ''' - return model_config is not None and \ - model_config.embedding_mode diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index ad6f6750ff980..45751eceacbca 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,8 +19,6 @@ from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.utils import (is_embedding_model_config, - is_encoder_decoder_model_config) from vllm.worker.cache_engine import CacheEngine from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner @@ -113,10 +111,10 @@ def __init__( self.gpu_cache: Optional[List[List[torch.Tensor]]] = None def _is_encoder_decoder_model(self): - return is_encoder_decoder_model_config(self.model_config) + return self.model_config.is_encoder_decoder_model def _is_embedding_model(self): - return is_embedding_model_config(self.model_config) + return self.model_config.is_embedding_model def init_device(self) -> None: if self.device_config.device.type == "cuda": From 53f7f50d717e2da4783a063fe96951a437a504e6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 12:19:52 +0000 Subject: [PATCH 23/30] Remove outdated comment --- vllm/inputs/parse.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index b55f6003d575d..b5e8ef7860598 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -18,7 +18,6 @@ class ParsedTokens(TypedDict): is_tokens: Literal[True] -# https://github.com/vllm-project/vllm/pull/4028 @overload def parse_and_batch_prompt( prompt: Union[str, List[str]]) -> Sequence[ParsedText]: From 3afdbc548cafdaf0bea2ba72a011ecdda693035f 
Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 12:55:18 +0000 Subject: [PATCH 24/30] Fix mismatch between hf and vllm output text --- tests/models/test_bart.py | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/tests/models/test_bart.py b/tests/models/test_bart.py index becf1b5b5df9e..9bca5a86f1241 100644 --- a/tests/models/test_bart.py +++ b/tests/models/test_bart.py @@ -2,6 +2,8 @@ Run `pytest tests/models/test_bart.py`. """ +from typing import List, Optional, Tuple + from vllm.utils import is_cpu if not is_cpu(): @@ -11,21 +13,31 @@ import pytest + from vllm.sequence import SampleLogprobs + from ..conftest import DecoderPromptType from .utils import check_logprobs_close MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] - DECODER_PROMPT_TYPES = ([ - DecoderPromptType.CUSTOM, DecoderPromptType.EMPTY_STR, - DecoderPromptType.NONE - ]) + def vllm_to_hf_output( + vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], + decoder_prompt_type: DecoderPromptType, + ): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "" + if decoder_prompt_type == DecoderPromptType.NONE: + hf_output_str = "" + hf_output_str + + return output_ids, hf_output_str, out_logprobs @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float", "bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) - @pytest.mark.parametrize("decoder_prompt_type", DECODER_PROMPT_TYPES) + @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) def test_models( hf_runner, vllm_runner, @@ -145,8 +157,13 @@ def test_models( hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE else 0) - check_logprobs_close(outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens) + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) From c61b01f0f6f2c501e25a3af2e2a38702892964cc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 13:10:11 +0000 Subject: [PATCH 25/30] Factor out duplicate code --- vllm/engine/async_llm_engine.py | 17 +------------- vllm/engine/llm_engine.py | 39 +++++++++++++++++++-------------- 2 files changed, 24 insertions(+), 32 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 973721c0f928f..ecf75a27bb11b 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -377,22 +377,7 @@ async def _process_encoder_decoder_prompt_async( decoder_comps = None, None, None - encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps - decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps - - if encoder_mm_data is not None or decoder_mm_data is not None: - raise ValueError("Multi-modal data is not supported for " - "(language) encoder-decoder models") - - decoder_prompt_ids = ( - self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids)) - - return EncoderDecoderLLMInputs( - prompt_token_ids=decoder_prompt_ids, - prompt=decoder_prompt, - encoder_prompt_token_ids=encoder_prompt_ids, - encoder_prompt=encoder_prompt, - ) + return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps) 
async def _process_decoder_only_prompt_async( self, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6edc002457fe9..c9261be5a4d42 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -785,6 +785,28 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]: assert bos_token_id is not None return [bos_token_id] + def _build_enc_dec_llm_inputs( + self, + encoder_comps: PromptComponents, + decoder_comps: DecoderPromptComponents, + ) -> EncoderDecoderLLMInputs: + encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps + decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps + + if encoder_mm_data is not None or decoder_mm_data is not None: + raise ValueError("Multi-modal data is not supported for " + "(language) encoder-decoder models") + + decoder_prompt_ids = ( + self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids)) + + return EncoderDecoderLLMInputs( + prompt_token_ids=decoder_prompt_ids, + prompt=decoder_prompt, + encoder_prompt_token_ids=encoder_prompt_ids, + encoder_prompt=encoder_prompt, + ) + def _process_encoder_decoder_prompt( self, inputs: PromptInputs, @@ -847,22 +869,7 @@ def _process_encoder_decoder_prompt( decoder_comps = None, None, None - encoder_prompt, encoder_prompt_ids, encoder_mm_data = encoder_comps - decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps - - if encoder_mm_data is not None or decoder_mm_data is not None: - raise ValueError("Multi-modal data is not supported for " - "(language) encoder-decoder models") - - decoder_prompt_ids = ( - self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids)) - - return EncoderDecoderLLMInputs( - prompt_token_ids=decoder_prompt_ids, - prompt=decoder_prompt, - encoder_prompt_token_ids=encoder_prompt_ids, - encoder_prompt=encoder_prompt, - ) + return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps) def _process_decoder_only_prompt( self, From f8ed373f506abe057c070d61a3a2b60910e91c77 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 13:13:46 +0000 Subject: [PATCH 26/30] Factor out more duplicate code --- vllm/engine/async_llm_engine.py | 16 +++++----------- vllm/engine/llm_engine.py | 30 +++++++++++++++++++----------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index ecf75a27bb11b..af606292c35b2 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -387,22 +387,16 @@ async def _process_decoder_only_prompt_async( prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> LLMInputs: """Async version of :meth:`_process_decoder_only_prompt`.""" - ( - prompt, - prompt_token_ids, - multi_modal_data, - ) = await self._extract_prompt_components_async( + prompt_comps = await self._extract_prompt_components_async( inputs, request_id=request_id, lora_request=lora_request, ) - prompt_token_ids = self._apply_prompt_adapter( - prompt_token_ids, prompt_adapter_request=prompt_adapter_request) - - return LLMInputs(prompt_token_ids=prompt_token_ids, - prompt=prompt, - multi_modal_data=multi_modal_data) + return self._build_decoder_only_llm_inputs( + prompt_comps, + prompt_adapter_request=prompt_adapter_request, + ) async def process_model_inputs_async( self, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c9261be5a4d42..70917333efa34 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -871,6 +871,20 @@ def _process_encoder_decoder_prompt( 
return self._build_enc_dec_llm_inputs(encoder_comps, decoder_comps) + def _build_decoder_only_llm_inputs( + self, + prompt_comps: PromptComponents, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> LLMInputs: + prompt, prompt_token_ids, multi_modal_data = prompt_comps + + prompt_token_ids = self._apply_prompt_adapter( + prompt_token_ids, prompt_adapter_request=prompt_adapter_request) + + return LLMInputs(prompt_token_ids=prompt_token_ids, + prompt=prompt, + multi_modal_data=multi_modal_data) + def _process_decoder_only_prompt( self, inputs: SingletonPromptInputs, @@ -894,22 +908,16 @@ def _process_decoder_only_prompt( * :class:`LLMInputs` instance ''' - ( - prompt, - prompt_token_ids, - multi_modal_data, - ) = self._extract_prompt_components( + prompt_comps = self._extract_prompt_components( inputs, request_id=request_id, lora_request=lora_request, ) - prompt_token_ids = self._apply_prompt_adapter( - prompt_token_ids, prompt_adapter_request=prompt_adapter_request) - - return LLMInputs(prompt_token_ids=prompt_token_ids, - prompt=prompt, - multi_modal_data=multi_modal_data) + return self._build_decoder_only_llm_inputs( + prompt_comps, + prompt_adapter_request=prompt_adapter_request, + ) def process_model_inputs( self, From a4df70ab9715ca40fed135f606df48cdb29270b2 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 13:16:00 +0000 Subject: [PATCH 27/30] Remove default values to avoid accidentally miss those arguments --- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/llm_engine.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index af606292c35b2..21643852b029e 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -299,7 +299,7 @@ async def _tokenize_prompt_async( self, prompt: str, request_id: str, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[LoRARequest], ) -> List[int]: """Async version of :meth:`_tokenize_prompt`.""" tokenizer = self.get_tokenizer_group("prompts must be None if " diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 70917333efa34..1bf7e220e713e 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -625,7 +625,7 @@ def stop_remote_worker_execution_loop(self) -> None: def _prepare_decoder_input_ids_for_generation( self, - decoder_input_ids: Optional[List[int]] = None, + decoder_input_ids: Optional[List[int]], ) -> List[int]: """ Prepares `decoder_input_ids` for generation with encoder-decoder models. @@ -665,7 +665,7 @@ def _tokenize_prompt( self, prompt: str, request_id: str, - lora_request: Optional[LoRARequest] = None, + lora_request: Optional[LoRARequest], ) -> List[int]: ''' Wrapper around application of the model's tokenizer. 
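Dropping the `= None` defaults on these internal helpers is a deliberate
trade-off: every caller must now pass `lora_request` (and, in the next hunk,
`prompt_adapter_request`) explicitly, so a call site that forgets to thread
the argument through fails loudly instead of silently falling back to None.
A self-contained illustration of the pattern (not vLLM code):

from typing import Optional

def tokenize(prompt: str, *, lora_request: Optional[object]) -> list:
    # No default value: callers must spell out lora_request, even when it is
    # None, so a forgotten argument surfaces as an error.
    return prompt.split()

tokenize("a b c", lora_request=None)  # OK
# tokenize("a b c")  # TypeError: missing required keyword-only argument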
@@ -740,7 +740,7 @@ def _extract_prompt_components( def _apply_prompt_adapter( self, prompt_token_ids: List[int], - prompt_adapter_request: Optional[PromptAdapterRequest] = None, + prompt_adapter_request: Optional[PromptAdapterRequest], ) -> List[int]: if prompt_adapter_request: prompt_token_ids = ( @@ -874,7 +874,7 @@ def _process_encoder_decoder_prompt( def _build_decoder_only_llm_inputs( self, prompt_comps: PromptComponents, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, + prompt_adapter_request: Optional[PromptAdapterRequest], ) -> LLMInputs: prompt, prompt_token_ids, multi_modal_data = prompt_comps From 5240bb335abffc3ce65c1e1b96e2eeebf0544fa6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 14:55:43 +0000 Subject: [PATCH 28/30] Add test for serving encoder/decoder model with OpenAI server --- .../openai/test_encoder_decoder.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 tests/entrypoints/openai/test_encoder_decoder.py diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py new file mode 100644 index 0000000000000..85f1c6f18bf36 --- /dev/null +++ b/tests/entrypoints/openai/test_encoder_decoder.py @@ -0,0 +1,50 @@ +import openai +import pytest + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "facebook/bart-base" + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--dtype", + "bfloat16", + "--enforce-eager", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="module") +def client(server): + return server.get_async_client() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): + completion = await client.completions.create(model=model_name, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + + choice = completion.choices[0] + assert len(choice.text) >= 5 + assert choice.finish_reason == "length" + assert completion.usage == openai.types.CompletionUsage( + completion_tokens=5, prompt_tokens=2, total_tokens=7) + + # test using token IDs + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + assert len(completion.choices[0].text) >= 1 From d321c82ee490048ba40e8be749c0fb42b91ee6a8 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 7 Aug 2024 15:47:37 +0000 Subject: [PATCH 29/30] Use two type variables --- tests/conftest.py | 6 ++--- vllm/entrypoints/chat_utils.py | 5 ++-- vllm/inputs/data.py | 44 +++++++++++++++++++++++----------- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5163b5c186e72..d565da5a1019c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -450,7 +450,7 @@ def generate_greedy_logprobs_limit( def generate_encoder_decoder_greedy_logprobs_limit( self, - encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str]], + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int, num_logprobs: int, **kwargs: Any, @@ -614,7 +614,7 @@ def generate_w_logprobs( def generate_encoder_decoder_w_logprobs( self, - encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str]], + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], sampling_params: 
SamplingParams, ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ''' @@ -659,7 +659,7 @@ def generate_greedy_logprobs( def generate_encoder_decoder_greedy_logprobs( self, - encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str]], + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int, num_logprobs: int, ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 12634c3261856..1197c70d88ae3 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -2,8 +2,7 @@ from dataclasses import dataclass from functools import lru_cache from pathlib import Path -from typing import (Any, Awaitable, Iterable, List, Optional, Tuple, Union, - cast, final) +from typing import Any, Awaitable, Iterable, List, Optional, Tuple, Union, cast # yapf conflicts with isort for this block # yapf: disable @@ -59,7 +58,7 @@ class CustomChatCompletionMessageParam(TypedDict, total=False): CustomChatCompletionMessageParam] -@final # So that it should be compatible with Dict[str, str] +# TODO: Make fields ReadOnly once mypy supports it class ConversationMessage(TypedDict): role: str content: str diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index d7883a7a60fcd..75ab0c770155b 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -54,10 +54,18 @@ class TokensPrompt(TypedDict): more than one prompt, i.e. :class:`ExplicitEncoderDecoderPrompt` """ -_T = TypeVar("_T", bound=SingletonPromptInputs, default=SingletonPromptInputs) - - -class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T]): +_T1_co = TypeVar("_T1_co", + bound=SingletonPromptInputs, + default=SingletonPromptInputs, + covariant=True) +_T2_co = TypeVar("_T2_co", + bound=SingletonPromptInputs, + default=SingletonPromptInputs, + covariant=True) + + +# TODO: Make fields ReadOnly once mypy supports it +class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): """Represents an encoder/decoder model input prompt, comprising an explicit encoder prompt and a decoder prompt. @@ -76,9 +84,9 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T]): :class:`SingletonPromptInputs` instances. """ - encoder_prompt: _T + encoder_prompt: _T1_co - decoder_prompt: Optional[_T] + decoder_prompt: Optional[_T2_co] PromptInputs = Union[SingletonPromptInputs, ExplicitEncoderDecoderPrompt] @@ -132,18 +140,26 @@ class EncoderDecoderLLMInputs(LLMInputs): """ +_T1 = TypeVar("_T1", + bound=SingletonPromptInputs, + default=SingletonPromptInputs) +_T2 = TypeVar("_T2", + bound=SingletonPromptInputs, + default=SingletonPromptInputs) + + def build_explicit_enc_dec_prompt( - encoder_prompt: _T, - decoder_prompt: Optional[_T], -) -> ExplicitEncoderDecoderPrompt[_T]: + encoder_prompt: _T1, + decoder_prompt: Optional[_T2], +) -> ExplicitEncoderDecoderPrompt[_T1, _T2]: return ExplicitEncoderDecoderPrompt(encoder_prompt=encoder_prompt, decoder_prompt=decoder_prompt) def zip_enc_dec_prompts( - enc_prompts: Iterable[_T], - dec_prompts: Iterable[Optional[_T]], -) -> List[ExplicitEncoderDecoderPrompt[_T]]: + enc_prompts: Iterable[_T1], + dec_prompts: Iterable[Optional[_T2]], +) -> List[ExplicitEncoderDecoderPrompt[_T1, _T2]]: """ Zip encoder and decoder prompts together into a list of :class:`ExplicitEncoderDecoderPrompt` instances. 
@@ -155,8 +171,8 @@ def zip_enc_dec_prompts( def to_enc_dec_tuple_list( - enc_dec_prompts: Iterable[ExplicitEncoderDecoderPrompt[_T]], -) -> List[Tuple[_T, Optional[_T]]]: + enc_dec_prompts: Iterable[ExplicitEncoderDecoderPrompt[_T1, _T2]], +) -> List[Tuple[_T1, Optional[_T2]]]: return [(enc_dec_prompt["encoder_prompt"], enc_dec_prompt["decoder_prompt"]) for enc_dec_prompt in enc_dec_prompts] From e4c5c21e492c2d66b90afcc54472185d0de2c97c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 8 Aug 2024 02:23:10 +0000 Subject: [PATCH 30/30] Update error message --- vllm/engine/llm_engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1bf7e220e713e..dcaf375f9b15d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -794,8 +794,8 @@ def _build_enc_dec_llm_inputs( decoder_prompt, decoder_prompt_ids, decoder_mm_data = decoder_comps if encoder_mm_data is not None or decoder_mm_data is not None: - raise ValueError("Multi-modal data is not supported for " - "(language) encoder-decoder models") + raise ValueError("Multi-modal encoder-decoder models are " + "not supported yet") decoder_prompt_ids = ( self._prepare_decoder_input_ids_for_generation(decoder_prompt_ids))
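
For context on PATCH 29/30, the two type variables let the encoder and decoder members of ExplicitEncoderDecoderPrompt carry different prompt types, e.g. a text encoder prompt paired with a pre-tokenized decoder prompt. A minimal usage sketch follows, assuming TextPrompt exposes a `prompt` field and TokensPrompt a `prompt_token_ids` field, that the helpers stay importable from vllm.inputs.data as in the diffs above, and with purely illustrative prompt values:

    from vllm.inputs.data import (TextPrompt, TokensPrompt,
                                  build_explicit_enc_dec_prompt,
                                  to_enc_dec_tuple_list, zip_enc_dec_prompts)

    # Encoder prompts as raw text, decoder prompts as pre-tokenized IDs
    # (None lets the engine fall back to a default decoder prompt).
    enc_prompts = [TextPrompt(prompt="Hello"), TextPrompt(prompt="World")]
    dec_prompts = [TokensPrompt(prompt_token_ids=[2, 0]), None]

    # Elements are ExplicitEncoderDecoderPrompt[TextPrompt, TokensPrompt],
    # a mixed pairing that a single type variable could not express.
    enc_dec_prompts = zip_enc_dec_prompts(enc_prompts, dec_prompts)

    # Single-pair construction goes through the same helper.
    single = build_explicit_enc_dec_prompt(
        TextPrompt(prompt="Hello"),
        TokensPrompt(prompt_token_ids=[2, 0]),
    )

    # Round-trip back into (encoder, decoder) tuples, as the helpers in
    # tests/conftest.py do.
    pairs = to_enc_dec_tuple_list(enc_dec_prompts)
    print(pairs[0])

Splitting the former _T into _T1/_T2 (with covariant _T1_co/_T2_co on the TypedDict itself) is what allows such mixed pairs to type-check instead of widening both members to SingletonPromptInputs.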