diff --git a/examples/openai-client.py b/examples/openai-client.py
index 774b5cb..9d7c093 100644
--- a/examples/openai-client.py
+++ b/examples/openai-client.py
@@ -1,7 +1,7 @@
 from openai import OpenAI
 
-model = "llama"
-# model = "smolm2-135m"
+# model = "llama"
+model = "smolm2-135m"
 # model = "olmo-7b"
 
 uri = "http://localhost:8000/v1/"
@@ -28,7 +28,10 @@
 stream = client.chat.completions.create(
     model=model,
     messages=messages,
-    max_tokens=500,
+    max_completion_tokens=200,
+    stop=["4.", "sushi"],
+    top_p=0.3,
+    # temperature=2.0,
     stream=True
 )
 
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 9b8dce4..3b3a3c8 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -17,7 +17,7 @@
 from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body
 from fastapi.middleware import Middleware
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.security import HTTPBearer
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from sse_starlette.sse import EventSourceResponse
 from starlette_context.plugins import RequestIdPlugin  # type: ignore
 from starlette_context.middleware import RawContextMiddleware
@@ -119,9 +119,8 @@ def create_app(
             server_settings = ServerSettings.model_validate(config_file_settings)
             model_settings = config_file_settings.models
 
+    # TODO: remove settings argument altogether.
     if server_settings is None and model_settings is None:
-        if settings is None:
-            settings = Settings()
         server_settings = ServerSettings.model_validate(settings)
         model_settings = [ModelSettings.model_validate(settings)]
 
@@ -133,7 +132,7 @@ def create_app(
     middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))]
     app = FastAPI(
         middleware=middleware,
-        title="🦙 llama.cpp Python API",
+        title="NekkoAPI",
         version=llama_cpp.__version__,
         root_path=server_settings.root_path,
     )
@@ -191,8 +190,8 @@ def _logit_bias_tokens_to_input_ids(
 ) -> Dict[str, float]:
     to_bias: Dict[str, float] = {}
     for token, score in logit_bias.items():
-        token = token.encode("utf-8")
-        for input_id in llama.tokenize(token, add_bos=False, special=True):
+        token_encoded = token.encode("utf-8")
+        for input_id in llama.tokenize(token_encoded, add_bos=False, special=True):
             to_bias[str(input_id)] = score
     return to_bias
 
@@ -203,7 +202,7 @@ def _logit_bias_tokens_to_input_ids(
 
 async def authenticate(
     settings: Settings = Depends(get_server_settings),
-    authorization: Optional[str] = Depends(bearer_scheme),
+    authorization: Optional[HTTPAuthorizationCredentials] = Depends(bearer_scheme),
 ):
     # Skip API key check if it's not set in settings
     if settings.api_key is None:
@@ -291,7 +290,6 @@ async def create_completion(
         "best_of",
         "logit_bias_type",
         "user",
-        "min_tokens",
     }
     kwargs = body.model_dump(exclude=exclude)
 
@@ -305,15 +303,6 @@ async def create_completion(
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
 
-    if body.min_tokens > 0:
-        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
-            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
-        )
-        if "logits_processor" not in kwargs:
-            kwargs["logits_processor"] = _min_tokens_logits_processor
-        else:
-            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
-
     try:
         iterator_or_completion: Union[
             llama_cpp.CreateCompletionResponse,
@@ -334,7 +323,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
             yield from iterator_or_completion
             exit_stack.close()
 
-        send_chan, recv_chan = anyio.create_memory_object_stream(10)
+        send_chan, recv_chan = anyio.create_memory_object_stream(10)  # type: ignore
         return EventSourceResponse(
             recv_chan,
             data_sender_callable=partial(  # type: ignore
@@ -345,7 +334,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
                 on_complete=exit_stack.close,
             ),
             sep="\n",
-            ping_message_factory=_ping_message_factory,
+            ping_message_factory=_ping_message_factory,  # type: ignore
         )
     else:
         exit_stack.close()
@@ -490,8 +479,10 @@ async def create_chat_completion(
         "n",
         "logit_bias_type",
         "user",
-        "min_tokens",
+        "max_completion_tokens",
     }
+    # TODO: use whitelisting and only include permitted fields.
+    # TODO: only leave OpenAI API compatible fields.
     kwargs = body.model_dump(exclude=exclude)
     llama = llama_proxy(body.model)
     if body.logit_bias is not None:
@@ -504,14 +495,9 @@ async def create_chat_completion(
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
 
-    if body.min_tokens > 0:
-        _min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
-            [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
-        )
-        if "logits_processor" not in kwargs:
-            kwargs["logits_processor"] = _min_tokens_logits_processor
-        else:
-            kwargs["logits_processor"].extend(_min_tokens_logits_processor)
+    # Override max_tokens with max_completion_tokens.
+    if body.max_completion_tokens is not None:
+        kwargs["max_tokens"] = body.max_completion_tokens
 
     try:
         iterator_or_completion: Union[
@@ -532,7 +518,7 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
             yield from iterator_or_completion
             exit_stack.close()
 
-        send_chan, recv_chan = anyio.create_memory_object_stream(10)
+        send_chan, recv_chan = anyio.create_memory_object_stream(10)  # type: ignore
         return EventSourceResponse(
             recv_chan,
             data_sender_callable=partial(  # type: ignore
@@ -543,7 +529,7 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
                 on_complete=exit_stack.close,
             ),
             sep="\n",
-            ping_message_factory=_ping_message_factory,
+            ping_message_factory=_ping_message_factory,  # type: ignore
         )
     else:
         exit_stack.close()
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index c6716f9..6ccc46b 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -24,21 +24,24 @@ def __init__(self, models: List[ModelSettings]) -> None:
         self._current_model: Optional[llama_cpp.Llama] = None
         self._current_model_alias: Optional[str] = None
 
+        # TODO: there should be no such thing as default model.
         self._default_model_settings: ModelSettings = models[0]
         self._default_model_alias: str = self._default_model_settings.model_alias  # type: ignore
 
+        # TODO: when _default_model is removed, what do we set as
+        # current model and do we load it?
         # Load default model
         self._current_model = self.load_llama_from_model_settings(
             self._default_model_settings
         )
         self._current_model_alias = self._default_model_alias
 
-    def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama:
+    def __call__(self, model: str = None) -> llama_cpp.Llama:
         if model is None:
             model = self._default_model_alias
 
         if model not in self._model_settings_dict:
-            model = self._default_model_alias
+            raise ValueError(f"Model {model} not found.")
 
         if model == self._current_model_alias:
             if self._current_model is not None:
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 13c9512..405a3ea 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -103,6 +103,7 @@ class ModelSettings(BaseSettings):
     offload_kqv: bool = Field(
         default=True, description="Whether to offload kqv to the GPU."
     )
+    # TODO: default this to True?
     flash_attn: bool = Field(
         default=False, description="Whether to use flash attention."
     )
diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py
index fdd1644..527a42f 100644
--- a/llama_cpp/server/types.py
+++ b/llama_cpp/server/types.py
@@ -16,24 +16,20 @@
     default=16, ge=1, description="The maximum number of tokens to generate."
 )
 
-min_tokens_field = Field(
-    default=0,
-    ge=0,
-    description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).",
-)
-
 temperature_field = Field(
-    default=0.8,
+    default=1.0,
+    ge=0.0,
+    le=2.0,
     description="Adjust the randomness of the generated text.\n\n"
-    + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
+    + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 1, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. We recommend altering this or top_p, but not both.",
 )
 
 top_p_field = Field(
-    default=0.95,
+    default=1.0,
     ge=0.0,
     le=1.0,
     description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n"
-    + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.",
+    + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. We recommend altering this or temperature, but not both.",
 )
 
 min_p_field = Field(
@@ -117,7 +113,6 @@ class CreateCompletionRequest(BaseModel):
     max_tokens: Optional[int] = Field(
         default=16, ge=0, description="The maximum number of tokens to generate."
     )
-    min_tokens: int = min_tokens_field
     temperature: float = temperature_field
     top_p: float = top_p_field
     min_p: float = min_p_field
@@ -212,8 +207,13 @@ class CreateChatCompletionRequest(BaseModel):
     max_tokens: Optional[int] = Field(
         default=None,
         description="The maximum number of tokens to generate. Defaults to inf",
+        deprecated="Deprecated in favor of max_completion_tokens",
+    )
+    max_completion_tokens: Optional[int] = Field(
+        gt=0,
+        default=None,
+        description="An upper bound for the number of tokens that can be generated for a completion. Defaults to inf",
     )
-    min_tokens: int = min_tokens_field
     logprobs: Optional[bool] = Field(
         default=False,
         description="Whether to output the logprobs or not. Default is True",
@@ -236,8 +236,9 @@ class CreateChatCompletionRequest(BaseModel):
         default=None,
     )
 
+    model: str = model_field
+
     # ignored or currently unsupported
-    model: Optional[str] = model_field
     n: Optional[int] = 1
     user: Optional[str] = Field(None)
 
diff --git a/pyproject.toml b/pyproject.toml
index 4c36ace..fe9e875 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "nekko_api"
 dynamic = ["version"]
 description = "OpenAI API compatible llama.cpp server"
 readme = "README.md"
-license = { text = "MIT" }
+license = { text = "Apache License 2.0" }
 authors = [
 ]
 dependencies = [
@@ -48,6 +48,7 @@ test = [
     "huggingface-hub>=0.23.0"
 ]
 dev = [
+    "mypy>=1.13.0",
     "black>=23.3.0",
     "twine>=4.0.2",
     "mkdocs>=1.4.3",