From 36c7a4c44262d2daa37bf28652042a73e8af4344 Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Wed, 18 Dec 2024 08:32:32 -0800 Subject: [PATCH 01/12] add documentation --- docs/docs/contribution.md | 6 +- docs/docs/index.md | 108 ----- libs/embed_package/embed/_infer.py | 2 +- libs/infinity_emb/infinity_emb/cli.py | 399 ++++++++++++++++++ .../infinity_emb/inference/batch_handler.py | 11 +- .../infinity_emb/inference/caching_layer.py | 5 +- .../infinity_emb/infinity_server.py | 377 +---------------- .../tests/unit_test/test_infinity_server.py | 7 +- 8 files changed, 418 insertions(+), 497 deletions(-) create mode 100644 libs/infinity_emb/infinity_emb/cli.py diff --git a/docs/docs/contribution.md b/docs/docs/contribution.md index acdf4fe9..c7145f05 100644 --- a/docs/docs/contribution.md +++ b/docs/docs/contribution.md @@ -10,18 +10,20 @@ cd libs/infinity_emb poetry install --extras all --with test ``` -To ensure your contributions pass the Continuous Integration (CI) checks: +To ensure your contributions pass the Continuous Integration (CI) checks. The `libs/infinity_emb/Makefile` is a useful entrypoint for this. ```bash cd libs/infinity_emb make format make lint poetry run pytest ./tests ``` -As an alternative, you can also use the following command: + +As an alternative, you can also use the following command, which bundles a range of the above. ```bash cd libs/infinity_emb make precommit ``` ## CLA +Infinity is developed as open source project. All contributions must be made in a way to be compatible with the MIT License of this repo. \ No newline at end of file diff --git a/docs/docs/index.md b/docs/docs/index.md index 1e2a0bb3..e69de29b 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -1,108 +0,0 @@ -# [Infinity](https://github.com/michaelfeil/infinity) - -Infinity is a high-throughput, low-latency REST API for serving vector embeddings, supporting all sentence-transformer models and frameworks. Infinity is developed under [MIT License](https://github.com/michaelfeil/infinity/blob/main/LICENSE). Infinity powers inference behind [Gradient.ai](https://gradient.ai) and other Embedding API providers. - -## Why Infinity - -Infinity provides the following features: - -* **Deploy any model from MTEB**: deploy the model you know from [SentenceTransformers](https://github.com/UKPLab/sentence-transformers/) -* **Fast inference backends**: The inference server is built on top of [torch](https://github.com/pytorch/pytorch), [optimum(onnx/tensorrt)](https://huggingface.co/docs/optimum/index) and [CTranslate2](https://github.com/OpenNMT/CTranslate2), using FlashAttention to get the most out of **CUDA**, **ROCM**, **CPU** or **MPS** device. -* **Dynamic batching**: New embedding requests are queued while GPU is busy with the previous ones. New requests are squeezed intro your device as soon as ready. Similar max throughput on GPU as text-embeddings-inference. -* **Correct and tested implementation**: Unit and end-to-end tested. Embeddings via infinity are identical to [SentenceTransformers](https://github.com/UKPLab/sentence-transformers/) (up to numerical precision). Lets API users create embeddings till infinity and beyond. -* **Easy to use**: The API is built on top of [FastAPI](https://fastapi.tiangolo.com/), [Swagger](https://swagger.io/) makes it fully documented. API are aligned to [OpenAI's Embedding specs](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings). See below on how to get started. 
- -## Getting started - -Install `infinity_emb` via pip -```bash -pip install infinity-emb[all] -``` - -
- Install from source with Poetry - - Advanced: - To install via Poetry use Poetry 1.8.4, Python 3.11 on Ubuntu 22.04 - ```bash - git clone https://github.com/michaelfeil/infinity - cd infinity - cd libs/infinity_emb - poetry install --extras all - ``` -
- -### Launch the CLI using a pre-built docker container (recommended) - -```bash -port=7997 -model1=michaelfeil/bge-small-en-v1.5 -model2=mixedbread-ai/mxbai-rerank-xsmall-v1 -volume=$PWD/data - -docker run -it --gpus all \ - -v $volume:/app/.cache \ - -p $port:$port \ - michaelf34/infinity:latest \ - v2 \ - --model-id $model1 \ - --model-id $model2 \ - --port $port -``` -The cache path inside the docker container is set by the environment variable `HF_HOME`. - -### or launch the cli after the pip install -After your pip install, with your venv activate, you can run the CLI directly. -Check the `--help` command to get a description for all parameters. - -```bash -infinity_emb --help -``` - -## Launch FAQ -
- What are embedding models? - Embedding models can map any text to a low-dimensional dense vector which can be used for tasks like retrieval, classification, clustering, or semantic search. - And it also can be used in vector databases for LLMs. - - - The most know architecture are encoder-only transformers such as BERT, and most popular implementation include [SentenceTransformers](https://github.com/UKPLab/sentence-transformers/). -
- -
- What models are supported? - - All models of the sentence transformers org are supported https://huggingface.co/sentence-transformers / sbert.net. - LLM's like LLAMA2-7B are not intended for deployment. - - - With the command `--engine torch` the model must be compatible with https://github.com/UKPLab/sentence-transformers/. - - only models from Huggingface are supported. - - - With the command `--engine ctranslate2` - - only `BERT` models are supported. - - only models from Huggingface are supported. - - - For the latest trends, you might want to check out one of the following models. - https://huggingface.co/spaces/mteb/leaderboard - -
- - -
- Using Langchain with Infinity - Now available under # Python Integrations in the side panel. - ``` -
- - -
- Question not answered here? - - There is a Discussion section on the Github of Infinity: - https://github.com/michaelfeil/infinity/discussions - -
- diff --git a/libs/embed_package/embed/_infer.py b/libs/embed_package/embed/_infer.py index 93e185c8..ffee6422 100644 --- a/libs/embed_package/embed/_infer.py +++ b/libs/embed_package/embed/_infer.py @@ -2,7 +2,7 @@ from typing import Collection, Literal, Union from infinity_emb import EngineArgs, SyncEngineArray # type: ignore -from infinity_emb.infinity_server import AutoPadding +from infinity_emb.cli import AutoPadding __all__ = ["BatchedInference"] diff --git a/libs/infinity_emb/infinity_emb/cli.py b/libs/infinity_emb/infinity_emb/cli.py new file mode 100644 index 00000000..2ed61749 --- /dev/null +++ b/libs/infinity_emb/infinity_emb/cli.py @@ -0,0 +1,399 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2023-now michaelfeil + +import asyncio +import re +import sys + + +import infinity_emb +from infinity_emb._optional_imports import CHECK_TYPER, CHECK_UVICORN +from infinity_emb.args import EngineArgs +from infinity_emb.env import MANAGER +from infinity_emb.log_handler import UVICORN_LOG_LEVELS, logger +from infinity_emb.primitives import ( + Device, + DeviceID, + Dtype, + EmbeddingDtype, + InferenceEngine, + PoolingMethod, +) +from infinity_emb.infinity_server import create_server + +# CLI +if CHECK_TYPER.is_available: + CHECK_TYPER.mark_required() + CHECK_UVICORN.mark_required() + import typer + import uvicorn + + # path the asncio scheduler with uvloop + # which has theoretical speed-ups vs asyncio + loopname = "auto" + if sys.version_info < (3, 12): + try: + import uvloop + + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + loopname = "uvloop" + except ImportError: + # Windows does not support uvloop + pass + + tp = typer.Typer() + + @tp.command("v1") + def v1( + # v1 is deprecated. Please do no longer modify it. + model_name_or_path: str = MANAGER.model_id[0], + served_model_name: str = MANAGER.served_model_name[0], + batch_size: int = MANAGER.batch_size[0], + revision: str = MANAGER.revision[0], + trust_remote_code: bool = MANAGER.trust_remote_code[0], + redirect_slash: str = MANAGER.redirect_slash, + engine: "InferenceEngine" = MANAGER.engine[0], # type: ignore # noqa + model_warmup: bool = MANAGER.model_warmup[0], + vector_disk_cache: bool = MANAGER.vector_disk_cache[0], + device: "Device" = MANAGER.device[0], # type: ignore + lengths_via_tokenize: bool = MANAGER.lengths_via_tokenize[0], + dtype: Dtype = MANAGER.dtype[0], # type: ignore + embedding_dtype: "EmbeddingDtype" = EmbeddingDtype.default_value(), # type: ignore + pooling_method: "PoolingMethod" = MANAGER.pooling_method[0], # type: ignore + compile: bool = MANAGER.compile[0], + bettertransformer: bool = MANAGER.bettertransformer[0], + preload_only: bool = MANAGER.preload_only, + permissive_cors: bool = MANAGER.permissive_cors, + api_key: str = MANAGER.api_key, + url_prefix: str = MANAGER.url_prefix, + host: str = MANAGER.host, + port: int = MANAGER.port, + log_level: "UVICORN_LOG_LEVELS" = MANAGER.log_level, # type: ignore + ): + """Infinity API ♾️ cli v1 - deprecated, consider use cli v2 via `infinity_emb v2`.""" + if api_key: + # encourage switch to v2 + raise ValueError("api_key is not supported in `v1`. Please migrate to `v2`.") + if not ( + embedding_dtype == EmbeddingDtype.float32 + or embedding_dtype == EmbeddingDtype.default_value() + ): + # encourage switch to v2 + raise ValueError( + "selecting embedding_dtype is not supported in `v1`. Please migrate to `v2`." + ) + logger.warning( + "CLI v1 is deprecated. Consider use CLI `v2`, by specifying `v2` as the command." 
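+            # v1 forwards to v2 below, with embedding_dtype pinned to float32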
+ ) + v2( + model_id=[model_name_or_path], + served_model_name=[served_model_name], # type: ignore + batch_size=[batch_size], + revision=[revision], # type: ignore + trust_remote_code=[trust_remote_code], + engine=[engine], + dtype=[dtype], + pooling_method=[pooling_method], + device=[device], + model_warmup=[model_warmup], + vector_disk_cache=[vector_disk_cache], + lengths_via_tokenize=[lengths_via_tokenize], + compile=[compile], + bettertransformer=[bettertransformer], + embedding_dtype=[EmbeddingDtype.float32], # set to float32 + # unique kwargs + preload_only=preload_only, + url_prefix=url_prefix, + host=host, + port=port, + redirect_slash=redirect_slash, + log_level=log_level, + permissive_cors=permissive_cors, + api_key=api_key, + proxy_root_path="", # set as empty string + ) + + @tp.command("v2") + def v2( + # t + # arguments for engine + model_id: list[str] = typer.Option( + **_construct("model_id"), + help="Huggingface model repo id. Subset of possible models: https://huggingface.co/models?other=text-embeddings-inference&", + ), + served_model_name: list[str] = typer.Option( + **_construct("served_model_name"), + help="the nickname for the API, under which the model_id can be selected", + ), + batch_size: list[int] = typer.Option( + **_construct("batch_size"), help="maximum batch size for inference" + ), + revision: list[str] = typer.Option( + **_construct("revision"), help="huggingface model repo revision." + ), + trust_remote_code: list[bool] = typer.Option( + **_construct("trust_remote_code"), + help="if potential remote modeling code from huggingface repo is trusted.", + ), + engine: list["InferenceEngine"] = typer.Option( + **_construct("engine"), + help="Which backend to use. `torch` uses Pytorch GPU/CPU, optimum uses ONNX on GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses torch+ctranslate2 on CPU/GPU.", + ), + model_warmup: list[bool] = typer.Option( + **_construct("model_warmup"), + help="if model should be warmed up after startup, and before ready.", + ), + vector_disk_cache: list[bool] = typer.Option( + **_construct("vector_disk_cache"), + help="If hash(request)/results should be cached to SQLite for latency improvement.", + ), + device: list[Device] = typer.Option( + **_construct("device"), + help="device to use for computing the model forward pass.", + ), + device_id: list[str] = typer.Option( + **_construct("device_id"), + help="device id defines the model placement. e.g. `0,1` will place the model on MPS/CUDA/GPU 0 and 1 each", + ), + lengths_via_tokenize: list[bool] = typer.Option( + **_construct("lengths_via_tokenize"), + help="if True, returned tokens is based on actual tokenizer count. If false, uses len(input) as proxy.", + ), + dtype: list[Dtype] = typer.Option( + **_construct("dtype"), help="dtype for the model weights." + ), + embedding_dtype: list["EmbeddingDtype"] = typer.Option( + **_construct("embedding_dtype"), + help="dtype post-forward pass. If != `float32`, using Post-Forward Static quantization.", + ), + pooling_method: list["PoolingMethod"] = typer.Option( + **_construct("pooling_method"), + help="overwrite the pooling method if inferred incorrectly.", + ), + compile: list[bool] = typer.Option( + **_construct("compile"), + help="Enable usage of `torch.compile(dynamic=True)` if engine relies on it.", + ), + bettertransformer: list[bool] = typer.Option( + **_construct("bettertransformer"), + help="Enables varlen flash-attention-2 via the `BetterTransformer` implementation. 
If available for this model.", + ), + # arguments for uvicorn / server + preload_only: bool = typer.Option( + **_construct("preload_only"), + help="If true, only downloads models and verifies setup, then exit. Recommended for pre-caching the download in a Dockerfile.", + ), + host: str = typer.Option(**_construct("host"), help="host for the FastAPI uvicorn server"), + port: int = typer.Option(**_construct("port"), help="port for the FastAPI uvicorn server"), + url_prefix: str = typer.Option( + **_construct("url_prefix"), + callback=validate_url, + help="prefix for all routes of the FastAPI uvicorn server. Useful if you run behind a proxy / cascaded API.", + ), + redirect_slash: str = typer.Option( + **_construct("redirect_slash"), help="where to redirect `/` requests to." + ), + log_level: "UVICORN_LOG_LEVELS" = typer.Option( + **_construct("log_level"), help="console log level." + ), # type: ignore + permissive_cors: bool = typer.Option( + **_construct("permissive_cors"), help="whether to allow permissive cors." + ), + api_key: str = typer.Option( + **_construct("api_key"), help="api_key used for authentication headers." + ), + proxy_root_path: str = typer.Option( + **_construct("proxy_root_path"), + help="Proxy prefix for the application. See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/", + ), + ): + """Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil \n + \n + Multiple Model CLI Playbook: \n + - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4` \n + - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" \n + - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both models have batch-size 8. \n + """ + # old + """ + model_id, list[str]: Huggingface model, e.g. + ["michaelfeil/bge-small-en-v1.5", "mixedbread-ai/mxbai-embed-large-v1"] + Defaults to `INFINITY_MODEL_ID` + served_model_name, list[str]: "", e.g. ["bge-small-en-v1.5"] + batch_size, list[int]: batch size for forward pass. + revision: list[str]: revision of the model. + trust_remote_code, list[bool]: trust remote code. + url_prefix, str: prefix for api. typically "". + host, str: host-url, typically either "0.0.0.0" or "127.0.0.1". + port, int: port that you want to expose. + redirect_slash, str: redirect to of GET "/". Defaults to "/docs". Empty string to disable. + log_level: logging level. + For high performance, use "info" or higher levels. Defaults to "info". + engine, str: framework that should perform inference. + model_warmup, bool: perform model warmup before starting the server. + Defaults to True. + vector_disk_cache, bool: cache past embeddings in SQL. + Defaults to False or env-INFINITY_CACHE_VECTORS if set + device, Device: device to use for inference. Defaults to Device.auto or "auto" + lengths_via_tokenize: bool: schedule by token usage. Defaults to False. + dtype, Dtype: data type to use for inference. Defaults to Dtype.auto or "auto" + embedding_dtype, EmbeddingDtype: data type to use for embeddings. Defaults to EmbeddingDtype.float32 or "float32" + pooling_method, PoolingMethod: pooling method to use. Defaults to PoolingMethod.auto or "auto" + compile, bool: compile model for faster inference. Defaults to False. + use_bettertransformer, bool: use bettertransformer. Defaults to True. + preload_only, bool: only preload the model and exit. Defaults to False. 
+ permissive_cors, bool: add permissive CORS headers to enable consumption from a browser. Defaults to False. + api_key, str: optional Bearer token for authentication. Defaults to "", which disables authentication. + proxy_root_path, str: optional Proxy prefix for the application. See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/ + """ + logger.setLevel(log_level.to_int()) + device_id_typed = [DeviceID(d) for d in typer_option_resolve(device_id)] + padder = AutoPadding( + length=len(model_id), + model_name_or_path=model_id, + batch_size=batch_size, + revision=revision, + trust_remote_code=trust_remote_code, + engine=engine, + model_warmup=model_warmup, + vector_disk_cache_path=vector_disk_cache, + device=device, + device_id=device_id_typed, + lengths_via_tokenize=lengths_via_tokenize, + dtype=dtype, + embedding_dtype=embedding_dtype, + pooling_method=pooling_method, + compile=compile, + bettertransformer=bettertransformer, + served_model_name=served_model_name, + ) + + engine_args = [] + for kwargs in padder: + engine_args.append(EngineArgs(**kwargs)) + + ( + url_prefix, + host, + port, + redirect_slash, + log_level, + preload_only, + permissive_cors, + api_key, + proxy_root_path, + ) = typer_option_resolve( + url_prefix, + host, + port, + redirect_slash, + log_level, + preload_only, + permissive_cors, + api_key, + proxy_root_path, + ) + + app = create_server( + engine_args_list=engine_args, + url_prefix=url_prefix, + doc_extra=dict(host=host, port=port), + redirect_slash=redirect_slash, + preload_only=preload_only, + permissive_cors=permissive_cors, + api_key=api_key, + proxy_root_path=proxy_root_path, + ) + + uvicorn.run( + app, + host=host, + port=port, + log_level=log_level.name, + http="httptools", + loop=loopname, # type: ignore + ) + + def cli(): + CHECK_TYPER.mark_required() + if len(sys.argv) == 1 or sys.argv[1] not in [ + "v1", + "v2", + "help", + "--help", + "--show-completion", + "--install-completion", + ]: + logger.critical( + "Error: No command given. Please use infinity with the `v2` command. " + f"This is deprecated since 0.0.32. You are on {infinity_emb.__version__}" + "Usage: `infinity_emb v2 --model-id BAAI/bge-large-en-v1.5" + ) + tp() + +# helper functions for the CLI + + +def _construct(name: str): + """constructs the default entry and type hint for the variable name""" + return dict( + # gets the default value from the ENV Manager + default=getattr(MANAGER, name), + # envvar is a dummy that is there for documentation purposes. 
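+        # the env value is resolved via the MANAGER default above; typer shows this only in --help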
+ envvar=f"`{MANAGER.to_name(name)}`", + ) + + +def validate_url(path: str): + """ + This regex matches: + - An empty string or A single '/' + - A string that starts with '/' and does not end with '/' + """ + if re.match(r"^$|^/$|^/.*[^/]$", path): + return path + raise typer.BadParameter("Path must start with '/' and must not end with '/'") + + +class AutoPadding: + """itertools.cycle with custom behaviour to pad to max length""" + + def __init__(self, length: int, **kwargs): + self.length = length + self.kwargs = kwargs + + def _resolve(self, x, iteration: int): + """pad x to length of self.length""" + x = typer_option_resolve(x) + if not isinstance(x, (list, tuple)): + return x + elif len(x) == 1: + return x[0] + elif len(x) == self.length: + return x[iteration] + else: + raise ValueError(f"Expected length {self.length} but got {len(x)}") + + def __iter__(self): + """iterate over kwargs and pad them to length of self.length""" + for iteration in range(self.length): + kwargs = {} + for key, value in self.kwargs.items(): + kwargs[key] = self._resolve(value, iteration) + yield kwargs + + +def typer_option_resolve(*args): + """returns the value or the default value""" + if len(args) == 1: + return ( + args[0].default # if it is a typer option + if hasattr(args[0], "default") and hasattr(args[0], "envvar") + else args[0] # if it is a normal value + ) + return (a.default if (hasattr(a, "default") and hasattr(a, "envvar")) else a for a in args) + + +if __name__ == "__main__": + if "cli" in locals(): + cli() diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py index 7ed4b83d..99e27b86 100644 --- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py +++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py @@ -361,13 +361,12 @@ async def _get_prios_usage(self, items: Sequence[AbstractSingle]) -> tuple[list[ def _publish_towards_model( self, - # shutdown: ShutdownReadOnly, - # queue_prio: "CustomFIFOQueue", - # publish_to_model_queue: Queue, - # max_batch_size: int, - # verbose: bool ): - """background thread for reading exits only if shutdown.is_set()""" + """worker that moves batches from the priority_queue towards the model. + Runs in a separate thread, returns when self._shutdown.is_set(). + """ + # max_n_batches: how many batches are set for switching to `max-throughput` mode + # in thoughput mode, read the last n-batches max_n_batches = 8 try: while not self._shutdown.is_set(): diff --git a/libs/infinity_emb/infinity_emb/inference/caching_layer.py b/libs/infinity_emb/infinity_emb/inference/caching_layer.py index 0d48814f..9a6aafa9 100644 --- a/libs/infinity_emb/infinity_emb/inference/caching_layer.py +++ b/libs/infinity_emb/infinity_emb/inference/caching_layer.py @@ -22,7 +22,10 @@ class Cache: - """wrapper around DiskCache""" + """wrapper around DiskCache. The Diskcache in infinity `races` against the model inference. 
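+    Whichever of the two finishes first supplies the result.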
+ + The concept is that with the code be + """ def __init__(self, cache_name: str, shutdown: threading.Event) -> None: """ diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py index 0ac1c835..2cd29e1e 100644 --- a/libs/infinity_emb/infinity_emb/infinity_server.py +++ b/libs/infinity_emb/infinity_emb/infinity_server.py @@ -3,9 +3,7 @@ import asyncio import os -import re import signal -import sys import time import threading import uuid @@ -13,24 +11,17 @@ from typing import Any, Optional, Union, TYPE_CHECKING import infinity_emb -from infinity_emb._optional_imports import CHECK_TYPER, CHECK_UVICORN from infinity_emb.args import EngineArgs from infinity_emb.engine import AsyncEmbeddingEngine, AsyncEngineArray from infinity_emb.env import MANAGER from infinity_emb.fastapi_schemas import docs, errors -from infinity_emb.log_handler import UVICORN_LOG_LEVELS, logger +from infinity_emb.log_handler import logger from infinity_emb.primitives import ( AudioCorruption, - Device, - DeviceID, - Dtype, - EmbeddingDtype, ImageCorruption, - InferenceEngine, Modality, ModelCapabilites, ModelNotDeployedError, - PoolingMethod, ) from infinity_emb.telemetry import PostHog, StartupTelemetry, telemetry_log_info @@ -615,369 +606,3 @@ async def _embeddings_audio(data: AudioEmbeddingInput): ) return app - - -class AutoPadding: - """itertools.cycle with custom behaviour""" - - def __init__(self, length: int, **kwargs): - self.length = length - self.kwargs = kwargs - - def _resolve(self, x, iteration: int): - """pad x to length of self.length""" - x = typer_option_resolve(x) - if not isinstance(x, (list, tuple)): - return x - elif len(x) == 1: - return x[0] - elif len(x) == self.length: - return x[iteration] - else: - raise ValueError(f"Expected length {self.length} but got {len(x)}") - - def __iter__(self): - """iterate over kwargs and pad them to length of self.length""" - for iteration in range(self.length): - kwargs = {} - for key, value in self.kwargs.items(): - kwargs[key] = self._resolve(value, iteration) - yield kwargs - - -def typer_option_resolve(*args): - """returns the value or the default value""" - if len(args) == 1: - return ( - args[0].default # if it is a typer option - if hasattr(args[0], "default") and hasattr(args[0], "envvar") - else args[0] # if it is a normal value - ) - return (a.default if (hasattr(a, "default") and hasattr(a, "envvar")) else a for a in args) - - -# CLI -if CHECK_TYPER.is_available: - CHECK_TYPER.mark_required() - CHECK_UVICORN.mark_required() - import typer - import uvicorn - - loopname = "auto" - if sys.version_info < (3, 12): - try: - import uvloop - - asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) - loopname = "uvloop" - except ImportError: - # Windows does not support uvloop - pass - - tp = typer.Typer() - - @tp.command("v1") - def v1( - # v1 is deprecated. Please do no longer modify it. 
- model_name_or_path: str = MANAGER.model_id[0], - served_model_name: str = MANAGER.served_model_name[0], - batch_size: int = MANAGER.batch_size[0], - revision: str = MANAGER.revision[0], - trust_remote_code: bool = MANAGER.trust_remote_code[0], - redirect_slash: str = MANAGER.redirect_slash, - engine: InferenceEngine = MANAGER.engine[0], # type: ignore # noqa - model_warmup: bool = MANAGER.model_warmup[0], - vector_disk_cache: bool = MANAGER.vector_disk_cache[0], - device: Device = MANAGER.device[0], # type: ignore - lengths_via_tokenize: bool = MANAGER.lengths_via_tokenize[0], - dtype: Dtype = MANAGER.dtype[0], # type: ignore - embedding_dtype: EmbeddingDtype = EmbeddingDtype.default_value(), # type: ignore - pooling_method: PoolingMethod = MANAGER.pooling_method[0], # type: ignore - compile: bool = MANAGER.compile[0], - bettertransformer: bool = MANAGER.bettertransformer[0], - preload_only: bool = MANAGER.preload_only, - permissive_cors: bool = MANAGER.permissive_cors, - api_key: str = MANAGER.api_key, - url_prefix: str = MANAGER.url_prefix, - host: str = MANAGER.host, - port: int = MANAGER.port, - log_level: UVICORN_LOG_LEVELS = MANAGER.log_level, # type: ignore - ): - """Infinity API ♾️ cli v1 - deprecated, consider use cli v2 via `infinity_emb v2`.""" - if api_key: - raise ValueError("api_key is not supported in `v1`. Please migrate to `v2`.") - if not ( - embedding_dtype == EmbeddingDtype.float32 - or embedding_dtype == EmbeddingDtype.default_value() - ): - raise ValueError( - "selecting embedding_dtype is not supported in `v1`. Please migrate to `v2`." - ) - logger.warning( - "CLI v1 is deprecated. Consider use CLI `v2`, by specifying `v2` as the command." - ) - time.sleep(1) - v2( - model_id=[model_name_or_path], - served_model_name=[served_model_name], # type: ignore - batch_size=[batch_size], - revision=[revision], # type: ignore - trust_remote_code=[trust_remote_code], - engine=[engine], - dtype=[dtype], - pooling_method=[pooling_method], - device=[device], - model_warmup=[model_warmup], - vector_disk_cache=[vector_disk_cache], - lengths_via_tokenize=[lengths_via_tokenize], - compile=[compile], - bettertransformer=[bettertransformer], - embedding_dtype=[EmbeddingDtype.float32], # set to float32 - # unique kwargs - preload_only=preload_only, - url_prefix=url_prefix, - host=host, - port=port, - redirect_slash=redirect_slash, - log_level=log_level, - permissive_cors=permissive_cors, - api_key=api_key, - proxy_root_path="", # set as empty string - ) - - def _construct(name: str): - """constructs the default entry and type hint for the variable name""" - return dict( - # gets the default value from the ENV Manager - default=getattr(MANAGER, name), - # envvar is a dummy that is there for documentation purposes. - envvar=f"`{MANAGER.to_name(name)}`", - ) - - def validate_url(path: str): - """ - This regex matches: - - An empty string or A single '/' - - A string that starts with '/' and does not end with '/' - """ - if re.match(r"^$|^/$|^/.*[^/]$", path): - return path - raise typer.BadParameter("Path must start with '/' and must not end with '/'") - - @tp.command("v2") - def v2( - # t - # arguments for engine - model_id: list[str] = typer.Option( - **_construct("model_id"), - help="Huggingface model repo id. 
Subset of possible models: https://huggingface.co/models?other=text-embeddings-inference&", - ), - served_model_name: list[str] = typer.Option( - **_construct("served_model_name"), - help="the nickname for the API, under which the model_id can be selected", - ), - batch_size: list[int] = typer.Option( - **_construct("batch_size"), help="maximum batch size for inference" - ), - revision: list[str] = typer.Option( - **_construct("revision"), help="huggingface model repo revision." - ), - trust_remote_code: list[bool] = typer.Option( - **_construct("trust_remote_code"), - help="if potential remote modeling code from huggingface repo is trusted.", - ), - engine: list[InferenceEngine] = typer.Option( - **_construct("engine"), - help="Which backend to use. `torch` uses Pytorch GPU/CPU, optimum uses ONNX on GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses torch+ctranslate2 on CPU/GPU.", - ), - model_warmup: list[bool] = typer.Option( - **_construct("model_warmup"), - help="if model should be warmed up after startup, and before ready.", - ), - vector_disk_cache: list[bool] = typer.Option( - **_construct("vector_disk_cache"), - help="If hash(request)/results should be cached to SQLite for latency improvement.", - ), - device: list[Device] = typer.Option( - **_construct("device"), - help="device to use for computing the model forward pass.", - ), - device_id: list[str] = typer.Option( - **_construct("device_id"), - help="device id defines the model placement. e.g. `0,1` will place the model on MPS/CUDA/GPU 0 and 1 each", - ), - lengths_via_tokenize: list[bool] = typer.Option( - **_construct("lengths_via_tokenize"), - help="if True, returned tokens is based on actual tokenizer count. If false, uses len(input) as proxy.", - ), - dtype: list[Dtype] = typer.Option( - **_construct("dtype"), help="dtype for the model weights." - ), - embedding_dtype: list[EmbeddingDtype] = typer.Option( - **_construct("embedding_dtype"), - help="dtype post-forward pass. If != `float32`, using Post-Forward Static quantization.", - ), - pooling_method: list[PoolingMethod] = typer.Option( - **_construct("pooling_method"), - help="overwrite the pooling method if inferred incorrectly.", - ), - compile: list[bool] = typer.Option( - **_construct("compile"), - help="Enable usage of `torch.compile(dynamic=True)` if engine relies on it.", - ), - bettertransformer: list[bool] = typer.Option( - **_construct("bettertransformer"), - help="Enables varlen flash-attention-2 via the `BetterTransformer` implementation. If available for this model.", - ), - # arguments for uvicorn / server - preload_only: bool = typer.Option( - **_construct("preload_only"), - help="If true, only downloads models and verifies setup, then exit. Recommended for pre-caching the download in a Dockerfile.", - ), - host: str = typer.Option(**_construct("host"), help="host for the FastAPI uvicorn server"), - port: int = typer.Option(**_construct("port"), help="port for the FastAPI uvicorn server"), - url_prefix: str = typer.Option( - **_construct("url_prefix"), - callback=validate_url, - help="prefix for all routes of the FastAPI uvicorn server. Useful if you run behind a proxy / cascaded API.", - ), - redirect_slash: str = typer.Option( - **_construct("redirect_slash"), help="where to redirect `/` requests to." - ), - log_level: UVICORN_LOG_LEVELS = typer.Option( - **_construct("log_level"), help="console log level." - ), # type: ignore - permissive_cors: bool = typer.Option( - **_construct("permissive_cors"), help="whether to allow permissive cors." 
- ), - api_key: str = typer.Option( - **_construct("api_key"), help="api_key used for authentication headers." - ), - proxy_root_path: str = typer.Option( - **_construct("proxy_root_path"), - help="Proxy prefix for the application. See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/", - ), - ): - """Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil \n - \n - Multiple Model CLI Playbook: \n - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4` \n - - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" \n - - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both models have batch-size 8. \n - """ - # old - """ - model_id, list[str]: Huggingface model, e.g. - ["michaelfeil/bge-small-en-v1.5", "mixedbread-ai/mxbai-embed-large-v1"] - Defaults to `INFINITY_MODEL_ID` - served_model_name, list[str]: "", e.g. ["bge-small-en-v1.5"] - batch_size, list[int]: batch size for forward pass. - revision: list[str]: revision of the model. - trust_remote_code, list[bool]: trust remote code. - url_prefix, str: prefix for api. typically "". - host, str: host-url, typically either "0.0.0.0" or "127.0.0.1". - port, int: port that you want to expose. - redirect_slash, str: redirect to of GET "/". Defaults to "/docs". Empty string to disable. - log_level: logging level. - For high performance, use "info" or higher levels. Defaults to "info". - engine, str: framework that should perform inference. - model_warmup, bool: perform model warmup before starting the server. - Defaults to True. - vector_disk_cache, bool: cache past embeddings in SQL. - Defaults to False or env-INFINITY_CACHE_VECTORS if set - device, Device: device to use for inference. Defaults to Device.auto or "auto" - lengths_via_tokenize: bool: schedule by token usage. Defaults to False. - dtype, Dtype: data type to use for inference. Defaults to Dtype.auto or "auto" - embedding_dtype, EmbeddingDtype: data type to use for embeddings. Defaults to EmbeddingDtype.float32 or "float32" - pooling_method, PoolingMethod: pooling method to use. Defaults to PoolingMethod.auto or "auto" - compile, bool: compile model for faster inference. Defaults to False. - use_bettertransformer, bool: use bettertransformer. Defaults to True. - preload_only, bool: only preload the model and exit. Defaults to False. - permissive_cors, bool: add permissive CORS headers to enable consumption from a browser. Defaults to False. - api_key, str: optional Bearer token for authentication. Defaults to "", which disables authentication. - proxy_root_path, str: optional Proxy prefix for the application. 
See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/ - """ - logger.setLevel(log_level.to_int()) - device_id_typed = [DeviceID(d) for d in typer_option_resolve(device_id)] - padder = AutoPadding( - length=len(model_id), - model_name_or_path=model_id, - batch_size=batch_size, - revision=revision, - trust_remote_code=trust_remote_code, - engine=engine, - model_warmup=model_warmup, - vector_disk_cache_path=vector_disk_cache, - device=device, - device_id=device_id_typed, - lengths_via_tokenize=lengths_via_tokenize, - dtype=dtype, - embedding_dtype=embedding_dtype, - pooling_method=pooling_method, - compile=compile, - bettertransformer=bettertransformer, - served_model_name=served_model_name, - ) - - engine_args = [] - for kwargs in padder: - engine_args.append(EngineArgs(**kwargs)) - - ( - url_prefix, - host, - port, - redirect_slash, - log_level, - preload_only, - permissive_cors, - api_key, - proxy_root_path, - ) = typer_option_resolve( - url_prefix, - host, - port, - redirect_slash, - log_level, - preload_only, - permissive_cors, - api_key, - proxy_root_path, - ) - - app = create_server( - engine_args_list=engine_args, - url_prefix=url_prefix, - doc_extra=dict(host=host, port=port), - redirect_slash=redirect_slash, - preload_only=preload_only, - permissive_cors=permissive_cors, - api_key=api_key, - proxy_root_path=proxy_root_path, - ) - - uvicorn.run( - app, - host=host, - port=port, - log_level=log_level.name, - http="httptools", - loop=loopname, # type: ignore - ) - - def cli(): - CHECK_TYPER.mark_required() - if len(sys.argv) == 1 or sys.argv[1] not in ["v1", "v2", "help", "--help"]: - for _ in range(3): - logger.error( - "Error: No command given. Defaulting to `v1`. " - "Relying on this side effect is considered an error and " - "will be deprecated in the future, which requires explicit usage of a `infinity_emb v1` or `infinity_emb v2`. " - "Specify the version of the CLI you want to use. " - ) - time.sleep(1) - sys.argv.insert(1, "v1") - tp() - - if __name__ == "__main__": - cli() diff --git a/libs/infinity_emb/tests/unit_test/test_infinity_server.py b/libs/infinity_emb/tests/unit_test/test_infinity_server.py index d0bcb6a3..00191130 100644 --- a/libs/infinity_emb/tests/unit_test/test_infinity_server.py +++ b/libs/infinity_emb/tests/unit_test/test_infinity_server.py @@ -7,14 +7,15 @@ from infinity_emb.args import EngineArgs from infinity_emb.infinity_server import ( + create_server, +) + +from infinity_emb.cli import ( UVICORN_LOG_LEVELS, Device, Dtype, InferenceEngine, PoolingMethod, - create_server, - v1, - v2, ) From 013dad91837202cc3ddc044f6144b289a154756a Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Tue, 31 Dec 2024 10:13:09 +0100 Subject: [PATCH 02/12] update files: for cli --- docs/docs/contribution.md | 4 +- libs/infinity_emb/infinity_emb/cli.py | 126 +++++++++--------- .../infinity_emb/inference/batch_handler.py | 2 +- .../infinity_emb/inference/caching_layer.py | 2 +- .../tests/unit_test/test_infinity_server.py | 2 + 5 files changed, 71 insertions(+), 65 deletions(-) diff --git a/docs/docs/contribution.md b/docs/docs/contribution.md index c7145f05..ad862132 100644 --- a/docs/docs/contribution.md +++ b/docs/docs/contribution.md @@ -10,11 +10,13 @@ cd libs/infinity_emb poetry install --extras all --with test ``` -To ensure your contributions pass the Continuous Integration (CI) checks. The `libs/infinity_emb/Makefile` is a useful entrypoint for this. 
+To ensure your contributions pass the Continuous Integration (CI), there are some useful local actions. +The `libs/infinity_emb/Makefile` is a useful entrypoint for this. ```bash cd libs/infinity_emb make format make lint +make template-docker poetry run pytest ./tests ``` diff --git a/libs/infinity_emb/infinity_emb/cli.py b/libs/infinity_emb/infinity_emb/cli.py index 2ed61749..dd8a6d8c 100644 --- a/libs/infinity_emb/infinity_emb/cli.py +++ b/libs/infinity_emb/infinity_emb/cli.py @@ -21,6 +21,70 @@ ) from infinity_emb.infinity_server import create_server + +# helper functions for the CLI + + +def validate_url(path: str): + """ + This regex matches: + - An empty string or A single '/' + - A string that starts with '/' and does not end with '/' + """ + if re.match(r"^$|^/$|^/.*[^/]$", path): + return path + raise typer.BadParameter("Path must start with '/' and must not end with '/'") + + +class AutoPadding: + """itertools.cycle with custom behaviour to pad to max length""" + + def __init__(self, length: int, **kwargs): + self.length = length + self.kwargs = kwargs + + def _resolve(self, x, iteration: int): + """pad x to length of self.length""" + x = typer_option_resolve(x) + if not isinstance(x, (list, tuple)): + return x + elif len(x) == 1: + return x[0] + elif len(x) == self.length: + return x[iteration] + else: + raise ValueError(f"Expected length {self.length} but got {len(x)}") + + def __iter__(self): + """iterate over kwargs and pad them to length of self.length""" + for iteration in range(self.length): + kwargs = {} + for key, value in self.kwargs.items(): + kwargs[key] = self._resolve(value, iteration) + yield kwargs + + +def typer_option_resolve(*args): + """returns the value or the default value""" + if len(args) == 1: + return ( + args[0].default # if it is a typer option + if hasattr(args[0], "default") and hasattr(args[0], "envvar") + else args[0] # if it is a normal value + ) + return (a.default if (hasattr(a, "default") and hasattr(a, "envvar")) else a for a in args) + + +def _construct(name: str): + """constructs the default entry and type hint for the variable name""" + return dict( + # gets the default value from the ENV Manager + default=getattr(MANAGER, name), + # envvar is a dummy that is there for documentation purposes. + envvar=f"`{MANAGER.to_name(name)}`", + ) + + # CLI if CHECK_TYPER.is_available: CHECK_TYPER.mark_required() @@ -331,68 +395,6 @@ def cli(): ) tp() -# helper functions for the CLI - - -def _construct(name: str): - """constructs the default entry and type hint for the variable name""" - return dict( - # gets the default value from the ENV Manager - default=getattr(MANAGER, name), - # envvar is a dummy that is there for documentation purposes. 
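-        # the env value is resolved via the MANAGER default above; typer shows this only in --help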
- envvar=f"`{MANAGER.to_name(name)}`", - ) - - -def validate_url(path: str): - """ - This regex matches: - - An empty string or A single '/' - - A string that starts with '/' and does not end with '/' - """ - if re.match(r"^$|^/$|^/.*[^/]$", path): - return path - raise typer.BadParameter("Path must start with '/' and must not end with '/'") - - -class AutoPadding: - """itertools.cycle with custom behaviour to pad to max length""" - - def __init__(self, length: int, **kwargs): - self.length = length - self.kwargs = kwargs - - def _resolve(self, x, iteration: int): - """pad x to length of self.length""" - x = typer_option_resolve(x) - if not isinstance(x, (list, tuple)): - return x - elif len(x) == 1: - return x[0] - elif len(x) == self.length: - return x[iteration] - else: - raise ValueError(f"Expected length {self.length} but got {len(x)}") - - def __iter__(self): - """iterate over kwargs and pad them to length of self.length""" - for iteration in range(self.length): - kwargs = {} - for key, value in self.kwargs.items(): - kwargs[key] = self._resolve(value, iteration) - yield kwargs - - -def typer_option_resolve(*args): - """returns the value or the default value""" - if len(args) == 1: - return ( - args[0].default # if it is a typer option - if hasattr(args[0], "default") and hasattr(args[0], "envvar") - else args[0] # if it is a normal value - ) - return (a.default if (hasattr(a, "default") and hasattr(a, "envvar")) else a for a in args) - if __name__ == "__main__": if "cli" in locals(): diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py index 9175f6bc..740feb95 100644 --- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py +++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py @@ -380,7 +380,7 @@ def _publish_towards_model( Runs in a separate thread, returns when self._shutdown.is_set(). """ # max_n_batches: how many batches are set for switching to `max-throughput` mode - # in thoughput mode, read the last n-batches + # in throughput mode, read the last n-batches max_n_batches = 8 try: while not self._shutdown.is_set(): diff --git a/libs/infinity_emb/infinity_emb/inference/caching_layer.py b/libs/infinity_emb/infinity_emb/inference/caching_layer.py index 9a6aafa9..4a5628f5 100644 --- a/libs/infinity_emb/infinity_emb/inference/caching_layer.py +++ b/libs/infinity_emb/infinity_emb/inference/caching_layer.py @@ -23,7 +23,7 @@ class Cache: """wrapper around DiskCache. The Diskcache in infinity `races` against the model inference. 
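     Whichever of the two finishes first supplies the result.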
- + The concept is that with the code be """ diff --git a/libs/infinity_emb/tests/unit_test/test_infinity_server.py b/libs/infinity_emb/tests/unit_test/test_infinity_server.py index 00191130..20d8207d 100644 --- a/libs/infinity_emb/tests/unit_test/test_infinity_server.py +++ b/libs/infinity_emb/tests/unit_test/test_infinity_server.py @@ -9,6 +9,8 @@ from infinity_emb.infinity_server import ( create_server, ) +from infinity_emb.cli import v1, v2 + from infinity_emb.cli import ( UVICORN_LOG_LEVELS, From f5d2d6f43013ebe91b737f967f52c30313133923 Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Tue, 31 Dec 2024 10:14:41 +0100 Subject: [PATCH 03/12] update cli definition --- libs/infinity_emb/infinity_emb/cli.py | 2 +- libs/infinity_emb/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/infinity_emb/infinity_emb/cli.py b/libs/infinity_emb/infinity_emb/cli.py index dd8a6d8c..03ba2189 100644 --- a/libs/infinity_emb/infinity_emb/cli.py +++ b/libs/infinity_emb/infinity_emb/cli.py @@ -92,7 +92,7 @@ def _construct(name: str): import typer import uvicorn - # path the asncio scheduler with uvloop + # patch the asyncio scheduler with uvloop # which has theoretical speed-ups vs asyncio loopname = "auto" if sys.version_info < (3, 12): diff --git a/libs/infinity_emb/pyproject.toml b/libs/infinity_emb/pyproject.toml index 1bd621d1..d4787830 100644 --- a/libs/infinity_emb/pyproject.toml +++ b/libs/infinity_emb/pyproject.toml @@ -59,7 +59,7 @@ soundfile = {version="^0.12.1", optional=true} [tool.poetry.scripts] -infinity_emb = "infinity_emb.infinity_server:cli" +infinity_emb = "infinity_emb.cli:cli" [tool.poetry.group.test.dependencies] pytest = "^8.0.0" From 46dc3d79f7f755bac4f12cba5ec7b777596bea6b Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Tue, 31 Dec 2024 10:31:28 +0100 Subject: [PATCH 04/12] update: openapi --- docs/assets/openapi.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/assets/openapi.json b/docs/assets/openapi.json index 497a0bc5..0ee52977 100644 --- a/docs/assets/openapi.json +++ b/docs/assets/openapi.json @@ -1 +1 @@ -{"openapi":"3.1.0","info":{"title":"♾️ Infinity - Embedding Inference Server","summary":"Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip. Infinity is developed under MIT License at https://github.com/michaelfeil/infinity.","contact":{"name":"Michael Feil, Raphael Wirth"},"license":{"name":"MIT License","identifier":"MIT"},"version":"0.0.72"},"paths":{"/health":{"get":{"summary":" Health","description":"health check endpoint\n\nReturns:\n dict(unix=float): dict with unix time stamp","operationId":"health","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"type":"number"},"type":"object","title":"Response Health"}}}}}}},"/":{"get":{"summary":"Redirect","operationId":"redirect__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/models":{"get":{"summary":" Models","description":"get models endpoint","operationId":"models","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIModelInfo"}}}}}}},"/embeddings":{"post":{"summary":" Embeddings","description":"Encode Embeddings. Supports with multimodal inputs. 
Aligned with OpenAI Embeddings API.\n\n## Running Text Embeddings\n```python\nimport requests, base64\nrequests.post(\"http://..:7997/embeddings\",\n json={\"model\":\"openai/clip-vit-base-patch32\",\"input\":[\"Two cute cats.\"]})\n```\n\n## Running Image Embeddings\n```python\nrequests.post(\"http://..:7997/embeddings\",\n json={\n \"model\": \"openai/clip-vit-base-patch32\",\n \"encoding_format\": \"base64\",\n \"input\": [\n \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n # can also be base64 encoded\n ],\n # set extra modality to image to process as image\n \"modality\": \"image\"\n)\n```\n\n## Running Audio Embeddings\n```python\nimport requests, base64\nurl = \"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav\"\n\ndef url_to_base64(url, modality = \"image\"):\n '''small helper to convert url to base64 without server requiring access to the url'''\n response = requests.get(url)\n response.raise_for_status()\n base64_encoded = base64.b64encode(response.content).decode('utf-8')\n mimetype = f\"{modality}/{url.split('.')[-1]}\"\n return f\"data:{mimetype};base64,{base64_encoded}\"\n\nrequests.post(\"http://localhost:7997/embeddings\",\n json={\n \"model\": \"laion/larger_clap_general\",\n \"encoding_format\": \"float\",\n \"input\": [\n url, url_to_base64(url, \"audio\")\n ],\n # set extra modality to audio to process as audio\n \"modality\": \"audio\"\n }\n)\n```\n\n## Running via OpenAI Client\n```python\nfrom openai import OpenAI # pip install openai==1.51.0\nclient = OpenAI(base_url=\"http://localhost:7997/\")\nclient.embeddings.create(\n model=\"laion/larger_clap_general\",\n input=[url_to_base64(url, \"audio\")],\n encoding_format=\"float\",\n extra_body={\n \"modality\": \"audio\"\n }\n)\n\nclient.embeddings.create(\n model=\"laion/larger_clap_general\",\n input=[\"the sound of a beep\", \"the sound of a cat\"],\n encoding_format=\"base64\", # base64: optional high performance setting\n extra_body={\n \"modality\": \"text\"\n }\n)\n```\n\n### Hint: Run all the above models on one server:\n```bash\ninfinity_emb v2 --model-id BAAI/bge-small-en-v1.5 --model-id openai/clip-vit-base-patch32 --model-id laion/larger_clap_general\n```","operationId":"embeddings","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/MultiModalOpenAIEmbedding"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/rerank":{"post":{"summary":" Rerank","description":"Rerank documents. 
Aligned with Cohere API (https://docs.cohere.com/reference/rerank)\n\n```python\nimport requests\nrequests.post(\"http://..:7997/rerank\",\n json={\n \"model\":\"mixedbread-ai/mxbai-rerank-xsmall-v1\",\n \"query\":\"Where is Munich?\",\n \"documents\":[\"Munich is in Germany.\", \"The sky is blue.\"]\n })\n```","operationId":"rerank","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/RerankInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ReRankResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/classify":{"post":{"summary":" Classify","description":"Score or Classify Sentiments\n\n```python\nimport requests\nrequests.post(\"http://..:7997/classify\",\n json={\"model\":\"SamLowe/roberta-base-go_emotions\",\"input\":[\"I am not having a great day.\"]})\n```","operationId":"classify","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClassifyInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClassifyResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/embeddings_image":{"post":{"summary":"Deprecated: Use `embeddings` with `modality` set to `image`","description":"Encode Embeddings from Image files\n\nSupports URLs of Images and Base64-encoded Images\n\n```python\nimport requests\nrequests.post(\"http://..:7997/embeddings_image\",\n json={\n \"model\":\"openai/clip-vit-base-patch32\",\n \"input\": [\n \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n \"data:image/png;base64,iVBORw0KGgoDEMOoSAMPLEoENCODEDIMAGE\"\n ]\n })\n```","operationId":"embeddings_image","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ImageEmbeddingInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"deprecated":true}},"/embeddings_audio":{"post":{"summary":"Deprecated: Use `embeddings` with `modality` set to `audio`","description":"Encode Embeddings from Audio files\n\nSupports URLs of Audios and Base64-encoded Audios\n\n```python\nimport requests\nrequests.post(\"http://..:7997/embeddings_audio\",\n json={\n \"model\":\"laion/larger_clap_general\",\n \"input\": [\n \"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav\",\n \"data:audio/wav;base64,iVBORw0KGgoDEMOoSAMPLEoENCODEDAUDIO\"\n ]\n })\n```","operationId":"embeddings_audio","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/AudioEmbeddingInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"deprecated":true}},"/metrics":{"get":{"summary":"Metrics","description":"Endpoint that serves Prometheus 
metrics.","operationId":"metrics_metrics_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}}},"components":{"schemas":{"AudioEmbeddingInput":{"properties":{"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"}},"type":"object","required":["input"],"title":"AudioEmbeddingInput","description":"LEGACY, DO NO LONGER UPDATE"},"ClassifyInput":{"properties":{"input":{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1,"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"raw_scores":{"type":"boolean","title":"Raw Scores","default":false}},"type":"object","required":["input"],"title":"ClassifyInput"},"ClassifyResult":{"properties":{"object":{"type":"string","enum":["classify"],"const":"classify","title":"Object","default":"classify"},"data":{"items":{"items":{"$ref":"#/components/schemas/_ClassifyObject"},"type":"array"},"type":"array","title":"Data"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["data","model","usage"],"title":"ClassifyResult","description":"Result of 
classification."},"EmbeddingEncodingFormat":{"type":"string","enum":["float","base64"],"title":"EmbeddingEncodingFormat"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ImageEmbeddingInput":{"properties":{"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"}},"type":"object","required":["input"],"title":"ImageEmbeddingInput","description":"LEGACY, DO NO LONGER UPDATE"},"ModelInfo":{"properties":{"id":{"type":"string","title":"Id"},"stats":{"type":"object","title":"Stats"},"object":{"type":"string","enum":["model"],"const":"model","title":"Object","default":"model"},"owned_by":{"type":"string","enum":["infinity"],"const":"infinity","title":"Owned By","default":"infinity"},"created":{"type":"integer","title":"Created"},"backend":{"type":"string","title":"Backend","default":""},"capabilities":{"items":{"type":"string"},"type":"array","uniqueItems":true,"title":"Capabilities","default":[]}},"type":"object","required":["id","stats"],"title":"ModelInfo"},"MultiModalOpenAIEmbedding":{"oneOf":[{"$ref":"#/components/schemas/_OpenAIEmbeddingInput_Text"},{"$ref":"#/components/schemas/OpenAIEmbeddingInput_Audio"},{"$ref":"#/components/schemas/OpenAIEmbeddingInput_Image"}],"title":"MultiModalOpenAIEmbedding"},"OpenAIEmbeddingInput_Audio":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"modality":{"type":"string","enum":["audio"],"const":"audio","title":"Modality","default":"audio"}},"type"
:"object","required":["input"],"title":"OpenAIEmbeddingInput_Audio"},"OpenAIEmbeddingInput_Image":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"modality":{"type":"string","enum":["image"],"const":"image","title":"Modality","default":"image"}},"type":"object","required":["input"],"title":"OpenAIEmbeddingInput_Image"},"OpenAIEmbeddingResult":{"properties":{"object":{"type":"string","enum":["list"],"const":"list","title":"Object","default":"list"},"data":{"items":{"$ref":"#/components/schemas/_EmbeddingObject"},"type":"array","title":"Data"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["data","model","usage"],"title":"OpenAIEmbeddingResult"},"OpenAIModelInfo":{"properties":{"data":{"items":{"$ref":"#/components/schemas/ModelInfo"},"type":"array","title":"Data"},"object":{"type":"string","title":"Object","default":"list"}},"type":"object","required":["data"],"title":"OpenAIModelInfo"},"ReRankResult":{"properties":{"object":{"type":"string","enum":["rerank"],"const":"rerank","title":"Object","default":"rerank"},"results":{"items":{"$ref":"#/components/schemas/_ReRankObject"},"type":"array","title":"Results"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["results","model","usage"],"title":"ReRankResult","description":"Following the Cohere protocol for Rerankers."},"RerankInput":{"properties":{"query":{"type":"string","maxLength":122880,"title":"Query"},"documents":{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1,"title":"Documents"},"return_documents":{"type":"boolean","title":"Return Documents","default":false},"raw_scores":{"type":"boolean","title":"Raw Scores","default":false},"model":{"type":"string","title":"Model","default":"default/not-specified"},"top_n":{"anyOf":[{"type":"integer","exclusiveMinimum":0.0},{"type":"null"}],"title":"Top N"}},"type":"object","required":["query","documents"],"title":"RerankInput","description":"Input for reranking"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error 
Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"_ClassifyObject":{"properties":{"score":{"type":"number","title":"Score"},"label":{"type":"string","title":"Label"}},"type":"object","required":["score","label"],"title":"_ClassifyObject"},"_EmbeddingObject":{"properties":{"object":{"type":"string","enum":["embedding"],"const":"embedding","title":"Object","default":"embedding"},"embedding":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"string","format":"binary"},{"items":{"items":{"type":"number"},"type":"array"},"type":"array"}],"title":"Embedding"},"index":{"type":"integer","title":"Index"}},"type":"object","required":["embedding","index"],"title":"_EmbeddingObject"},"_OpenAIEmbeddingInput_Text":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1},{"type":"string","maxLength":122880}],"title":"Input"},"modality":{"type":"string","enum":["text"],"const":"text","title":"Modality","default":"text"}},"type":"object","required":["input"],"title":"_OpenAIEmbeddingInput_Text","description":"helper"},"_ReRankObject":{"properties":{"relevance_score":{"type":"number","title":"Relevance Score"},"index":{"type":"integer","title":"Index"},"document":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Document"}},"type":"object","required":["relevance_score","index"],"title":"_ReRankObject"},"_Usage":{"properties":{"prompt_tokens":{"type":"integer","title":"Prompt Tokens"},"total_tokens":{"type":"integer","title":"Total Tokens"}},"type":"object","required":["prompt_tokens","total_tokens"],"title":"_Usage"}}}} \ No newline at end of file +{"openapi":"3.1.0","info":{"title":"♾️ Infinity - Embedding Inference Server","summary":"Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip. Infinity is developed under MIT License at https://github.com/michaelfeil/infinity.","contact":{"name":"Michael Feil, Raphael Wirth"},"license":{"name":"MIT License","identifier":"MIT"},"version":"0.0.73"},"paths":{"/health":{"get":{"summary":" Health","description":"health check endpoint\n\nReturns:\n dict(unix=float): dict with unix time stamp","operationId":"health","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"type":"number"},"type":"object","title":"Response Health"}}}}}}},"/":{"get":{"summary":"Redirect","operationId":"redirect__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/models":{"get":{"summary":" Models","description":"get models endpoint","operationId":"models","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIModelInfo"}}}}}}},"/embeddings":{"post":{"summary":" Embeddings","description":"Encode Embeddings. Supports with multimodal inputs. 
Aligned with OpenAI Embeddings API.\n\n## Running Text Embeddings\n```python\nimport requests, base64\nrequests.post(\"http://..:7997/embeddings\",\n json={\"model\":\"openai/clip-vit-base-patch32\",\"input\":[\"Two cute cats.\"]})\n```\n\n## Running Image Embeddings\n```python\nrequests.post(\"http://..:7997/embeddings\",\n json={\n \"model\": \"openai/clip-vit-base-patch32\",\n \"encoding_format\": \"base64\",\n \"input\": [\n \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n # can also be base64 encoded\n ],\n # set extra modality to image to process as image\n \"modality\": \"image\"\n)\n```\n\n## Running Audio Embeddings\n```python\nimport requests, base64\nurl = \"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav\"\n\ndef url_to_base64(url, modality = \"image\"):\n '''small helper to convert url to base64 without server requiring access to the url'''\n response = requests.get(url)\n response.raise_for_status()\n base64_encoded = base64.b64encode(response.content).decode('utf-8')\n mimetype = f\"{modality}/{url.split('.')[-1]}\"\n return f\"data:{mimetype};base64,{base64_encoded}\"\n\nrequests.post(\"http://localhost:7997/embeddings\",\n json={\n \"model\": \"laion/larger_clap_general\",\n \"encoding_format\": \"float\",\n \"input\": [\n url, url_to_base64(url, \"audio\")\n ],\n # set extra modality to audio to process as audio\n \"modality\": \"audio\"\n }\n)\n```\n\n## Running via OpenAI Client\n```python\nfrom openai import OpenAI # pip install openai==1.51.0\nclient = OpenAI(base_url=\"http://localhost:7997/\")\nclient.embeddings.create(\n model=\"laion/larger_clap_general\",\n input=[url_to_base64(url, \"audio\")],\n encoding_format=\"float\",\n extra_body={\n \"modality\": \"audio\"\n }\n)\n\nclient.embeddings.create(\n model=\"laion/larger_clap_general\",\n input=[\"the sound of a beep\", \"the sound of a cat\"],\n encoding_format=\"base64\", # base64: optional high performance setting\n extra_body={\n \"modality\": \"text\"\n }\n)\n```\n\n### Hint: Run all the above models on one server:\n```bash\ninfinity_emb v2 --model-id BAAI/bge-small-en-v1.5 --model-id openai/clip-vit-base-patch32 --model-id laion/larger_clap_general\n```","operationId":"embeddings","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/MultiModalOpenAIEmbedding"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/rerank":{"post":{"summary":" Rerank","description":"Rerank documents. 
Aligned with Cohere API (https://docs.cohere.com/reference/rerank)\n\n```python\nimport requests\nrequests.post(\"http://..:7997/rerank\",\n json={\n \"model\":\"mixedbread-ai/mxbai-rerank-xsmall-v1\",\n \"query\":\"Where is Munich?\",\n \"documents\":[\"Munich is in Germany.\", \"The sky is blue.\"]\n })\n```","operationId":"rerank","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/RerankInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ReRankResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/classify":{"post":{"summary":" Classify","description":"Score or Classify Sentiments\n\n```python\nimport requests\nrequests.post(\"http://..:7997/classify\",\n json={\"model\":\"SamLowe/roberta-base-go_emotions\",\"input\":[\"I am not having a great day.\"]})\n```","operationId":"classify","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClassifyInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClassifyResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/embeddings_image":{"post":{"summary":"Deprecated: Use `embeddings` with `modality` set to `image`","description":"Encode Embeddings from Image files\n\nSupports URLs of Images and Base64-encoded Images\n\n```python\nimport requests\nrequests.post(\"http://..:7997/embeddings_image\",\n json={\n \"model\":\"openai/clip-vit-base-patch32\",\n \"input\": [\n \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n \"data:image/png;base64,iVBORw0KGgoDEMOoSAMPLEoENCODEDIMAGE\"\n ]\n })\n```","operationId":"embeddings_image","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ImageEmbeddingInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"deprecated":true}},"/embeddings_audio":{"post":{"summary":"Deprecated: Use `embeddings` with `modality` set to `audio`","description":"Encode Embeddings from Audio files\n\nSupports URLs of Audios and Base64-encoded Audios\n\n```python\nimport requests\nrequests.post(\"http://..:7997/embeddings_audio\",\n json={\n \"model\":\"laion/larger_clap_general\",\n \"input\": [\n \"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav\",\n \"data:audio/wav;base64,iVBORw0KGgoDEMOoSAMPLEoENCODEDAUDIO\"\n ]\n })\n```","operationId":"embeddings_audio","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/AudioEmbeddingInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"deprecated":true}},"/metrics":{"get":{"summary":"Metrics","description":"Endpoint that serves Prometheus 
metrics.","operationId":"metrics_metrics_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}}},"components":{"schemas":{"AudioEmbeddingInput":{"properties":{"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"}},"type":"object","required":["input"],"title":"AudioEmbeddingInput","description":"LEGACY, DO NO LONGER UPDATE"},"ClassifyInput":{"properties":{"input":{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1,"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"raw_scores":{"type":"boolean","title":"Raw Scores","default":false}},"type":"object","required":["input"],"title":"ClassifyInput"},"ClassifyResult":{"properties":{"object":{"type":"string","enum":["classify"],"const":"classify","title":"Object","default":"classify"},"data":{"items":{"items":{"$ref":"#/components/schemas/_ClassifyObject"},"type":"array"},"type":"array","title":"Data"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["data","model","usage"],"title":"ClassifyResult","description":"Result of 
classification."},"EmbeddingEncodingFormat":{"type":"string","enum":["float","base64"],"title":"EmbeddingEncodingFormat"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ImageEmbeddingInput":{"properties":{"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"}},"type":"object","required":["input"],"title":"ImageEmbeddingInput","description":"LEGACY, DO NO LONGER UPDATE"},"ModelInfo":{"properties":{"id":{"type":"string","title":"Id"},"stats":{"type":"object","title":"Stats"},"object":{"type":"string","enum":["model"],"const":"model","title":"Object","default":"model"},"owned_by":{"type":"string","enum":["infinity"],"const":"infinity","title":"Owned By","default":"infinity"},"created":{"type":"integer","title":"Created"},"backend":{"type":"string","title":"Backend","default":""},"capabilities":{"items":{"type":"string"},"type":"array","uniqueItems":true,"title":"Capabilities","default":[]}},"type":"object","required":["id","stats"],"title":"ModelInfo"},"MultiModalOpenAIEmbedding":{"oneOf":[{"$ref":"#/components/schemas/_OpenAIEmbeddingInput_Text"},{"$ref":"#/components/schemas/OpenAIEmbeddingInput_Audio"},{"$ref":"#/components/schemas/OpenAIEmbeddingInput_Image"}],"title":"MultiModalOpenAIEmbedding"},"OpenAIEmbeddingInput_Audio":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"modality":{"type":"string","enum":["audio"],"const":"audio","title":"Modality","default":"audio"}},"type"
:"object","required":["input"],"title":"OpenAIEmbeddingInput_Audio"},"OpenAIEmbeddingInput_Image":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"modality":{"type":"string","enum":["image"],"const":"image","title":"Modality","default":"image"}},"type":"object","required":["input"],"title":"OpenAIEmbeddingInput_Image"},"OpenAIEmbeddingResult":{"properties":{"object":{"type":"string","enum":["list"],"const":"list","title":"Object","default":"list"},"data":{"items":{"$ref":"#/components/schemas/_EmbeddingObject"},"type":"array","title":"Data"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["data","model","usage"],"title":"OpenAIEmbeddingResult"},"OpenAIModelInfo":{"properties":{"data":{"items":{"$ref":"#/components/schemas/ModelInfo"},"type":"array","title":"Data"},"object":{"type":"string","title":"Object","default":"list"}},"type":"object","required":["data"],"title":"OpenAIModelInfo"},"ReRankResult":{"properties":{"object":{"type":"string","enum":["rerank"],"const":"rerank","title":"Object","default":"rerank"},"results":{"items":{"$ref":"#/components/schemas/_ReRankObject"},"type":"array","title":"Results"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["results","model","usage"],"title":"ReRankResult","description":"Following the Cohere protocol for Rerankers."},"RerankInput":{"properties":{"query":{"type":"string","maxLength":122880,"title":"Query"},"documents":{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1,"title":"Documents"},"return_documents":{"type":"boolean","title":"Return Documents","default":false},"raw_scores":{"type":"boolean","title":"Raw Scores","default":false},"model":{"type":"string","title":"Model","default":"default/not-specified"},"top_n":{"anyOf":[{"type":"integer","exclusiveMinimum":0.0},{"type":"null"}],"title":"Top N"}},"type":"object","required":["query","documents"],"title":"RerankInput","description":"Input for reranking"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error 
Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"_ClassifyObject":{"properties":{"score":{"type":"number","title":"Score"},"label":{"type":"string","title":"Label"}},"type":"object","required":["score","label"],"title":"_ClassifyObject"},"_EmbeddingObject":{"properties":{"object":{"type":"string","enum":["embedding"],"const":"embedding","title":"Object","default":"embedding"},"embedding":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"string","format":"binary"},{"items":{"items":{"type":"number"},"type":"array"},"type":"array"}],"title":"Embedding"},"index":{"type":"integer","title":"Index"}},"type":"object","required":["embedding","index"],"title":"_EmbeddingObject"},"_OpenAIEmbeddingInput_Text":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1},{"type":"string","maxLength":122880}],"title":"Input"},"modality":{"type":"string","enum":["text"],"const":"text","title":"Modality","default":"text"}},"type":"object","required":["input"],"title":"_OpenAIEmbeddingInput_Text","description":"helper"},"_ReRankObject":{"properties":{"relevance_score":{"type":"number","title":"Relevance Score"},"index":{"type":"integer","title":"Index"},"document":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Document"}},"type":"object","required":["relevance_score","index"],"title":"_ReRankObject"},"_Usage":{"properties":{"prompt_tokens":{"type":"integer","title":"Prompt Tokens"},"total_tokens":{"type":"integer","title":"Total Tokens"}},"type":"object","required":["prompt_tokens","total_tokens"],"title":"_Usage"}}}} \ No newline at end of file From 093b78addd5ba889e1c9553fc89d8df5be45dc18 Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Tue, 31 Dec 2024 10:34:48 +0100 Subject: [PATCH 05/12] undo: infer changes --- libs/embed_package/embed/_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/embed_package/embed/_infer.py b/libs/embed_package/embed/_infer.py index ffee6422..93e185c8 100644 --- a/libs/embed_package/embed/_infer.py +++ b/libs/embed_package/embed/_infer.py @@ -2,7 +2,7 @@ from typing import Collection, Literal, Union from infinity_emb import EngineArgs, SyncEngineArray # type: ignore -from infinity_emb.cli import AutoPadding +from infinity_emb.infinity_server import AutoPadding __all__ = ["BatchedInference"] From 6ddbeeff243443dce7e46a39dfe3356df3095c35 Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Tue, 31 Dec 2024 10:41:24 +0100 Subject: [PATCH 06/12] loosen: openai-restrictions --- .../tests/end_to_end/test_openapi_client_compat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/infinity_emb/tests/end_to_end/test_openapi_client_compat.py b/libs/infinity_emb/tests/end_to_end/test_openapi_client_compat.py index 19f5a385..bd7f35ad 100644 --- a/libs/infinity_emb/tests/end_to_end/test_openapi_client_compat.py +++ b/libs/infinity_emb/tests/end_to_end/test_openapi_client_compat.py @@ -132,7 +132,7 @@ async def test_openai(client: AsyncClient): # test AUDIO: cosine distance of beep to cat and dog np.testing.assert_allclose( - 
emb1_audio.data[0].embedding, emb1_1_audio.data[0].embedding, rtol=1e-5 + emb1_audio.data[0].embedding, emb1_1_audio.data[0].embedding, rtol=1e-4 ) assert all( np.dot(emb1_audio.data[0].embedding, emb1_audio_from_text.data[0].embedding) @@ -142,7 +142,7 @@ async def test_openai(client: AsyncClient): # test IMAGE: cosine distance of cat to dog and bird np.testing.assert_allclose( - emb_1_image.data[0].embedding, emb_1_1_image.data[0].embedding, rtol=1e-5 + emb_1_image.data[0].embedding, emb_1_1_image.data[0].embedding, rtol=1e-4 ) assert all( np.dot(emb_1_image.data[0].embedding, emb_1_image_from_text.data[0].embedding) @@ -152,7 +152,7 @@ async def test_openai(client: AsyncClient): # test TEXT: cosine distance of cat to dog and bird np.testing.assert_allclose( - emb_1_text.data[0].embedding, emb_1_text.data[1].embedding, rtol=1e-5 + emb_1_text.data[0].embedding, emb_1_text.data[1].embedding, rtol=1e-4 ) # wrong key From f995cfac71834953c4163e5537ecbf2a866af65a Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Tue, 31 Dec 2024 10:54:50 +0100 Subject: [PATCH 07/12] update docs, update mac unit tests --- README.md | 2 +- docs/docs/cli_v2.md | 2 +- libs/infinity_emb/infinity_emb/cli.py | 2 +- .../crossencoder/test_torch_crossencoder.py | 10 ++++++++-- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8980e8c3..aaffc187 100644 --- a/README.md +++ b/README.md @@ -159,7 +159,7 @@ The cache path inside the docker container is set by the environment variable `H Checkout `infinity_emb v2 --help` for all args and validation. Multiple Model CLI Playbook: - - 1. cli options can be repeated e.g. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4`. This will create two models `model/id1` and `model/id2` + - 1. cli options can be repeated e.g. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4`. This will create two models `model/id1` and `model/id2` - 2. or adapt the defaults by setting ENV Variables separated by `;`: `INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;"` - 3. single items are broadcasted to `--model-id` length, `v2 --model-id model/id1 --model-id/id2 --batch-size 8` making both models have batch-size 8. - 4. Everything is broadcasted to the number of `--model-id` + API requests are routed to the `--served-model-name/--model-id` diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md index 51d7de54..13e081c9 100644 --- a/docs/docs/cli_v2.md +++ b/docs/docs/cli_v2.md @@ -11,7 +11,7 @@ $ infinity_emb v2 --help Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil Multiple Model CLI Playbook: - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4` + - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size diff --git a/libs/infinity_emb/infinity_emb/cli.py b/libs/infinity_emb/infinity_emb/cli.py index 03ba2189..21a099f3 100644 --- a/libs/infinity_emb/infinity_emb/cli.py +++ b/libs/infinity_emb/infinity_emb/cli.py @@ -274,7 +274,7 @@ def v2( """Infinity API ♾️ cli v2. MIT License. 
Copyright (c) 2023-now Michael Feil \n \n Multiple Model CLI Playbook: \n - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4` \n + - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` \n - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" \n - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both models have batch-size 8. \n """ diff --git a/libs/infinity_emb/tests/unit_test/transformer/crossencoder/test_torch_crossencoder.py b/libs/infinity_emb/tests/unit_test/transformer/crossencoder/test_torch_crossencoder.py index dff9065b..9d58041e 100644 --- a/libs/infinity_emb/tests/unit_test/transformer/crossencoder/test_torch_crossencoder.py +++ b/libs/infinity_emb/tests/unit_test/transformer/crossencoder/test_torch_crossencoder.py @@ -5,6 +5,11 @@ from infinity_emb.args import EngineArgs from infinity_emb.transformer.crossencoder.torch import CrossEncoderPatched +from infinity_emb.primitives import Device + +import torch + +device = Device.cpu if torch.backends.mps.is_available() else Device.auto SHOULD_TORCH_COMPILE = sys.platform == "linux" and sys.version_info < (3, 12) @@ -14,6 +19,7 @@ def test_crossencoder(): engine_args=EngineArgs( model_name_or_path="mixedbread-ai/mxbai-rerank-xsmall-v1", compile=SHOULD_TORCH_COMPILE, + device=device, ) ) @@ -37,10 +43,10 @@ def test_crossencoder(): def test_patched_crossencoder_vs_sentence_transformers(): model = CrossEncoderPatched( engine_args=EngineArgs( - model_name_or_path="mixedbread-ai/mxbai-rerank-xsmall-v1", compile=True + model_name_or_path="mixedbread-ai/mxbai-rerank-xsmall-v1", compile=True, device=device ) ) - model_unpatched = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1", trust_remote_code=True) + model_unpatched = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1") query = "Where is Paris?" documents = [ From 990f4d03950a74b2e9be10c9e98e811a54582afe Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Tue, 31 Dec 2024 12:42:25 +0100 Subject: [PATCH 08/12] update: cli / docs --- docs/docs/cli_v2.md | 9 ++++---- libs/infinity_emb/infinity_emb/cli.py | 33 ++++++++++++++------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md index 13e081c9..eb734428 100644 --- a/docs/docs/cli_v2.md +++ b/docs/docs/cli_v2.md @@ -6,12 +6,13 @@ $ infinity_emb v2 --help ``` ``` - - Usage: infinity_emb v2 [OPTIONS] - + + Usage: infinity_emb v2 [OPTIONS] + + Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil Multiple Model CLI Playbook: - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` + - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" - 3. 
single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size diff --git a/libs/infinity_emb/infinity_emb/cli.py b/libs/infinity_emb/infinity_emb/cli.py index 21a099f3..0056e252 100644 --- a/libs/infinity_emb/infinity_emb/cli.py +++ b/libs/infinity_emb/infinity_emb/cli.py @@ -378,22 +378,23 @@ def v2( loop=loopname, # type: ignore ) - def cli(): - CHECK_TYPER.mark_required() - if len(sys.argv) == 1 or sys.argv[1] not in [ - "v1", - "v2", - "help", - "--help", - "--show-completion", - "--install-completion", - ]: - logger.critical( - "Error: No command given. Please use infinity with the `v2` command. " - f"This is deprecated since 0.0.32. You are on {infinity_emb.__version__}" - "Usage: `infinity_emb v2 --model-id BAAI/bge-large-en-v1.5" - ) - tp() + +def cli(): + CHECK_TYPER.mark_required() + if len(sys.argv) == 1 or sys.argv[1] not in [ + "v1", + "v2", + "help", + "--help", + "--show-completion", + "--install-completion", + ]: + logger.critical( + "Error: No command given. Please use infinity with the `v2` command. " + f"This is deprecated since 0.0.32. You are on {infinity_emb.__version__}" + "Usage: `infinity_emb v2 --model-id BAAI/bge-large-en-v1.5" + ) + tp() if __name__ == "__main__": From 286de3e525ad641bb7da5bc748e0dc4bf60fab3a Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Tue, 31 Dec 2024 12:52:49 +0100 Subject: [PATCH 09/12] add cli / test --- docs/docs/cli_v2.md | 9 ++++----- .../unit_test/{test_infinity_server.py => test_cli.py} | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) rename libs/infinity_emb/tests/unit_test/{test_infinity_server.py => test_cli.py} (99%) diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md index eb734428..0bae7fce 100644 --- a/docs/docs/cli_v2.md +++ b/docs/docs/cli_v2.md @@ -6,13 +6,12 @@ $ infinity_emb v2 --help ``` ``` - - Usage: infinity_emb v2 [OPTIONS] - - + + Usage: infinity_emb v2 [OPTIONS] + Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil Multiple Model CLI Playbook: - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` + - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size diff --git a/libs/infinity_emb/tests/unit_test/test_infinity_server.py b/libs/infinity_emb/tests/unit_test/test_cli.py similarity index 99% rename from libs/infinity_emb/tests/unit_test/test_infinity_server.py rename to libs/infinity_emb/tests/unit_test/test_cli.py index 20d8207d..942e01cf 100644 --- a/libs/infinity_emb/tests/unit_test/test_infinity_server.py +++ b/libs/infinity_emb/tests/unit_test/test_cli.py @@ -11,7 +11,6 @@ ) from infinity_emb.cli import v1, v2 - from infinity_emb.cli import ( UVICORN_LOG_LEVELS, Device, From 0e0612189877eb1f8bbae350be4ac436c0f836d9 Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Tue, 31 Dec 2024 13:14:37 +0100 Subject: [PATCH 10/12] refactor cli . 
inf server tests --- libs/infinity_emb/tests/unit_test/test_cli.py | 46 ------------------- .../tests/unit_test/test_infinity_server.py | 46 +++++++++++++++++++ 2 files changed, 46 insertions(+), 46 deletions(-) create mode 100644 libs/infinity_emb/tests/unit_test/test_infinity_server.py diff --git a/libs/infinity_emb/tests/unit_test/test_cli.py b/libs/infinity_emb/tests/unit_test/test_cli.py index 942e01cf..c392aa67 100644 --- a/libs/infinity_emb/tests/unit_test/test_cli.py +++ b/libs/infinity_emb/tests/unit_test/test_cli.py @@ -2,22 +2,6 @@ import sys import pytest -import uvicorn -from fastapi import FastAPI - -from infinity_emb.args import EngineArgs -from infinity_emb.infinity_server import ( - create_server, -) -from infinity_emb.cli import v1, v2 - -from infinity_emb.cli import ( - UVICORN_LOG_LEVELS, - Device, - Dtype, - InferenceEngine, - PoolingMethod, -) # only run subprocess on non-windows @@ -77,33 +61,3 @@ def test_cli_v2_weird(): def test_cli_preload(version): log = subprocess.run(["infinity_emb", f"{version}", "--preload-only"]) assert log.returncode == 0 - - -def test_create_server(): - app = create_server(engine_args_list=[EngineArgs(engine="debugengine")]) - assert isinstance(app, FastAPI) - - -def test_patched_create_uvicorn_v1(mocker): - mocker.patch("uvicorn.run") - v1( - log_level=UVICORN_LOG_LEVELS.debug, # type: ignore[arg-type] - engine=InferenceEngine.torch, - device=Device.auto, - dtype=Dtype.auto, - pooling_method=PoolingMethod.auto, - ) - assert uvicorn.run.call_count == 1 - - -def test_patched_create_uvicorn_v2(mocker): - mocker.patch("uvicorn.run") - v2( - log_level=UVICORN_LOG_LEVELS.debug, # type: ignore[arg-type] - engine=[InferenceEngine.torch], - model_id=["michaelfeil/bge-small-en-v1.5", "BAAI/bge-small-en-v1.5"], - device=[Device.auto], - dtype=[Dtype.auto], - pooling_method=[PoolingMethod.auto], - ) - assert uvicorn.run.call_count == 1 diff --git a/libs/infinity_emb/tests/unit_test/test_infinity_server.py b/libs/infinity_emb/tests/unit_test/test_infinity_server.py new file mode 100644 index 00000000..755a8eba --- /dev/null +++ b/libs/infinity_emb/tests/unit_test/test_infinity_server.py @@ -0,0 +1,46 @@ +import uvicorn +from fastapi import FastAPI + +from infinity_emb.args import EngineArgs +from infinity_emb.infinity_server import ( + create_server, +) +from infinity_emb.cli import v1, v2 + +from infinity_emb.cli import ( + UVICORN_LOG_LEVELS, + Device, + Dtype, + InferenceEngine, + PoolingMethod, +) + + +def test_create_server(): + app = create_server(engine_args_list=[EngineArgs(engine="debugengine")]) + assert isinstance(app, FastAPI) + + +def test_patched_create_uvicorn_v1(mocker): + mocker.patch("uvicorn.run") + v1( + log_level=UVICORN_LOG_LEVELS.debug, # type: ignore[arg-type] + engine=InferenceEngine.torch, + device=Device.auto, + dtype=Dtype.auto, + pooling_method=PoolingMethod.auto, + ) + assert uvicorn.run.call_count == 1 + + +def test_patched_create_uvicorn_v2(mocker): + mocker.patch("uvicorn.run") + v2( + log_level=UVICORN_LOG_LEVELS.debug, # type: ignore[arg-type] + engine=[InferenceEngine.torch], + model_id=["michaelfeil/bge-small-en-v1.5", "BAAI/bge-small-en-v1.5"], + device=[Device.auto], + dtype=[Dtype.auto], + pooling_method=[PoolingMethod.auto], + ) + assert uvicorn.run.call_count == 1 From bfff7d6f8a2bc5f622baa89f90751a97a1982d91 Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Wed, 1 Jan 2025 16:19:38 -0500 Subject: [PATCH 11/12] cli: remove defered typing --- 
libs/infinity_emb/infinity_emb/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/infinity_emb/infinity_emb/cli.py b/libs/infinity_emb/infinity_emb/cli.py index 0056e252..f9533866 100644 --- a/libs/infinity_emb/infinity_emb/cli.py +++ b/libs/infinity_emb/infinity_emb/cli.py @@ -199,7 +199,7 @@ def v2( **_construct("trust_remote_code"), help="if potential remote modeling code from huggingface repo is trusted.", ), - engine: list["InferenceEngine"] = typer.Option( + engine: list[InferenceEngine] = typer.Option( **_construct("engine"), help="Which backend to use. `torch` uses Pytorch GPU/CPU, optimum uses ONNX on GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses torch+ctranslate2 on CPU/GPU.", ), @@ -226,11 +226,11 @@ def v2( dtype: list[Dtype] = typer.Option( **_construct("dtype"), help="dtype for the model weights." ), - embedding_dtype: list["EmbeddingDtype"] = typer.Option( + embedding_dtype: list[EmbeddingDtype] = typer.Option( **_construct("embedding_dtype"), help="dtype post-forward pass. If != `float32`, using Post-Forward Static quantization.", ), - pooling_method: list["PoolingMethod"] = typer.Option( + pooling_method: list[PoolingMethod] = typer.Option( **_construct("pooling_method"), help="overwrite the pooling method if inferred incorrectly.", ), From 350f9d81629a6a3ae099b1742400850c68b1c3e9 Mon Sep 17 00:00:00 2001 From: Michael Feil <63565275+michaelfeil@users.noreply.github.com> Date: Wed, 1 Jan 2025 16:24:21 -0500 Subject: [PATCH 12/12] improve tolerance of embedding compat --- .../tests/end_to_end/test_openapi_client_compat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/infinity_emb/tests/end_to_end/test_openapi_client_compat.py b/libs/infinity_emb/tests/end_to_end/test_openapi_client_compat.py index bd7f35ad..333741fb 100644 --- a/libs/infinity_emb/tests/end_to_end/test_openapi_client_compat.py +++ b/libs/infinity_emb/tests/end_to_end/test_openapi_client_compat.py @@ -132,7 +132,7 @@ async def test_openai(client: AsyncClient): # test AUDIO: cosine distance of beep to cat and dog np.testing.assert_allclose( - emb1_audio.data[0].embedding, emb1_1_audio.data[0].embedding, rtol=1e-4 + emb1_audio.data[0].embedding, emb1_1_audio.data[0].embedding, rtol=1e-4, atol=1e-4 ) assert all( np.dot(emb1_audio.data[0].embedding, emb1_audio_from_text.data[0].embedding) @@ -142,7 +142,7 @@ async def test_openai(client: AsyncClient): # test IMAGE: cosine distance of cat to dog and bird np.testing.assert_allclose( - emb_1_image.data[0].embedding, emb_1_1_image.data[0].embedding, rtol=1e-4 + emb_1_image.data[0].embedding, emb_1_1_image.data[0].embedding, rtol=1e-4, atol=1e-4 ) assert all( np.dot(emb_1_image.data[0].embedding, emb_1_image_from_text.data[0].embedding) @@ -152,7 +152,7 @@ async def test_openai(client: AsyncClient): # test TEXT: cosine distance of cat to dog and bird np.testing.assert_allclose( - emb_1_text.data[0].embedding, emb_1_text.data[1].embedding, rtol=1e-4 + emb_1_text.data[0].embedding, emb_1_text.data[1].embedding, rtol=1e-4, atol=1e-4 ) # wrong key
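
A note on the tolerance change in this last patch: in `np.testing.assert_allclose`, `rtol` scales the permitted error with the magnitude of each component, so embedding components near zero are effectively required to match exactly; the added `atol` supplies an absolute floor. A small self-contained illustration (the values are made up for demonstration):

```python
import numpy as np

# Two embedding vectors that agree to ~1e-4, except for one component
# that hovers around zero with opposite signs.
a = np.array([1.0, 1e-9])
b = np.array([1.00005, -1e-9])

# rtol only: the near-zero component gets a tolerance of rtol*|b| ~ 1e-13,
# so the comparison fails even though both values are numerically negligible.
# np.testing.assert_allclose(a, b, rtol=1e-4)  # raises AssertionError

# rtol + atol: the absolute floor lets tiny sign flips near zero pass.
np.testing.assert_allclose(a, b, rtol=1e-4, atol=1e-4)
```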
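
Relatedly, when a client requests `"encoding_format": "base64"` (the "optional high performance setting" mentioned in the `/embeddings` docstring), the `embedding` field of each result arrives as a base64 string rather than a list of floats. A minimal decoding sketch, assuming the OpenAI-style convention of raw little-endian float32 bytes — worth verifying against the exact server version you run:

```python
import base64

import numpy as np


def decode_embedding(payload: str) -> np.ndarray:
    """Decode a base64 embedding payload into a float32 vector.

    Assumption: the bytes are raw little-endian float32, mirroring
    the OpenAI embeddings API convention.
    """
    return np.frombuffer(base64.b64decode(payload), dtype="<f4")


# usage with the requests examples above (hypothetical response object):
# vec = decode_embedding(resp.json()["data"][0]["embedding"])
```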
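
Finally, the `pattern` fields in the `AudioEmbeddingInput`/`ImageEmbeddingInput` schemas accept RFC 2397 data URIs and expose named groups (`mimetype`, `name`, `charset`, `base64`, `data`). A sketch of validating such an input client-side before posting it; the constant name below is illustrative, while the pattern itself is taken from the schema:

```python
import re

# Data-URI pattern as it appears in the embedding input schemas
# (named groups: mimetype, name, charset, base64, data).
DATA_URI_REGEX = (
    r"data:(?P<mimetype>[\w]+\/[\w\-\+\.]+)?"
    r"(?:\;name\=(?P<name>[\w\.\-%!*'~\(\)]+))?"
    r"(?:\;charset\=(?P<charset>[\w\-\+\.]+))?"
    r"(?P<base64>\;base64)?"
    r",(?P<data>.*)"
)

uri = "data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"
m = re.match(DATA_URI_REGEX, uri)
assert m is not None
print(m.group("mimetype"), m.group("charset"))  # -> text/plain utf-8
```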