diff --git a/README.md b/README.md
index 8980e8c3..aaffc187 100644
--- a/README.md
+++ b/README.md
@@ -159,7 +159,7 @@ The cache path inside the docker container is set by the environment variable `H
 Checkout `infinity_emb v2 --help` for all args and validation.
 Multiple Model CLI Playbook: 
- - 1. cli options can be repeated e.g. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4`. This will create two models `model/id1` and `model/id2`
+ - 1. cli options can be repeated e.g. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4`. This will create two models `model/id1` and `model/id2`
 - 2. or adapt the defaults by setting ENV Variables separated by `;`: `INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;"`
- - 3. single items are broadcasted to `--model-id` length, `v2 --model-id model/id1 --model-id/id2 --batch-size 8` making both models have batch-size 8.
+ - 3. single items are broadcasted to `--model-id` length, e.g. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8` gives both models batch-size 8.
 - 4. Everything is broadcasted to the number of `--model-id`
+ API requests are routed to the `--served-model-name` (which defaults to the `--model-id`)
diff --git a/docs/assets/openapi.json b/docs/assets/openapi.json
index 497a0bc5..0ee52977 100644
--- a/docs/assets/openapi.json
+++ b/docs/assets/openapi.json
@@ -1 +1 @@
-{"openapi":"3.1.0","info":{"title":"♾️ Infinity - Embedding Inference Server","summary":"Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip. Infinity is developed under MIT License at https://github.com/michaelfeil/infinity.","contact":{"name":"Michael Feil, Raphael Wirth"},"license":{"name":"MIT License","identifier":"MIT"},"version":"0.0.72"},"paths":{"/health":{"get":{"summary":" Health","description":"health check endpoint\n\nReturns:\n dict(unix=float): dict with unix time stamp","operationId":"health","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"type":"number"},"type":"object","title":"Response Health"}}}}}}},"/":{"get":{"summary":"Redirect","operationId":"redirect__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/models":{"get":{"summary":" Models","description":"get models endpoint","operationId":"models","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIModelInfo"}}}}}}},"/embeddings":{"post":{"summary":" Embeddings","description":"Encode Embeddings. Supports with multimodal inputs. 
Aligned with OpenAI Embeddings API.\n\n## Running Text Embeddings\n```python\nimport requests, base64\nrequests.post(\"http://..:7997/embeddings\",\n json={\"model\":\"openai/clip-vit-base-patch32\",\"input\":[\"Two cute cats.\"]})\n```\n\n## Running Image Embeddings\n```python\nrequests.post(\"http://..:7997/embeddings\",\n json={\n \"model\": \"openai/clip-vit-base-patch32\",\n \"encoding_format\": \"base64\",\n \"input\": [\n \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n # can also be base64 encoded\n ],\n # set extra modality to image to process as image\n \"modality\": \"image\"\n)\n```\n\n## Running Audio Embeddings\n```python\nimport requests, base64\nurl = \"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav\"\n\ndef url_to_base64(url, modality = \"image\"):\n '''small helper to convert url to base64 without server requiring access to the url'''\n response = requests.get(url)\n response.raise_for_status()\n base64_encoded = base64.b64encode(response.content).decode('utf-8')\n mimetype = f\"{modality}/{url.split('.')[-1]}\"\n return f\"data:{mimetype};base64,{base64_encoded}\"\n\nrequests.post(\"http://localhost:7997/embeddings\",\n json={\n \"model\": \"laion/larger_clap_general\",\n \"encoding_format\": \"float\",\n \"input\": [\n url, url_to_base64(url, \"audio\")\n ],\n # set extra modality to audio to process as audio\n \"modality\": \"audio\"\n }\n)\n```\n\n## Running via OpenAI Client\n```python\nfrom openai import OpenAI # pip install openai==1.51.0\nclient = OpenAI(base_url=\"http://localhost:7997/\")\nclient.embeddings.create(\n model=\"laion/larger_clap_general\",\n input=[url_to_base64(url, \"audio\")],\n encoding_format=\"float\",\n extra_body={\n \"modality\": \"audio\"\n }\n)\n\nclient.embeddings.create(\n model=\"laion/larger_clap_general\",\n input=[\"the sound of a beep\", \"the sound of a cat\"],\n encoding_format=\"base64\", # base64: optional high performance setting\n extra_body={\n \"modality\": \"text\"\n }\n)\n```\n\n### Hint: Run all the above models on one server:\n```bash\ninfinity_emb v2 --model-id BAAI/bge-small-en-v1.5 --model-id openai/clip-vit-base-patch32 --model-id laion/larger_clap_general\n```","operationId":"embeddings","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/MultiModalOpenAIEmbedding"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/rerank":{"post":{"summary":" Rerank","description":"Rerank documents. 
Aligned with Cohere API (https://docs.cohere.com/reference/rerank)\n\n```python\nimport requests\nrequests.post(\"http://..:7997/rerank\",\n json={\n \"model\":\"mixedbread-ai/mxbai-rerank-xsmall-v1\",\n \"query\":\"Where is Munich?\",\n \"documents\":[\"Munich is in Germany.\", \"The sky is blue.\"]\n })\n```","operationId":"rerank","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/RerankInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ReRankResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/classify":{"post":{"summary":" Classify","description":"Score or Classify Sentiments\n\n```python\nimport requests\nrequests.post(\"http://..:7997/classify\",\n json={\"model\":\"SamLowe/roberta-base-go_emotions\",\"input\":[\"I am not having a great day.\"]})\n```","operationId":"classify","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClassifyInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClassifyResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/embeddings_image":{"post":{"summary":"Deprecated: Use `embeddings` with `modality` set to `image`","description":"Encode Embeddings from Image files\n\nSupports URLs of Images and Base64-encoded Images\n\n```python\nimport requests\nrequests.post(\"http://..:7997/embeddings_image\",\n json={\n \"model\":\"openai/clip-vit-base-patch32\",\n \"input\": [\n \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n \"data:image/png;base64,iVBORw0KGgoDEMOoSAMPLEoENCODEDIMAGE\"\n ]\n })\n```","operationId":"embeddings_image","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ImageEmbeddingInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"deprecated":true}},"/embeddings_audio":{"post":{"summary":"Deprecated: Use `embeddings` with `modality` set to `audio`","description":"Encode Embeddings from Audio files\n\nSupports URLs of Audios and Base64-encoded Audios\n\n```python\nimport requests\nrequests.post(\"http://..:7997/embeddings_audio\",\n json={\n \"model\":\"laion/larger_clap_general\",\n \"input\": [\n \"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav\",\n \"data:audio/wav;base64,iVBORw0KGgoDEMOoSAMPLEoENCODEDAUDIO\"\n ]\n })\n```","operationId":"embeddings_audio","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/AudioEmbeddingInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"deprecated":true}},"/metrics":{"get":{"summary":"Metrics","description":"Endpoint that serves Prometheus 
metrics.","operationId":"metrics_metrics_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}}},"components":{"schemas":{"AudioEmbeddingInput":{"properties":{"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"}},"type":"object","required":["input"],"title":"AudioEmbeddingInput","description":"LEGACY, DO NO LONGER UPDATE"},"ClassifyInput":{"properties":{"input":{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1,"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"raw_scores":{"type":"boolean","title":"Raw Scores","default":false}},"type":"object","required":["input"],"title":"ClassifyInput"},"ClassifyResult":{"properties":{"object":{"type":"string","enum":["classify"],"const":"classify","title":"Object","default":"classify"},"data":{"items":{"items":{"$ref":"#/components/schemas/_ClassifyObject"},"type":"array"},"type":"array","title":"Data"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["data","model","usage"],"title":"ClassifyResult","description":"Result of 
classification."},"EmbeddingEncodingFormat":{"type":"string","enum":["float","base64"],"title":"EmbeddingEncodingFormat"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ImageEmbeddingInput":{"properties":{"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"}},"type":"object","required":["input"],"title":"ImageEmbeddingInput","description":"LEGACY, DO NO LONGER UPDATE"},"ModelInfo":{"properties":{"id":{"type":"string","title":"Id"},"stats":{"type":"object","title":"Stats"},"object":{"type":"string","enum":["model"],"const":"model","title":"Object","default":"model"},"owned_by":{"type":"string","enum":["infinity"],"const":"infinity","title":"Owned By","default":"infinity"},"created":{"type":"integer","title":"Created"},"backend":{"type":"string","title":"Backend","default":""},"capabilities":{"items":{"type":"string"},"type":"array","uniqueItems":true,"title":"Capabilities","default":[]}},"type":"object","required":["id","stats"],"title":"ModelInfo"},"MultiModalOpenAIEmbedding":{"oneOf":[{"$ref":"#/components/schemas/_OpenAIEmbeddingInput_Text"},{"$ref":"#/components/schemas/OpenAIEmbeddingInput_Audio"},{"$ref":"#/components/schemas/OpenAIEmbeddingInput_Image"}],"title":"MultiModalOpenAIEmbedding"},"OpenAIEmbeddingInput_Audio":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"modality":{"type":"string","enum":["audio"],"const":"audio","title":"Modality","default":"audio"}},"type"
:"object","required":["input"],"title":"OpenAIEmbeddingInput_Audio"},"OpenAIEmbeddingInput_Image":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"modality":{"type":"string","enum":["image"],"const":"image","title":"Modality","default":"image"}},"type":"object","required":["input"],"title":"OpenAIEmbeddingInput_Image"},"OpenAIEmbeddingResult":{"properties":{"object":{"type":"string","enum":["list"],"const":"list","title":"Object","default":"list"},"data":{"items":{"$ref":"#/components/schemas/_EmbeddingObject"},"type":"array","title":"Data"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["data","model","usage"],"title":"OpenAIEmbeddingResult"},"OpenAIModelInfo":{"properties":{"data":{"items":{"$ref":"#/components/schemas/ModelInfo"},"type":"array","title":"Data"},"object":{"type":"string","title":"Object","default":"list"}},"type":"object","required":["data"],"title":"OpenAIModelInfo"},"ReRankResult":{"properties":{"object":{"type":"string","enum":["rerank"],"const":"rerank","title":"Object","default":"rerank"},"results":{"items":{"$ref":"#/components/schemas/_ReRankObject"},"type":"array","title":"Results"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["results","model","usage"],"title":"ReRankResult","description":"Following the Cohere protocol for Rerankers."},"RerankInput":{"properties":{"query":{"type":"string","maxLength":122880,"title":"Query"},"documents":{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1,"title":"Documents"},"return_documents":{"type":"boolean","title":"Return Documents","default":false},"raw_scores":{"type":"boolean","title":"Raw Scores","default":false},"model":{"type":"string","title":"Model","default":"default/not-specified"},"top_n":{"anyOf":[{"type":"integer","exclusiveMinimum":0.0},{"type":"null"}],"title":"Top N"}},"type":"object","required":["query","documents"],"title":"RerankInput","description":"Input for reranking"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error 
Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"_ClassifyObject":{"properties":{"score":{"type":"number","title":"Score"},"label":{"type":"string","title":"Label"}},"type":"object","required":["score","label"],"title":"_ClassifyObject"},"_EmbeddingObject":{"properties":{"object":{"type":"string","enum":["embedding"],"const":"embedding","title":"Object","default":"embedding"},"embedding":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"string","format":"binary"},{"items":{"items":{"type":"number"},"type":"array"},"type":"array"}],"title":"Embedding"},"index":{"type":"integer","title":"Index"}},"type":"object","required":["embedding","index"],"title":"_EmbeddingObject"},"_OpenAIEmbeddingInput_Text":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1},{"type":"string","maxLength":122880}],"title":"Input"},"modality":{"type":"string","enum":["text"],"const":"text","title":"Modality","default":"text"}},"type":"object","required":["input"],"title":"_OpenAIEmbeddingInput_Text","description":"helper"},"_ReRankObject":{"properties":{"relevance_score":{"type":"number","title":"Relevance Score"},"index":{"type":"integer","title":"Index"},"document":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Document"}},"type":"object","required":["relevance_score","index"],"title":"_ReRankObject"},"_Usage":{"properties":{"prompt_tokens":{"type":"integer","title":"Prompt Tokens"},"total_tokens":{"type":"integer","title":"Total Tokens"}},"type":"object","required":["prompt_tokens","total_tokens"],"title":"_Usage"}}}} \ No newline at end of file +{"openapi":"3.1.0","info":{"title":"♾️ Infinity - Embedding Inference Server","summary":"Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip. Infinity is developed under MIT License at https://github.com/michaelfeil/infinity.","contact":{"name":"Michael Feil, Raphael Wirth"},"license":{"name":"MIT License","identifier":"MIT"},"version":"0.0.73"},"paths":{"/health":{"get":{"summary":" Health","description":"health check endpoint\n\nReturns:\n dict(unix=float): dict with unix time stamp","operationId":"health","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"additionalProperties":{"type":"number"},"type":"object","title":"Response Health"}}}}}}},"/":{"get":{"summary":"Redirect","operationId":"redirect__get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/models":{"get":{"summary":" Models","description":"get models endpoint","operationId":"models","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIModelInfo"}}}}}}},"/embeddings":{"post":{"summary":" Embeddings","description":"Encode Embeddings. Supports with multimodal inputs. 
Aligned with OpenAI Embeddings API.\n\n## Running Text Embeddings\n```python\nimport requests, base64\nrequests.post(\"http://..:7997/embeddings\",\n json={\"model\":\"openai/clip-vit-base-patch32\",\"input\":[\"Two cute cats.\"]})\n```\n\n## Running Image Embeddings\n```python\nrequests.post(\"http://..:7997/embeddings\",\n json={\n \"model\": \"openai/clip-vit-base-patch32\",\n \"encoding_format\": \"base64\",\n \"input\": [\n \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n # can also be base64 encoded\n ],\n # set extra modality to image to process as image\n \"modality\": \"image\"\n)\n```\n\n## Running Audio Embeddings\n```python\nimport requests, base64\nurl = \"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav\"\n\ndef url_to_base64(url, modality = \"image\"):\n '''small helper to convert url to base64 without server requiring access to the url'''\n response = requests.get(url)\n response.raise_for_status()\n base64_encoded = base64.b64encode(response.content).decode('utf-8')\n mimetype = f\"{modality}/{url.split('.')[-1]}\"\n return f\"data:{mimetype};base64,{base64_encoded}\"\n\nrequests.post(\"http://localhost:7997/embeddings\",\n json={\n \"model\": \"laion/larger_clap_general\",\n \"encoding_format\": \"float\",\n \"input\": [\n url, url_to_base64(url, \"audio\")\n ],\n # set extra modality to audio to process as audio\n \"modality\": \"audio\"\n }\n)\n```\n\n## Running via OpenAI Client\n```python\nfrom openai import OpenAI # pip install openai==1.51.0\nclient = OpenAI(base_url=\"http://localhost:7997/\")\nclient.embeddings.create(\n model=\"laion/larger_clap_general\",\n input=[url_to_base64(url, \"audio\")],\n encoding_format=\"float\",\n extra_body={\n \"modality\": \"audio\"\n }\n)\n\nclient.embeddings.create(\n model=\"laion/larger_clap_general\",\n input=[\"the sound of a beep\", \"the sound of a cat\"],\n encoding_format=\"base64\", # base64: optional high performance setting\n extra_body={\n \"modality\": \"text\"\n }\n)\n```\n\n### Hint: Run all the above models on one server:\n```bash\ninfinity_emb v2 --model-id BAAI/bge-small-en-v1.5 --model-id openai/clip-vit-base-patch32 --model-id laion/larger_clap_general\n```","operationId":"embeddings","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/MultiModalOpenAIEmbedding"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/rerank":{"post":{"summary":" Rerank","description":"Rerank documents. 
Aligned with Cohere API (https://docs.cohere.com/reference/rerank)\n\n```python\nimport requests\nrequests.post(\"http://..:7997/rerank\",\n json={\n \"model\":\"mixedbread-ai/mxbai-rerank-xsmall-v1\",\n \"query\":\"Where is Munich?\",\n \"documents\":[\"Munich is in Germany.\", \"The sky is blue.\"]\n })\n```","operationId":"rerank","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/RerankInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ReRankResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/classify":{"post":{"summary":" Classify","description":"Score or Classify Sentiments\n\n```python\nimport requests\nrequests.post(\"http://..:7997/classify\",\n json={\"model\":\"SamLowe/roberta-base-go_emotions\",\"input\":[\"I am not having a great day.\"]})\n```","operationId":"classify","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClassifyInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClassifyResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/embeddings_image":{"post":{"summary":"Deprecated: Use `embeddings` with `modality` set to `image`","description":"Encode Embeddings from Image files\n\nSupports URLs of Images and Base64-encoded Images\n\n```python\nimport requests\nrequests.post(\"http://..:7997/embeddings_image\",\n json={\n \"model\":\"openai/clip-vit-base-patch32\",\n \"input\": [\n \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n \"data:image/png;base64,iVBORw0KGgoDEMOoSAMPLEoENCODEDIMAGE\"\n ]\n })\n```","operationId":"embeddings_image","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ImageEmbeddingInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"deprecated":true}},"/embeddings_audio":{"post":{"summary":"Deprecated: Use `embeddings` with `modality` set to `audio`","description":"Encode Embeddings from Audio files\n\nSupports URLs of Audios and Base64-encoded Audios\n\n```python\nimport requests\nrequests.post(\"http://..:7997/embeddings_audio\",\n json={\n \"model\":\"laion/larger_clap_general\",\n \"input\": [\n \"https://github.com/michaelfeil/infinity/raw/3b72eb7c14bae06e68ddd07c1f23fe0bf403f220/libs/infinity_emb/tests/data/audio/beep.wav\",\n \"data:audio/wav;base64,iVBORw0KGgoDEMOoSAMPLEoENCODEDAUDIO\"\n ]\n })\n```","operationId":"embeddings_audio","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/AudioEmbeddingInput"}}},"required":true},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/OpenAIEmbeddingResult"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}},"deprecated":true}},"/metrics":{"get":{"summary":"Metrics","description":"Endpoint that serves Prometheus 
metrics.","operationId":"metrics_metrics_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}}},"components":{"schemas":{"AudioEmbeddingInput":{"properties":{"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"}},"type":"object","required":["input"],"title":"AudioEmbeddingInput","description":"LEGACY, DO NO LONGER UPDATE"},"ClassifyInput":{"properties":{"input":{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1,"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"raw_scores":{"type":"boolean","title":"Raw Scores","default":false}},"type":"object","required":["input"],"title":"ClassifyInput"},"ClassifyResult":{"properties":{"object":{"type":"string","enum":["classify"],"const":"classify","title":"Object","default":"classify"},"data":{"items":{"items":{"$ref":"#/components/schemas/_ClassifyObject"},"type":"array"},"type":"array","title":"Data"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["data","model","usage"],"title":"ClassifyResult","description":"Result of 
classification."},"EmbeddingEncodingFormat":{"type":"string","enum":["float","base64"],"title":"EmbeddingEncodingFormat"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ImageEmbeddingInput":{"properties":{"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"}},"type":"object","required":["input"],"title":"ImageEmbeddingInput","description":"LEGACY, DO NO LONGER UPDATE"},"ModelInfo":{"properties":{"id":{"type":"string","title":"Id"},"stats":{"type":"object","title":"Stats"},"object":{"type":"string","enum":["model"],"const":"model","title":"Object","default":"model"},"owned_by":{"type":"string","enum":["infinity"],"const":"infinity","title":"Owned By","default":"infinity"},"created":{"type":"integer","title":"Created"},"backend":{"type":"string","title":"Backend","default":""},"capabilities":{"items":{"type":"string"},"type":"array","uniqueItems":true,"title":"Capabilities","default":[]}},"type":"object","required":["id","stats"],"title":"ModelInfo"},"MultiModalOpenAIEmbedding":{"oneOf":[{"$ref":"#/components/schemas/_OpenAIEmbeddingInput_Text"},{"$ref":"#/components/schemas/OpenAIEmbeddingInput_Audio"},{"$ref":"#/components/schemas/OpenAIEmbeddingInput_Image"}],"title":"MultiModalOpenAIEmbedding"},"OpenAIEmbeddingInput_Audio":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"modality":{"type":"string","enum":["audio"],"const":"audio","title":"Modality","default":"audio"}},"type"
:"object","required":["input"],"title":"OpenAIEmbeddingInput_Audio"},"OpenAIEmbeddingInput_Image":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"anyOf":[{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}]},"type":"array","maxItems":32,"minItems":1},{"type":"string","pattern":"data:(?P[\\w]+\\/[\\w\\-\\+\\.]+)?(?:\\;name\\=(?P[\\w\\.\\-%!*'~\\(\\)]+))?(?:\\;charset\\=(?P[\\w\\-\\+\\.]+))?(?P\\;base64)?,(?P.*)","examples":["data:text/plain;charset=utf-8;base64,VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wZWQgb3ZlciB0aGUgbGF6eSBkb2cu"]},{"type":"string","maxLength":2083,"minLength":1,"format":"uri"}],"title":"Input"},"modality":{"type":"string","enum":["image"],"const":"image","title":"Modality","default":"image"}},"type":"object","required":["input"],"title":"OpenAIEmbeddingInput_Image"},"OpenAIEmbeddingResult":{"properties":{"object":{"type":"string","enum":["list"],"const":"list","title":"Object","default":"list"},"data":{"items":{"$ref":"#/components/schemas/_EmbeddingObject"},"type":"array","title":"Data"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["data","model","usage"],"title":"OpenAIEmbeddingResult"},"OpenAIModelInfo":{"properties":{"data":{"items":{"$ref":"#/components/schemas/ModelInfo"},"type":"array","title":"Data"},"object":{"type":"string","title":"Object","default":"list"}},"type":"object","required":["data"],"title":"OpenAIModelInfo"},"ReRankResult":{"properties":{"object":{"type":"string","enum":["rerank"],"const":"rerank","title":"Object","default":"rerank"},"results":{"items":{"$ref":"#/components/schemas/_ReRankObject"},"type":"array","title":"Results"},"model":{"type":"string","title":"Model"},"usage":{"$ref":"#/components/schemas/_Usage"},"id":{"type":"string","title":"Id"},"created":{"type":"integer","title":"Created"}},"type":"object","required":["results","model","usage"],"title":"ReRankResult","description":"Following the Cohere protocol for Rerankers."},"RerankInput":{"properties":{"query":{"type":"string","maxLength":122880,"title":"Query"},"documents":{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1,"title":"Documents"},"return_documents":{"type":"boolean","title":"Return Documents","default":false},"raw_scores":{"type":"boolean","title":"Raw Scores","default":false},"model":{"type":"string","title":"Model","default":"default/not-specified"},"top_n":{"anyOf":[{"type":"integer","exclusiveMinimum":0.0},{"type":"null"}],"title":"Top N"}},"type":"object","required":["query","documents"],"title":"RerankInput","description":"Input for reranking"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error 
Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"_ClassifyObject":{"properties":{"score":{"type":"number","title":"Score"},"label":{"type":"string","title":"Label"}},"type":"object","required":["score","label"],"title":"_ClassifyObject"},"_EmbeddingObject":{"properties":{"object":{"type":"string","enum":["embedding"],"const":"embedding","title":"Object","default":"embedding"},"embedding":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"string","format":"binary"},{"items":{"items":{"type":"number"},"type":"array"},"type":"array"}],"title":"Embedding"},"index":{"type":"integer","title":"Index"}},"type":"object","required":["embedding","index"],"title":"_EmbeddingObject"},"_OpenAIEmbeddingInput_Text":{"properties":{"model":{"type":"string","title":"Model","default":"default/not-specified"},"encoding_format":{"$ref":"#/components/schemas/EmbeddingEncodingFormat","default":"float"},"user":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"User"},"dimensions":{"type":"integer","title":"Dimensions","default":0},"input":{"anyOf":[{"items":{"type":"string","maxLength":122880},"type":"array","maxItems":2048,"minItems":1},{"type":"string","maxLength":122880}],"title":"Input"},"modality":{"type":"string","enum":["text"],"const":"text","title":"Modality","default":"text"}},"type":"object","required":["input"],"title":"_OpenAIEmbeddingInput_Text","description":"helper"},"_ReRankObject":{"properties":{"relevance_score":{"type":"number","title":"Relevance Score"},"index":{"type":"integer","title":"Index"},"document":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Document"}},"type":"object","required":["relevance_score","index"],"title":"_ReRankObject"},"_Usage":{"properties":{"prompt_tokens":{"type":"integer","title":"Prompt Tokens"},"total_tokens":{"type":"integer","title":"Total Tokens"}},"type":"object","required":["prompt_tokens","total_tokens"],"title":"_Usage"}}}} \ No newline at end of file diff --git a/docs/docs/cli_v2.md b/docs/docs/cli_v2.md index 51d7de54..0bae7fce 100644 --- a/docs/docs/cli_v2.md +++ b/docs/docs/cli_v2.md @@ -11,7 +11,7 @@ $ infinity_emb v2 --help Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil Multiple Model CLI Playbook: - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4` + - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size diff --git a/docs/docs/contribution.md b/docs/docs/contribution.md index acdf4fe9..ad862132 100644 --- a/docs/docs/contribution.md +++ b/docs/docs/contribution.md @@ -10,18 +10,22 @@ cd libs/infinity_emb poetry install --extras all --with test ``` -To ensure your contributions pass the Continuous Integration (CI) checks: +To ensure your contributions pass the Continuous Integration (CI), there are some useful local actions. +The `libs/infinity_emb/Makefile` is a useful entrypoint for this. ```bash cd libs/infinity_emb make format make lint +make template-docker poetry run pytest ./tests ``` -As an alternative, you can also use the following command: + +As an alternative, you can also use the following command, which bundles a range of the above. 
diff --git a/docs/docs/contribution.md b/docs/docs/contribution.md
index acdf4fe9..ad862132 100644
--- a/docs/docs/contribution.md
+++ b/docs/docs/contribution.md
@@ -10,18 +10,22 @@ cd libs/infinity_emb
 poetry install --extras all --with test
 ```
 
-To ensure your contributions pass the Continuous Integration (CI) checks:
+To ensure your contributions pass the Continuous Integration (CI) checks, a few local actions help.
+The `libs/infinity_emb/Makefile` is a convenient entrypoint for them.
 
 ```bash
 cd libs/infinity_emb
 make format
 make lint
+make template-docker
 poetry run pytest ./tests
 ```
 
-As an alternative, you can also use the following command:
+
+As an alternative, you can also use the following command, which bundles several of the steps above.
 ```bash
 cd libs/infinity_emb
 make precommit
 ```
 
 ## CLA
+Infinity is developed as an open-source project. All contributions must be compatible with the MIT License of this repo.
\ No newline at end of file
diff --git a/docs/docs/index.md b/docs/docs/index.md
index 1e2a0bb3..e69de29b 100644
--- a/docs/docs/index.md
+++ b/docs/docs/index.md
@@ -1,108 +0,0 @@
-# [Infinity](https://github.com/michaelfeil/infinity)
-
-Infinity is a high-throughput, low-latency REST API for serving vector embeddings, supporting all sentence-transformer models and frameworks. Infinity is developed under [MIT License](https://github.com/michaelfeil/infinity/blob/main/LICENSE). Infinity powers inference behind [Gradient.ai](https://gradient.ai) and other Embedding API providers.
-
-## Why Infinity
-
-Infinity provides the following features:
-
-* **Deploy any model from MTEB**: deploy the model you know from [SentenceTransformers](https://github.com/UKPLab/sentence-transformers/)
-* **Fast inference backends**: The inference server is built on top of [torch](https://github.com/pytorch/pytorch), [optimum(onnx/tensorrt)](https://huggingface.co/docs/optimum/index) and [CTranslate2](https://github.com/OpenNMT/CTranslate2), using FlashAttention to get the most out of **CUDA**, **ROCM**, **CPU** or **MPS** device.
-* **Dynamic batching**: New embedding requests are queued while GPU is busy with the previous ones. New requests are squeezed intro your device as soon as ready. Similar max throughput on GPU as text-embeddings-inference.
-* **Correct and tested implementation**: Unit and end-to-end tested. Embeddings via infinity are identical to [SentenceTransformers](https://github.com/UKPLab/sentence-transformers/) (up to numerical precision). Lets API users create embeddings till infinity and beyond.
-* **Easy to use**: The API is built on top of [FastAPI](https://fastapi.tiangolo.com/), [Swagger](https://swagger.io/) makes it fully documented. API are aligned to [OpenAI's Embedding specs](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings). See below on how to get started.
-
-## Getting started
-
-Install `infinity_emb` via pip
-```bash
-pip install infinity-emb[all]
-```
-
-
- Install from source with Poetry - - Advanced: - To install via Poetry use Poetry 1.8.4, Python 3.11 on Ubuntu 22.04 - ```bash - git clone https://github.com/michaelfeil/infinity - cd infinity - cd libs/infinity_emb - poetry install --extras all - ``` -
- -### Launch the CLI using a pre-built docker container (recommended) - -```bash -port=7997 -model1=michaelfeil/bge-small-en-v1.5 -model2=mixedbread-ai/mxbai-rerank-xsmall-v1 -volume=$PWD/data - -docker run -it --gpus all \ - -v $volume:/app/.cache \ - -p $port:$port \ - michaelf34/infinity:latest \ - v2 \ - --model-id $model1 \ - --model-id $model2 \ - --port $port -``` -The cache path inside the docker container is set by the environment variable `HF_HOME`. - -### or launch the cli after the pip install -After your pip install, with your venv activate, you can run the CLI directly. -Check the `--help` command to get a description for all parameters. - -```bash -infinity_emb --help -``` - -## Launch FAQ -
- What are embedding models? - Embedding models can map any text to a low-dimensional dense vector which can be used for tasks like retrieval, classification, clustering, or semantic search. - And it also can be used in vector databases for LLMs. - - - The most know architecture are encoder-only transformers such as BERT, and most popular implementation include [SentenceTransformers](https://github.com/UKPLab/sentence-transformers/). -
- -
- What models are supported? - - All models of the sentence transformers org are supported https://huggingface.co/sentence-transformers / sbert.net. - LLM's like LLAMA2-7B are not intended for deployment. - - - With the command `--engine torch` the model must be compatible with https://github.com/UKPLab/sentence-transformers/. - - only models from Huggingface are supported. - - - With the command `--engine ctranslate2` - - only `BERT` models are supported. - - only models from Huggingface are supported. - - - For the latest trends, you might want to check out one of the following models. - https://huggingface.co/spaces/mteb/leaderboard - -
- - -
- Using Langchain with Infinity - Now available under # Python Integrations in the side panel. - ``` -
- - -
- Question not answered here? - - There is a Discussion section on the Github of Infinity: - https://github.com/michaelfeil/infinity/discussions - -
- diff --git a/libs/infinity_emb/infinity_emb/cli.py b/libs/infinity_emb/infinity_emb/cli.py new file mode 100644 index 00000000..f9533866 --- /dev/null +++ b/libs/infinity_emb/infinity_emb/cli.py @@ -0,0 +1,402 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2023-now michaelfeil + +import asyncio +import re +import sys + + +import infinity_emb +from infinity_emb._optional_imports import CHECK_TYPER, CHECK_UVICORN +from infinity_emb.args import EngineArgs +from infinity_emb.env import MANAGER +from infinity_emb.log_handler import UVICORN_LOG_LEVELS, logger +from infinity_emb.primitives import ( + Device, + DeviceID, + Dtype, + EmbeddingDtype, + InferenceEngine, + PoolingMethod, +) +from infinity_emb.infinity_server import create_server + + +# helper functions for the CLI + + +def validate_url(path: str): + """ + This regex matches: + - An empty string or A single '/' + - A string that starts with '/' and does not end with '/' + """ + if re.match(r"^$|^/$|^/.*[^/]$", path): + return path + raise typer.BadParameter("Path must start with '/' and must not end with '/'") + + +class AutoPadding: + """itertools.cycle with custom behaviour to pad to max length""" + + def __init__(self, length: int, **kwargs): + self.length = length + self.kwargs = kwargs + + def _resolve(self, x, iteration: int): + """pad x to length of self.length""" + x = typer_option_resolve(x) + if not isinstance(x, (list, tuple)): + return x + elif len(x) == 1: + return x[0] + elif len(x) == self.length: + return x[iteration] + else: + raise ValueError(f"Expected length {self.length} but got {len(x)}") + + def __iter__(self): + """iterate over kwargs and pad them to length of self.length""" + for iteration in range(self.length): + kwargs = {} + for key, value in self.kwargs.items(): + kwargs[key] = self._resolve(value, iteration) + yield kwargs + + +def typer_option_resolve(*args): + """returns the value or the default value""" + if len(args) == 1: + return ( + args[0].default # if it is a typer option + if hasattr(args[0], "default") and hasattr(args[0], "envvar") + else args[0] # if it is a normal value + ) + return (a.default if (hasattr(a, "default") and hasattr(a, "envvar")) else a for a in args) + + +def _construct(name: str): + """constructs the default entry and type hint for the variable name""" + return dict( + # gets the default value from the ENV Manager + default=getattr(MANAGER, name), + # envvar is a dummy that is there for documentation purposes. + envvar=f"`{MANAGER.to_name(name)}`", + ) + + +# CLI +if CHECK_TYPER.is_available: + CHECK_TYPER.mark_required() + CHECK_UVICORN.mark_required() + import typer + import uvicorn + + # patch the asyncio scheduler with uvloop + # which has theoretical speed-ups vs asyncio + loopname = "auto" + if sys.version_info < (3, 12): + try: + import uvloop + + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + loopname = "uvloop" + except ImportError: + # Windows does not support uvloop + pass + + tp = typer.Typer() + + @tp.command("v1") + def v1( + # v1 is deprecated. Please do no longer modify it. 
+        model_name_or_path: str = MANAGER.model_id[0],
+        served_model_name: str = MANAGER.served_model_name[0],
+        batch_size: int = MANAGER.batch_size[0],
+        revision: str = MANAGER.revision[0],
+        trust_remote_code: bool = MANAGER.trust_remote_code[0],
+        redirect_slash: str = MANAGER.redirect_slash,
+        engine: "InferenceEngine" = MANAGER.engine[0],  # type: ignore # noqa
+        model_warmup: bool = MANAGER.model_warmup[0],
+        vector_disk_cache: bool = MANAGER.vector_disk_cache[0],
+        device: "Device" = MANAGER.device[0],  # type: ignore
+        lengths_via_tokenize: bool = MANAGER.lengths_via_tokenize[0],
+        dtype: Dtype = MANAGER.dtype[0],  # type: ignore
+        embedding_dtype: "EmbeddingDtype" = EmbeddingDtype.default_value(),  # type: ignore
+        pooling_method: "PoolingMethod" = MANAGER.pooling_method[0],  # type: ignore
+        compile: bool = MANAGER.compile[0],
+        bettertransformer: bool = MANAGER.bettertransformer[0],
+        preload_only: bool = MANAGER.preload_only,
+        permissive_cors: bool = MANAGER.permissive_cors,
+        api_key: str = MANAGER.api_key,
+        url_prefix: str = MANAGER.url_prefix,
+        host: str = MANAGER.host,
+        port: int = MANAGER.port,
+        log_level: "UVICORN_LOG_LEVELS" = MANAGER.log_level,  # type: ignore
+    ):
+        """Infinity API ♾️ cli v1 - deprecated, consider using cli v2 via `infinity_emb v2`."""
+        if api_key:
+            # encourage switch to v2
+            raise ValueError("api_key is not supported in `v1`. Please migrate to `v2`.")
+        if not (
+            embedding_dtype == EmbeddingDtype.float32
+            or embedding_dtype == EmbeddingDtype.default_value()
+        ):
+            # encourage switch to v2
+            raise ValueError(
+                "selecting embedding_dtype is not supported in `v1`. Please migrate to `v2`."
+            )
+        logger.warning(
+            "CLI v1 is deprecated. Consider using CLI `v2`, by specifying `v2` as the command."
+        )
+        v2(
+            model_id=[model_name_or_path],
+            served_model_name=[served_model_name],  # type: ignore
+            batch_size=[batch_size],
+            revision=[revision],  # type: ignore
+            trust_remote_code=[trust_remote_code],
+            engine=[engine],
+            dtype=[dtype],
+            pooling_method=[pooling_method],
+            device=[device],
+            model_warmup=[model_warmup],
+            vector_disk_cache=[vector_disk_cache],
+            lengths_via_tokenize=[lengths_via_tokenize],
+            compile=[compile],
+            bettertransformer=[bettertransformer],
+            embedding_dtype=[EmbeddingDtype.float32],  # set to float32
+            # unique kwargs
+            preload_only=preload_only,
+            url_prefix=url_prefix,
+            host=host,
+            port=port,
+            redirect_slash=redirect_slash,
+            log_level=log_level,
+            permissive_cors=permissive_cors,
+            api_key=api_key,
+            proxy_root_path="",  # set as empty string
+        )
+
+    @tp.command("v2")
+    def v2(
+        # arguments for engine
+        model_id: list[str] = typer.Option(
+            **_construct("model_id"),
+            help="Huggingface model repo id. Subset of possible models: https://huggingface.co/models?other=text-embeddings-inference&",
+        ),
+        served_model_name: list[str] = typer.Option(
+            **_construct("served_model_name"),
+            help="the nickname for the API, under which the model_id can be selected",
+        ),
+        batch_size: list[int] = typer.Option(
+            **_construct("batch_size"), help="maximum batch size for inference"
+        ),
+        revision: list[str] = typer.Option(
+            **_construct("revision"), help="huggingface model repo revision."
+        ),
+        trust_remote_code: list[bool] = typer.Option(
+            **_construct("trust_remote_code"),
+            help="whether remote modeling code from the huggingface repo is trusted.",
+        ),
+        engine: list[InferenceEngine] = typer.Option(
+            **_construct("engine"),
+            help="Which backend to use. `torch` uses PyTorch GPU/CPU, `optimum` uses ONNX on GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses torch+ctranslate2 on CPU/GPU.",
+        ),
+        model_warmup: list[bool] = typer.Option(
+            **_construct("model_warmup"),
+            help="if the model should be warmed up after startup, and before becoming ready.",
+        ),
+        vector_disk_cache: list[bool] = typer.Option(
+            **_construct("vector_disk_cache"),
+            help="If hash(request)/results should be cached to SQLite for latency improvement.",
+        ),
+        device: list[Device] = typer.Option(
+            **_construct("device"),
+            help="device to use for computing the model forward pass.",
+        ),
+        device_id: list[str] = typer.Option(
+            **_construct("device_id"),
+            help="device id defines the model placement. e.g. `0,1` will place the model on MPS/CUDA/GPU 0 and 1 each",
+        ),
+        lengths_via_tokenize: list[bool] = typer.Option(
+            **_construct("lengths_via_tokenize"),
+            help="if True, the returned token count is based on the actual tokenizer. If False, uses len(input) as a proxy.",
+        ),
+        dtype: list[Dtype] = typer.Option(
+            **_construct("dtype"), help="dtype for the model weights."
+        ),
+        embedding_dtype: list[EmbeddingDtype] = typer.Option(
+            **_construct("embedding_dtype"),
+            help="dtype applied after the forward pass. If != `float32`, uses post-forward static quantization.",
+        ),
+        pooling_method: list[PoolingMethod] = typer.Option(
+            **_construct("pooling_method"),
+            help="overwrite the pooling method if inferred incorrectly.",
+        ),
+        compile: list[bool] = typer.Option(
+            **_construct("compile"),
+            help="Enable usage of `torch.compile(dynamic=True)` if engine relies on it.",
+        ),
+        bettertransformer: list[bool] = typer.Option(
+            **_construct("bettertransformer"),
+            help="Enables varlen flash-attention-2 via the `BetterTransformer` implementation, if available for this model.",
+        ),
+        # arguments for uvicorn / server
+        preload_only: bool = typer.Option(
+            **_construct("preload_only"),
+            help="If true, only downloads models and verifies setup, then exits. Recommended for pre-caching the download in a Dockerfile.",
+        ),
+        host: str = typer.Option(**_construct("host"), help="host for the FastAPI uvicorn server"),
+        port: int = typer.Option(**_construct("port"), help="port for the FastAPI uvicorn server"),
+        url_prefix: str = typer.Option(
+            **_construct("url_prefix"),
+            callback=validate_url,
+            help="prefix for all routes of the FastAPI uvicorn server. Useful if you run behind a proxy / cascaded API.",
+        ),
+        redirect_slash: str = typer.Option(
+            **_construct("redirect_slash"), help="where to redirect `/` requests to."
+        ),
+        log_level: "UVICORN_LOG_LEVELS" = typer.Option(
+            **_construct("log_level"), help="console log level."
+        ),  # type: ignore
+        permissive_cors: bool = typer.Option(
+            **_construct("permissive_cors"), help="whether to allow permissive CORS."
+        ),
+        api_key: str = typer.Option(
+            **_construct("api_key"), help="api_key used for authentication headers."
+        ),
+        proxy_root_path: str = typer.Option(
+            **_construct("proxy_root_path"),
+            help="Proxy prefix for the application. See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/",
+        ),
+    ):
+        """Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil \n
+        \n
+        Multiple Model CLI Playbook: \n
+        - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8 --batch-size 4` \n
+        - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" \n
+        - 3. single items are broadcasted to `--model-id` length, e.g. `v2 --model-id model/id1 --model-id model/id2 --batch-size 8` gives both models batch-size 8. \n
+        """
+        # old
+        """
+        model_id, list[str]: Huggingface model, e.g.
+            ["michaelfeil/bge-small-en-v1.5", "mixedbread-ai/mxbai-embed-large-v1"]
+            Defaults to `INFINITY_MODEL_ID`
+        served_model_name, list[str]: "", e.g. ["bge-small-en-v1.5"]
+        batch_size, list[int]: batch size for forward pass.
+        revision: list[str]: revision of the model.
+        trust_remote_code, list[bool]: trust remote code.
+        url_prefix, str: prefix for api. typically "".
+        host, str: host-url, typically either "0.0.0.0" or "127.0.0.1".
+        port, int: port that you want to expose.
+        redirect_slash, str: redirect target of GET "/". Defaults to "/docs". Empty string to disable.
+        log_level: logging level.
+            For high performance, use "info" or higher levels. Defaults to "info".
+        engine, str: framework that should perform inference.
+        model_warmup, bool: perform model warmup before starting the server.
+            Defaults to True.
+        vector_disk_cache, bool: cache past embeddings in SQL.
+            Defaults to False or env-INFINITY_CACHE_VECTORS if set
+        device, Device: device to use for inference. Defaults to Device.auto or "auto"
+        lengths_via_tokenize: bool: schedule by token usage. Defaults to False.
+        dtype, Dtype: data type to use for inference. Defaults to Dtype.auto or "auto"
+        embedding_dtype, EmbeddingDtype: data type to use for embeddings. Defaults to EmbeddingDtype.float32 or "float32"
+        pooling_method, PoolingMethod: pooling method to use. Defaults to PoolingMethod.auto or "auto"
+        compile, bool: compile model for faster inference. Defaults to False.
+        use_bettertransformer, bool: use bettertransformer. Defaults to True.
+        preload_only, bool: only preload the model and exit. Defaults to False.
+        permissive_cors, bool: add permissive CORS headers to enable consumption from a browser. Defaults to False.
+        api_key, str: optional Bearer token for authentication. Defaults to "", which disables authentication.
+        proxy_root_path, str: optional Proxy prefix for the application. See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/
+        """
+        logger.setLevel(log_level.to_int())
+        device_id_typed = [DeviceID(d) for d in typer_option_resolve(device_id)]
+        padder = AutoPadding(
+            length=len(model_id),
+            model_name_or_path=model_id,
+            batch_size=batch_size,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            engine=engine,
+            model_warmup=model_warmup,
+            vector_disk_cache_path=vector_disk_cache,
+            device=device,
+            device_id=device_id_typed,
+            lengths_via_tokenize=lengths_via_tokenize,
+            dtype=dtype,
+            embedding_dtype=embedding_dtype,
+            pooling_method=pooling_method,
+            compile=compile,
+            bettertransformer=bettertransformer,
+            served_model_name=served_model_name,
+        )
+
+        engine_args = []
+        for kwargs in padder:
+            engine_args.append(EngineArgs(**kwargs))
+
+        (
+            url_prefix,
+            host,
+            port,
+            redirect_slash,
+            log_level,
+            preload_only,
+            permissive_cors,
+            api_key,
+            proxy_root_path,
+        ) = typer_option_resolve(
+            url_prefix,
+            host,
+            port,
+            redirect_slash,
+            log_level,
+            preload_only,
+            permissive_cors,
+            api_key,
+            proxy_root_path,
+        )
+
+        app = create_server(
+            engine_args_list=engine_args,
+            url_prefix=url_prefix,
+            doc_extra=dict(host=host, port=port),
+            redirect_slash=redirect_slash,
+            preload_only=preload_only,
+            permissive_cors=permissive_cors,
+            api_key=api_key,
+            proxy_root_path=proxy_root_path,
+        )
+
+        uvicorn.run(
+            app,
+            host=host,
+            port=port,
+            log_level=log_level.name,
+            http="httptools",
+            loop=loopname,  # type: ignore
+        )
+
+
+def cli():
+    CHECK_TYPER.mark_required()
+    if len(sys.argv) == 1 or sys.argv[1] not in [
+        "v1",
+        "v2",
+        "help",
+        "--help",
+        "--show-completion",
+        "--install-completion",
+    ]:
+        logger.critical(
+            "Error: No command given. Please use infinity with the `v2` command. "
+            f"This is deprecated since 0.0.32. You are on {infinity_emb.__version__}. "
+            "Usage: `infinity_emb v2 --model-id BAAI/bge-large-en-v1.5`"
+        )
+    tp()
+
+
+if __name__ == "__main__":
+    if "cli" in locals():
+        cli()
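
For readers who want the same multi-model setup without the CLI: the `v2` command above boils down to building one `EngineArgs` per padded kwargs-set and handing the list to `create_server`. A rough sketch of that flow, assuming the imports from this diff (only a subset of the padder's fields is shown; values are illustrative):

```python
# Sketch: programmatic equivalent of the AutoPadding loop in `v2` above.
# Only a subset of EngineArgs fields is shown; values are illustrative.
from infinity_emb.args import EngineArgs

model_ids = ["model/id1", "model/id2"]
batch_sizes = [8, 4]  # per-model batch sizes, as in the playbook

engine_args = [
    EngineArgs(
        model_name_or_path=model_id,
        batch_size=batch_size,
        served_model_name=model_id,  # nickname under which the API selects the model
    )
    for model_id, batch_size in zip(model_ids, batch_sizes)
]
# create_server(engine_args_list=engine_args, ...) then serves both models,
# routing each request to the entry whose served_model_name matches its `model` field.
```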
+ logger.setLevel(log_level.to_int()) + device_id_typed = [DeviceID(d) for d in typer_option_resolve(device_id)] + padder = AutoPadding( + length=len(model_id), + model_name_or_path=model_id, + batch_size=batch_size, + revision=revision, + trust_remote_code=trust_remote_code, + engine=engine, + model_warmup=model_warmup, + vector_disk_cache_path=vector_disk_cache, + device=device, + device_id=device_id_typed, + lengths_via_tokenize=lengths_via_tokenize, + dtype=dtype, + embedding_dtype=embedding_dtype, + pooling_method=pooling_method, + compile=compile, + bettertransformer=bettertransformer, + served_model_name=served_model_name, + ) + + engine_args = [] + for kwargs in padder: + engine_args.append(EngineArgs(**kwargs)) + + ( + url_prefix, + host, + port, + redirect_slash, + log_level, + preload_only, + permissive_cors, + api_key, + proxy_root_path, + ) = typer_option_resolve( + url_prefix, + host, + port, + redirect_slash, + log_level, + preload_only, + permissive_cors, + api_key, + proxy_root_path, + ) + + app = create_server( + engine_args_list=engine_args, + url_prefix=url_prefix, + doc_extra=dict(host=host, port=port), + redirect_slash=redirect_slash, + preload_only=preload_only, + permissive_cors=permissive_cors, + api_key=api_key, + proxy_root_path=proxy_root_path, + ) + + uvicorn.run( + app, + host=host, + port=port, + log_level=log_level.name, + http="httptools", + loop=loopname, # type: ignore + ) + + +def cli(): + CHECK_TYPER.mark_required() + if len(sys.argv) == 1 or sys.argv[1] not in [ + "v1", + "v2", + "help", + "--help", + "--show-completion", + "--install-completion", + ]: + logger.critical( + "Error: No command given. Please use infinity with the `v2` command. " + f"Calling without a command is deprecated since 0.0.32. You are on {infinity_emb.__version__}. " + "Usage: `infinity_emb v2 --model-id BAAI/bge-large-en-v1.5`" + ) + tp() + + +if __name__ == "__main__": + if "cli" in locals(): + cli() diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py index 1bcf14eb..740feb95 100644 --- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py +++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py @@ -375,13 +375,12 @@ async def _get_prios_usage(self, items: Sequence[AbstractSingle]) -> tuple[list[ def _publish_towards_model( self, - # shutdown: ShutdownReadOnly, - # queue_prio: "CustomFIFOQueue", - # publish_to_model_queue: Queue, - # max_batch_size: int, - # verbose: bool ): - """background thread for reading exits only if shutdown.is_set()""" + """worker that moves batches from the priority_queue towards the model. + Runs in a separate thread, returns when self._shutdown.is_set(). + """ + # max_n_batches: how many queued batches trigger switching to `max-throughput` mode; + # in throughput mode, read up to the last n batches at once max_n_batches = 8 try: while not self._shutdown.is_set(): diff --git a/libs/infinity_emb/infinity_emb/inference/caching_layer.py b/libs/infinity_emb/infinity_emb/inference/caching_layer.py index 0d48814f..4a5628f5 100644 --- a/libs/infinity_emb/infinity_emb/inference/caching_layer.py +++ b/libs/infinity_emb/infinity_emb/inference/caching_layer.py @@ -22,7 +22,10 @@ class Cache: - """wrapper around DiskCache""" + """wrapper around DiskCache. The Diskcache in infinity `races` against the model inference. 
+ + The concept is that the cache lookup and the model forward pass are started concurrently; whichever finishes first supplies the result, so repeated requests can skip waiting for inference. + """ def __init__(self, cache_name: str, shutdown: threading.Event) -> None: """ diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py index 74e085ca..43488012 100644 --- a/libs/infinity_emb/infinity_emb/infinity_server.py +++ b/libs/infinity_emb/infinity_emb/infinity_server.py @@ -3,9 +3,7 @@ import asyncio import os -import re import signal -import sys import time import threading import uuid @@ -13,25 +11,18 @@ from typing import Any, Optional, Union, TYPE_CHECKING import infinity_emb -from infinity_emb._optional_imports import CHECK_TYPER, CHECK_UVICORN from infinity_emb.args import EngineArgs from infinity_emb.engine import AsyncEmbeddingEngine, AsyncEngineArray from infinity_emb.env import MANAGER from infinity_emb.fastapi_schemas import docs, errors -from infinity_emb.log_handler import UVICORN_LOG_LEVELS, logger +from infinity_emb.log_handler import logger from infinity_emb.primitives import ( AudioCorruption, - Device, - DeviceID, - Dtype, - EmbeddingDtype, ImageCorruption, - InferenceEngine, Modality, ModelCapabilites, MatryoshkaDimError, ModelNotDeployedError, - PoolingMethod, ) from infinity_emb.telemetry import PostHog, StartupTelemetry, telemetry_log_info @@ -617,369 +608,3 @@ async def _embeddings_audio(data: AudioEmbeddingInput): ) return app - - -class AutoPadding: - """itertools.cycle with custom behaviour""" - - def __init__(self, length: int, **kwargs): - self.length = length - self.kwargs = kwargs - - def _resolve(self, x, iteration: int): - """pad x to length of self.length""" - x = typer_option_resolve(x) - if not isinstance(x, (list, tuple)): - return x - elif len(x) == 1: - return x[0] - elif len(x) == self.length: - return x[iteration] - else: - raise ValueError(f"Expected length {self.length} but got {len(x)}") - - def __iter__(self): - """iterate over kwargs and pad them to length of self.length""" - for iteration in range(self.length): - kwargs = {} - for key, value in self.kwargs.items(): - kwargs[key] = self._resolve(value, iteration) - yield kwargs - - -def typer_option_resolve(*args): - """returns the value or the default value""" - if len(args) == 1: - return ( - args[0].default # if it is a typer option - if hasattr(args[0], "default") and hasattr(args[0], "envvar") - else args[0] # if it is a normal value - ) - return (a.default if (hasattr(a, "default") and hasattr(a, "envvar")) else a for a in args) - - -# CLI -if CHECK_TYPER.is_available: - CHECK_TYPER.mark_required() - CHECK_UVICORN.mark_required() - import typer - import uvicorn - - loopname = "auto" - if sys.version_info < (3, 12): - try: - import uvloop - - asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) - loopname = "uvloop" - except ImportError: - # Windows does not support uvloop - pass - - tp = typer.Typer() - - @tp.command("v1") - def v1( - # v1 is deprecated. Please do no longer modify it. 
- model_name_or_path: str = MANAGER.model_id[0], - served_model_name: str = MANAGER.served_model_name[0], - batch_size: int = MANAGER.batch_size[0], - revision: str = MANAGER.revision[0], - trust_remote_code: bool = MANAGER.trust_remote_code[0], - redirect_slash: str = MANAGER.redirect_slash, - engine: InferenceEngine = MANAGER.engine[0], # type: ignore # noqa - model_warmup: bool = MANAGER.model_warmup[0], - vector_disk_cache: bool = MANAGER.vector_disk_cache[0], - device: Device = MANAGER.device[0], # type: ignore - lengths_via_tokenize: bool = MANAGER.lengths_via_tokenize[0], - dtype: Dtype = MANAGER.dtype[0], # type: ignore - embedding_dtype: EmbeddingDtype = EmbeddingDtype.default_value(), # type: ignore - pooling_method: PoolingMethod = MANAGER.pooling_method[0], # type: ignore - compile: bool = MANAGER.compile[0], - bettertransformer: bool = MANAGER.bettertransformer[0], - preload_only: bool = MANAGER.preload_only, - permissive_cors: bool = MANAGER.permissive_cors, - api_key: str = MANAGER.api_key, - url_prefix: str = MANAGER.url_prefix, - host: str = MANAGER.host, - port: int = MANAGER.port, - log_level: UVICORN_LOG_LEVELS = MANAGER.log_level, # type: ignore - ): - """Infinity API ♾️ cli v1 - deprecated, consider use cli v2 via `infinity_emb v2`.""" - if api_key: - raise ValueError("api_key is not supported in `v1`. Please migrate to `v2`.") - if not ( - embedding_dtype == EmbeddingDtype.float32 - or embedding_dtype == EmbeddingDtype.default_value() - ): - raise ValueError( - "selecting embedding_dtype is not supported in `v1`. Please migrate to `v2`." - ) - logger.warning( - "CLI v1 is deprecated. Consider use CLI `v2`, by specifying `v2` as the command." - ) - time.sleep(1) - v2( - model_id=[model_name_or_path], - served_model_name=[served_model_name], # type: ignore - batch_size=[batch_size], - revision=[revision], # type: ignore - trust_remote_code=[trust_remote_code], - engine=[engine], - dtype=[dtype], - pooling_method=[pooling_method], - device=[device], - model_warmup=[model_warmup], - vector_disk_cache=[vector_disk_cache], - lengths_via_tokenize=[lengths_via_tokenize], - compile=[compile], - bettertransformer=[bettertransformer], - embedding_dtype=[EmbeddingDtype.float32], # set to float32 - # unique kwargs - preload_only=preload_only, - url_prefix=url_prefix, - host=host, - port=port, - redirect_slash=redirect_slash, - log_level=log_level, - permissive_cors=permissive_cors, - api_key=api_key, - proxy_root_path="", # set as empty string - ) - - def _construct(name: str): - """constructs the default entry and type hint for the variable name""" - return dict( - # gets the default value from the ENV Manager - default=getattr(MANAGER, name), - # envvar is a dummy that is there for documentation purposes. - envvar=f"`{MANAGER.to_name(name)}`", - ) - - def validate_url(path: str): - """ - This regex matches: - - An empty string or A single '/' - - A string that starts with '/' and does not end with '/' - """ - if re.match(r"^$|^/$|^/.*[^/]$", path): - return path - raise typer.BadParameter("Path must start with '/' and must not end with '/'") - - @tp.command("v2") - def v2( - # t - # arguments for engine - model_id: list[str] = typer.Option( - **_construct("model_id"), - help="Huggingface model repo id. 
Subset of possible models: https://huggingface.co/models?other=text-embeddings-inference&", - ), - served_model_name: list[str] = typer.Option( - **_construct("served_model_name"), - help="the nickname for the API, under which the model_id can be selected", - ), - batch_size: list[int] = typer.Option( - **_construct("batch_size"), help="maximum batch size for inference" - ), - revision: list[str] = typer.Option( - **_construct("revision"), help="huggingface model repo revision." - ), - trust_remote_code: list[bool] = typer.Option( - **_construct("trust_remote_code"), - help="if potential remote modeling code from huggingface repo is trusted.", - ), - engine: list[InferenceEngine] = typer.Option( - **_construct("engine"), - help="Which backend to use. `torch` uses Pytorch GPU/CPU, optimum uses ONNX on GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses torch+ctranslate2 on CPU/GPU.", - ), - model_warmup: list[bool] = typer.Option( - **_construct("model_warmup"), - help="if model should be warmed up after startup, and before ready.", - ), - vector_disk_cache: list[bool] = typer.Option( - **_construct("vector_disk_cache"), - help="If hash(request)/results should be cached to SQLite for latency improvement.", - ), - device: list[Device] = typer.Option( - **_construct("device"), - help="device to use for computing the model forward pass.", - ), - device_id: list[str] = typer.Option( - **_construct("device_id"), - help="device id defines the model placement. e.g. `0,1` will place the model on MPS/CUDA/GPU 0 and 1 each", - ), - lengths_via_tokenize: list[bool] = typer.Option( - **_construct("lengths_via_tokenize"), - help="if True, returned tokens is based on actual tokenizer count. If false, uses len(input) as proxy.", - ), - dtype: list[Dtype] = typer.Option( - **_construct("dtype"), help="dtype for the model weights." - ), - embedding_dtype: list[EmbeddingDtype] = typer.Option( - **_construct("embedding_dtype"), - help="dtype post-forward pass. If != `float32`, using Post-Forward Static quantization.", - ), - pooling_method: list[PoolingMethod] = typer.Option( - **_construct("pooling_method"), - help="overwrite the pooling method if inferred incorrectly.", - ), - compile: list[bool] = typer.Option( - **_construct("compile"), - help="Enable usage of `torch.compile(dynamic=True)` if engine relies on it.", - ), - bettertransformer: list[bool] = typer.Option( - **_construct("bettertransformer"), - help="Enables varlen flash-attention-2 via the `BetterTransformer` implementation. If available for this model.", - ), - # arguments for uvicorn / server - preload_only: bool = typer.Option( - **_construct("preload_only"), - help="If true, only downloads models and verifies setup, then exit. Recommended for pre-caching the download in a Dockerfile.", - ), - host: str = typer.Option(**_construct("host"), help="host for the FastAPI uvicorn server"), - port: int = typer.Option(**_construct("port"), help="port for the FastAPI uvicorn server"), - url_prefix: str = typer.Option( - **_construct("url_prefix"), - callback=validate_url, - help="prefix for all routes of the FastAPI uvicorn server. Useful if you run behind a proxy / cascaded API.", - ), - redirect_slash: str = typer.Option( - **_construct("redirect_slash"), help="where to redirect `/` requests to." - ), - log_level: UVICORN_LOG_LEVELS = typer.Option( - **_construct("log_level"), help="console log level." - ), # type: ignore - permissive_cors: bool = typer.Option( - **_construct("permissive_cors"), help="whether to allow permissive cors." 
- ), - api_key: str = typer.Option( - **_construct("api_key"), help="api_key used for authentication headers." - ), - proxy_root_path: str = typer.Option( - **_construct("proxy_root_path"), - help="Proxy prefix for the application. See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/", - ), - ): - """Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil \n - \n - Multiple Model CLI Playbook: \n - - 1. cli options can be overloaded i.e. `v2 --model-id model/id1 --model-id/id2 --batch-size 8 --batch-size 4` \n - - 2. or adapt the defaults by setting ENV Variables separated by `;`: INFINITY_MODEL_ID="model/id1;model/id2;" && INFINITY_BATCH_SIZE="8;4;" \n - - 3. single items are broadcasted to `--model-id` length, making `v2 --model-id model/id1 --model-id/id2 --batch-size 8` both models have batch-size 8. \n - """ - # old - """ - model_id, list[str]: Huggingface model, e.g. - ["michaelfeil/bge-small-en-v1.5", "mixedbread-ai/mxbai-embed-large-v1"] - Defaults to `INFINITY_MODEL_ID` - served_model_name, list[str]: "", e.g. ["bge-small-en-v1.5"] - batch_size, list[int]: batch size for forward pass. - revision: list[str]: revision of the model. - trust_remote_code, list[bool]: trust remote code. - url_prefix, str: prefix for api. typically "". - host, str: host-url, typically either "0.0.0.0" or "127.0.0.1". - port, int: port that you want to expose. - redirect_slash, str: redirect to of GET "/". Defaults to "/docs". Empty string to disable. - log_level: logging level. - For high performance, use "info" or higher levels. Defaults to "info". - engine, str: framework that should perform inference. - model_warmup, bool: perform model warmup before starting the server. - Defaults to True. - vector_disk_cache, bool: cache past embeddings in SQL. - Defaults to False or env-INFINITY_CACHE_VECTORS if set - device, Device: device to use for inference. Defaults to Device.auto or "auto" - lengths_via_tokenize: bool: schedule by token usage. Defaults to False. - dtype, Dtype: data type to use for inference. Defaults to Dtype.auto or "auto" - embedding_dtype, EmbeddingDtype: data type to use for embeddings. Defaults to EmbeddingDtype.float32 or "float32" - pooling_method, PoolingMethod: pooling method to use. Defaults to PoolingMethod.auto or "auto" - compile, bool: compile model for faster inference. Defaults to False. - use_bettertransformer, bool: use bettertransformer. Defaults to True. - preload_only, bool: only preload the model and exit. Defaults to False. - permissive_cors, bool: add permissive CORS headers to enable consumption from a browser. Defaults to False. - api_key, str: optional Bearer token for authentication. Defaults to "", which disables authentication. - proxy_root_path, str: optional Proxy prefix for the application. 
See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/ - """ - logger.setLevel(log_level.to_int()) - device_id_typed = [DeviceID(d) for d in typer_option_resolve(device_id)] - padder = AutoPadding( - length=len(model_id), - model_name_or_path=model_id, - batch_size=batch_size, - revision=revision, - trust_remote_code=trust_remote_code, - engine=engine, - model_warmup=model_warmup, - vector_disk_cache_path=vector_disk_cache, - device=device, - device_id=device_id_typed, - lengths_via_tokenize=lengths_via_tokenize, - dtype=dtype, - embedding_dtype=embedding_dtype, - pooling_method=pooling_method, - compile=compile, - bettertransformer=bettertransformer, - served_model_name=served_model_name, - ) - - engine_args = [] - for kwargs in padder: - engine_args.append(EngineArgs(**kwargs)) - - ( - url_prefix, - host, - port, - redirect_slash, - log_level, - preload_only, - permissive_cors, - api_key, - proxy_root_path, - ) = typer_option_resolve( - url_prefix, - host, - port, - redirect_slash, - log_level, - preload_only, - permissive_cors, - api_key, - proxy_root_path, - ) - - app = create_server( - engine_args_list=engine_args, - url_prefix=url_prefix, - doc_extra=dict(host=host, port=port), - redirect_slash=redirect_slash, - preload_only=preload_only, - permissive_cors=permissive_cors, - api_key=api_key, - proxy_root_path=proxy_root_path, - ) - - uvicorn.run( - app, - host=host, - port=port, - log_level=log_level.name, - http="httptools", - loop=loopname, # type: ignore - ) - - def cli(): - CHECK_TYPER.mark_required() - if len(sys.argv) == 1 or sys.argv[1] not in ["v1", "v2", "help", "--help"]: - for _ in range(3): - logger.error( - "Error: No command given. Defaulting to `v1`. " - "Relying on this side effect is considered an error and " - "will be deprecated in the future, which requires explicit usage of a `infinity_emb v1` or `infinity_emb v2`. " - "Specify the version of the CLI you want to use. 
" - ) - time.sleep(1) - sys.argv.insert(1, "v1") - tp() - - if __name__ == "__main__": - cli() diff --git a/libs/infinity_emb/pyproject.toml b/libs/infinity_emb/pyproject.toml index 1bd621d1..d4787830 100644 --- a/libs/infinity_emb/pyproject.toml +++ b/libs/infinity_emb/pyproject.toml @@ -59,7 +59,7 @@ soundfile = {version="^0.12.1", optional=true} [tool.poetry.scripts] -infinity_emb = "infinity_emb.infinity_server:cli" +infinity_emb = "infinity_emb.cli:cli" [tool.poetry.group.test.dependencies] pytest = "^8.0.0" diff --git a/libs/infinity_emb/tests/end_to_end/test_openapi_client_compat.py b/libs/infinity_emb/tests/end_to_end/test_openapi_client_compat.py index 19f5a385..333741fb 100644 --- a/libs/infinity_emb/tests/end_to_end/test_openapi_client_compat.py +++ b/libs/infinity_emb/tests/end_to_end/test_openapi_client_compat.py @@ -132,7 +132,7 @@ async def test_openai(client: AsyncClient): # test AUDIO: cosine distance of beep to cat and dog np.testing.assert_allclose( - emb1_audio.data[0].embedding, emb1_1_audio.data[0].embedding, rtol=1e-5 + emb1_audio.data[0].embedding, emb1_1_audio.data[0].embedding, rtol=1e-4, atol=1e-4 ) assert all( np.dot(emb1_audio.data[0].embedding, emb1_audio_from_text.data[0].embedding) @@ -142,7 +142,7 @@ async def test_openai(client: AsyncClient): # test IMAGE: cosine distance of cat to dog and bird np.testing.assert_allclose( - emb_1_image.data[0].embedding, emb_1_1_image.data[0].embedding, rtol=1e-5 + emb_1_image.data[0].embedding, emb_1_1_image.data[0].embedding, rtol=1e-4, atol=1e-4 ) assert all( np.dot(emb_1_image.data[0].embedding, emb_1_image_from_text.data[0].embedding) @@ -152,7 +152,7 @@ async def test_openai(client: AsyncClient): # test TEXT: cosine distance of cat to dog and bird np.testing.assert_allclose( - emb_1_text.data[0].embedding, emb_1_text.data[1].embedding, rtol=1e-5 + emb_1_text.data[0].embedding, emb_1_text.data[1].embedding, rtol=1e-4, atol=1e-4 ) # wrong key diff --git a/libs/infinity_emb/tests/unit_test/test_cli.py b/libs/infinity_emb/tests/unit_test/test_cli.py new file mode 100644 index 00000000..c392aa67 --- /dev/null +++ b/libs/infinity_emb/tests/unit_test/test_cli.py @@ -0,0 +1,63 @@ +import subprocess +import sys + +import pytest + + +# only run subprocess on non-windows +@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") +def test_cli_help(): + log = subprocess.run(["infinity_emb", "--help"]) + assert log.returncode == 0 + + +@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") +def test_cli_v1_help(): + log = subprocess.run(["infinity_emb", "v1", "--help"]) + assert log.returncode == 0 + + +@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") +def test_cli_v2_help(): + log = subprocess.run(["infinity_emb", "v2", "--help"]) + assert log.returncode == 0 + + +@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") +def test_cli_v1_wrong_batch_size(): + log = subprocess.run(["infinity_emb", "v1", "--batch-size", "WrongArgument"]) + assert log.returncode == 2 + + +@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") +def test_cli_v2_wrong_batch_size(): + log = subprocess.run(["infinity_emb", "v2", "--batch-size", "WrongArgument"]) + assert log.returncode == 2 + + +@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") +def test_cli_v2_weird(): + log = subprocess.run( + [ + "infinity_emb", + "v2", + "--model-id", + "model1", + "--model-id", + "model2", + "--model-id", + "model3", + 
"--batch-size", + "32", + "--batch-size", + "32", + ] + ) + assert log.returncode == 1 + + +@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") +@pytest.mark.parametrize("version", ["v1", "v2"]) +def test_cli_preload(version): + log = subprocess.run(["infinity_emb", f"{version}", "--preload-only"]) + assert log.returncode == 0 diff --git a/libs/infinity_emb/tests/unit_test/test_infinity_server.py b/libs/infinity_emb/tests/unit_test/test_infinity_server.py index d0bcb6a3..755a8eba 100644 --- a/libs/infinity_emb/tests/unit_test/test_infinity_server.py +++ b/libs/infinity_emb/tests/unit_test/test_infinity_server.py @@ -1,82 +1,21 @@ -import subprocess -import sys - -import pytest import uvicorn from fastapi import FastAPI from infinity_emb.args import EngineArgs from infinity_emb.infinity_server import ( + create_server, +) +from infinity_emb.cli import v1, v2 + +from infinity_emb.cli import ( UVICORN_LOG_LEVELS, Device, Dtype, InferenceEngine, PoolingMethod, - create_server, - v1, - v2, ) -# only run subprocess on non-windows -@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") -def test_cli_help(): - log = subprocess.run(["infinity_emb", "--help"]) - assert log.returncode == 0 - - -@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") -def test_cli_v1_help(): - log = subprocess.run(["infinity_emb", "v1", "--help"]) - assert log.returncode == 0 - - -@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") -def test_cli_v2_help(): - log = subprocess.run(["infinity_emb", "v2", "--help"]) - assert log.returncode == 0 - - -@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") -def test_cli_v1_wrong_batch_size(): - log = subprocess.run(["infinity_emb", "v1", "--batch-size", "WrongArgument"]) - assert log.returncode == 2 - - -@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") -def test_cli_v2_wrong_batch_size(): - log = subprocess.run(["infinity_emb", "v2", "--batch-size", "WrongArgument"]) - assert log.returncode == 2 - - -@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") -def test_cli_v2_weird(): - log = subprocess.run( - [ - "infinity_emb", - "v2", - "--model-id", - "model1", - "--model-id", - "model2", - "--model-id", - "model3", - "--batch-size", - "32", - "--batch-size", - "32", - ] - ) - assert log.returncode == 1 - - -@pytest.mark.skipif(sys.platform == "win32", reason="does not run on windows") -@pytest.mark.parametrize("version", ["v1", "v2"]) -def test_cli_preload(version): - log = subprocess.run(["infinity_emb", f"{version}", "--preload-only"]) - assert log.returncode == 0 - - def test_create_server(): app = create_server(engine_args_list=[EngineArgs(engine="debugengine")]) assert isinstance(app, FastAPI) diff --git a/libs/infinity_emb/tests/unit_test/transformer/crossencoder/test_torch_crossencoder.py b/libs/infinity_emb/tests/unit_test/transformer/crossencoder/test_torch_crossencoder.py index dff9065b..9d58041e 100644 --- a/libs/infinity_emb/tests/unit_test/transformer/crossencoder/test_torch_crossencoder.py +++ b/libs/infinity_emb/tests/unit_test/transformer/crossencoder/test_torch_crossencoder.py @@ -5,6 +5,11 @@ from infinity_emb.args import EngineArgs from infinity_emb.transformer.crossencoder.torch import CrossEncoderPatched +from infinity_emb.primitives import Device + +import torch + +device = Device.cpu if torch.backends.mps.is_available() else Device.auto SHOULD_TORCH_COMPILE = sys.platform == 
"linux" and sys.version_info < (3, 12) @@ -14,6 +19,7 @@ def test_crossencoder(): engine_args=EngineArgs( model_name_or_path="mixedbread-ai/mxbai-rerank-xsmall-v1", compile=SHOULD_TORCH_COMPILE, + device=device, ) ) @@ -37,10 +43,10 @@ def test_crossencoder(): def test_patched_crossencoder_vs_sentence_transformers(): model = CrossEncoderPatched( engine_args=EngineArgs( - model_name_or_path="mixedbread-ai/mxbai-rerank-xsmall-v1", compile=True + model_name_or_path="mixedbread-ai/mxbai-rerank-xsmall-v1", compile=True, device=device ) ) - model_unpatched = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1", trust_remote_code=True) + model_unpatched = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1") query = "Where is Paris?" documents = [