From 5dde578e189e55d7c65a8231f32ee9f1ae2a8e28 Mon Sep 17 00:00:00 2001
From: michaelfeil <me@michaelfeil.eu>
Date: Thu, 12 Oct 2023 02:35:58 +0200
Subject: [PATCH 1/4] add unit tests passing.

---
 .github/workflows/test.yaml                   |  4 +-
 README.md                                     |  4 +-
 .../infinity_emb/fastapi_schemas/convert.py   | 21 ++-----
 .../infinity_emb/fastapi_schemas/docs.py      | 16 +++++
 .../infinity_emb/inference/batch_handler.py   | 14 +++--
 .../infinity_emb/inference/models.py          | 25 +++-----
 .../infinity_emb/infinity_server.py           | 57 +++++++++--------
 libs/infinity_emb/poetry.lock                 | 61 ++++++++++++++++++-
 libs/infinity_emb/pyproject.toml              |  1 +
 .../tests/end_to_end/test_ct2_sentence.py     |  4 +-
 libs/infinity_emb/tests/script_live.py        | 51 ++++++++++++++++
 .../unit_test/inference/test_batch_handler.py |  4 +-
 12 files changed, 189 insertions(+), 73 deletions(-)
 create mode 100644 libs/infinity_emb/infinity_emb/fastapi_schemas/docs.py
 create mode 100644 libs/infinity_emb/tests/script_live.py

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 486fb67e..0583588e 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -49,4 +49,6 @@ jobs:
 
       - name: Upload coverage Report to Codecov for python 3.10
         if: ${{ matrix.python-version == '3.10' && inputs.upload_coverage == true }}
-        uses: codecov/codecov-action@v2
\ No newline at end of file
+        uses: codecov/codecov-action@v3
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
\ No newline at end of file
diff --git a/README.md b/README.md
index 1842ef62..bb1f306f 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,9 @@ Embedding Inference Server - finding TGI for embeddings
 ## Why Infinity:
 Infinity provides the following features:
 - **Fast inference**: The inference server is built on top of [torch](https:) and [ctranslate2](https://github.com/OpenNMT/CTranslate2) under the hood, getting most out of your **CUDA** or **CPU** hardware.
-- **Continous batching**: All new embedding requests are queued while GPU is busy with the previous ones. New requests are served as soon as GPU is ready. Adds only ~2% overhead for large datasets, over static batching.
+- **Dynamic, optimal batching**: New embedding requests are queued while GPU is busy with the previous ones. New requests are squeezed into your GPU/CPU as soon as it is ready.
 - **Correct and tested implementation**: Unit and end-to-end tested. API embeddings are identical to [sentence-transformers](https://github.com/UKPLab/sentence-transformers/) (up to numerical precision). Lets API users create embeddings till infinity and beyond.
-- **Easy to use**: The API is built on top of [FastAPI](https://fastapi.tiangolo.com/) and [Swagger](https://swagger.io/) and is fully documented. See below on how to get started.
+- **Easy to use**: The API is built on top of [FastAPI](https://fastapi.tiangolo.com/) and is fully documented via [Swagger](https://swagger.io/). API specs are aligned to OpenAI. See below on how to get started.
 
 # Demo:
 A quick demo of launching: [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) with batch-size=2 and sending 3 requests via cURL.
diff --git a/libs/infinity_emb/infinity_emb/fastapi_schemas/convert.py b/libs/infinity_emb/infinity_emb/fastapi_schemas/convert.py
index 457553e7..eb8c1854 100644
--- a/libs/infinity_emb/infinity_emb/fastapi_schemas/convert.py
+++ b/libs/infinity_emb/infinity_emb/fastapi_schemas/convert.py
@@ -1,32 +1,19 @@
 from ..inference.primitives import NpEmbeddingType
-from .pymodels import OpenAIEmbeddingResult, _EmbeddingObject, _Usage
+from .pymodels import OpenAIEmbeddingResult
 
 
 def list_embeddings_to_response(
     embeddings: NpEmbeddingType, model: str, usage: int
 ) -> OpenAIEmbeddingResult:
-    return OpenAIEmbeddingResult(
+    return dict(
         model=model,
         data=[
-            _EmbeddingObject(
+            dict(
                 object="embedding",
                 embedding=emb,
                 index=count,
             )
             for count, emb in enumerate(embeddings)
         ],
-        usage=_Usage(prompt_tokens=usage, total_tokens=usage),
+        usage=dict(prompt_tokens=usage, total_tokens=usage),
     )
-
-    # return {
-    #     "model": model,
-    #     "data": [
-    #         dict(
-    #             object="embedding",
-    #             embedding=emb,
-    #             index=count,
-    #         )
-    #         for count, emb in enumerate(embeddings)
-    #     ],
-    #     "usage": {"prompt_tokens": usage, "total_tokens": usage},
-    # }
diff --git a/libs/infinity_emb/infinity_emb/fastapi_schemas/docs.py b/libs/infinity_emb/infinity_emb/fastapi_schemas/docs.py
new file mode 100644
index 00000000..fa251908
--- /dev/null
+++ b/libs/infinity_emb/infinity_emb/fastapi_schemas/docs.py
@@ -0,0 +1,16 @@
+FASTAPI_TITLE = "♾️ Infinity - Embedding Inference Server"
+FASTAPI_SUMMARY = "Embedding Inference Server - finding TGI for embeddings"
+
+
+def startup_message(host: str, port: str, prefix: str) -> str:
+    return f"""
+
+♾️  Infinity - Embedding Inference Server
+MIT License; Copyright (c) 2023 Michael Feil
+
+Open the Docs via Swagger UI:
+http://{host}:{port}/docs
+
+Access model via 'GET':
+curl http://{host}:{port}{prefix}/models
+"""
diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py
index 5d5029c9..c0316f81 100644
--- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py
+++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py
@@ -8,7 +8,7 @@
 from typing import Dict, List, Union
 
 from ..log_handler import logger
-from .models import BaseTransformer
+from .models import BaseTransformer, get_lengths_with_tokenize
 from .primitives import (
     EmbeddingResult,
     NpEmbeddingType,
@@ -128,9 +128,7 @@ def __init__(
     def shutdown(self):
         self._shutdown.set()
 
-    async def schedule(
-        self, sentences: List[str], prios: List[int]
-    ) -> NpEmbeddingType | None:
+    async def schedule(self, sentences: List[str]) -> tuple[List[NpEmbeddingType], int]:
         """Schedule a sentence to be embedded. Awaits until embedded.
 
         Args:
@@ -143,6 +141,9 @@ async def schedule(
         # add an unique identifier
         uuid_event = []
         prioqueue = []
+
+        prios, usage = get_lengths_with_tokenize(sentences, self.model.tokenize_lengths)
+
         for s, p in zip(sentences, prios):
             inner = EmbeddingResult(sentence=s, event=EventTS(self._threadpool))
             item = PrioritizedQueueItem(item=inner, priority=p)
@@ -154,7 +155,8 @@ async def schedule(
             self._result_store.wait_for_response(uuid, event)
             for uuid, event in uuid_event
         ]
-        return await asyncio.gather(*gather_results)
+        embeddings = await asyncio.gather(*gather_results)
+        return embeddings, usage
 
     def is_overloaded(self) -> bool:
         # start consuming
@@ -176,7 +178,7 @@ def overload_status(self) -> OverloadStatus:
     def _preprocess_batch(self):
         """loops and checks if the _core_batch has worked on all items"""
         self._ready = True
-        logger.info("ready to receive requests.")
+        logger.info("ready to batch requests.")
         try:
             while not self._shutdown.is_set():
                 # patience:
diff --git a/libs/infinity_emb/infinity_emb/inference/models.py b/libs/infinity_emb/infinity_emb/inference/models.py
index 2bdba0a6..5c93b86f 100644
--- a/libs/infinity_emb/infinity_emb/inference/models.py
+++ b/libs/infinity_emb/infinity_emb/inference/models.py
@@ -1,3 +1,4 @@
+import copy
 import os
 from abc import ABC, abstractmethod
 from enum import Enum
@@ -70,9 +71,12 @@ def __init__(self, *args, **kwargs):
         device = self._target_device
         self.eval()
         self.to(device)
+        # make a copy of the tokenizer,
+        # to be able to count the tokens in another thread
+        # without corrupting the original.
+        self._infinity_tokenizer = copy.deepcopy(self._first_module().tokenizer)
 
     def encode_pre(self, sentences) -> Dict[str, Tensor]:
-        # features = self._tokenize_actual(sentences)
         features = self.tokenize(sentences)
 
         return features
@@ -81,7 +85,6 @@ def encode_core(self, features: Dict[str, Tensor]) -> Tensor:
         """
         Computes sentence embeddings
         """
-        # features = self._tokenize_actual(features)
         device = self._target_device
         features = util.batch_to_device(features, device)
         # move forward
@@ -103,29 +106,17 @@ def encode_post(
         return embeddings_out
 
     def tokenize_lengths(self, sentences: List[str]) -> List[int]:
-        fm = self._first_module()
-        tks = fm.tokenizer.batch_encode_plus(
+        tks = self._infinity_tokenizer.batch_encode_plus(
             sentences,
             add_special_tokens=False,
             return_token_type_ids=False,
             return_attention_mask=False,
             return_length=False,
+            max_length=self._infinity_tokenizer.model_max_length,
+            truncation="longest_first",
         ).encodings
         return [len(t.tokens) for t in tks]
 
-    def _tokenize_actual(self, sentences: List[str]):
-        fm = self._first_module()
-        output = fm.tokenizer(
-            sentences,
-            padding=True,
-            truncation="longest_first",
-            return_tensors="pt",
-            max_length=fm.tokenizer.model_max_length,
-            # pad_to_multiple_of=16,
-        )
-
-        return dict(**output)
-
 
 class CT2SentenceTransformer(SentenceTransformerPatched):
     """
diff --git a/libs/infinity_emb/infinity_emb/infinity_server.py b/libs/infinity_emb/infinity_emb/infinity_server.py
index 5ae10a61..2870c9ec 100644
--- a/libs/infinity_emb/infinity_emb/infinity_server.py
+++ b/libs/infinity_emb/infinity_emb/infinity_server.py
@@ -3,15 +3,14 @@
 
 import typer
 import uvicorn
-from fastapi import FastAPI, status
+from fastapi import FastAPI, responses, status
 from prometheus_fastapi_instrumentator import Instrumentator
 
 # prometheus
 import infinity_emb
-from infinity_emb.fastapi_schemas import errors
+from infinity_emb.fastapi_schemas import docs, errors
 from infinity_emb.fastapi_schemas.convert import list_embeddings_to_response
 from infinity_emb.fastapi_schemas.pymodels import (
-    ModelInfo,
     OpenAIEmbeddingInput,
     OpenAIEmbeddingResult,
     OpenAIModelInfo,
@@ -26,10 +25,11 @@ def create_server(
     batch_size: int = 64,
     engine: models.InferenceEngine = models.InferenceEngine.torch,
     verbose: bool = False,
+    doc_extra: dict = {},
 ):
     app = FastAPI(
-        title="♾️ Infinity - Embedding Inference Server",
-        summary="Embedding Inference Server - finding TGI for embeddings",
+        title=docs.FASTAPI_TITLE,
+        summary=docs.FASTAPI_SUMMARY,
         version=infinity_emb.__version__,
         contact=dict(name="Michael Feil"),
         docs_url="/docs",
@@ -53,10 +53,17 @@ async def _startup():
         app.batch_handler = BatchHandler(
             max_batch_size=batch_size, model=model, threadpool=app.tp, verbose=verbose
         )
-        app.tokenize_len = model.tokenize_lengths
         # start in a threadpool
         await app.batch_handler.spawn()
 
+        logger.info(
+            docs.startup_message(
+                host=doc_extra.pop("host", "localhost"),
+                port=doc_extra.pop("port", "PORT"),
+                prefix=url_prefix,
+            )
+        )
+
     @app.on_event("shutdown")
     async def _shutdown():
         app.batch_handler.shutdown()
@@ -71,23 +78,32 @@ async def _ready() -> float:
                 "model not ready", code=status.HTTP_503_SERVICE_UNAVAILABLE
             )
 
-    @app.get(f"{url_prefix}/models")
-    async def _models() -> OpenAIModelInfo:
+    @app.get(
+        f"{url_prefix}/models",
+        response_model=OpenAIModelInfo,
+        response_class=responses.ORJSONResponse,
+    )
+    async def _models():
         """get models endpoint"""
         s = app.batch_handler.overload_status()  # type: ignore
-        return OpenAIModelInfo(
-            data=ModelInfo(
+        return dict(
+            data=dict(
                 id=model_name_or_path,
                 stats=dict(
                     queue_fraction=s.queue_fraction,
                     queue_absolute=s.queue_absolute,
                     results_pending=s.results_absolute,
+                    batch_size=batch_size,
                 ),
             )
         )
 
-    @app.post(f"{url_prefix}/embeddings")
-    async def _embeddings(data: OpenAIEmbeddingInput) -> OpenAIEmbeddingResult:
+    @app.post(
+        f"{url_prefix}/embeddings",
+        response_model=OpenAIEmbeddingResult,
+        response_class=responses.ORJSONResponse,
+    )
+    async def _embeddings(data: OpenAIEmbeddingInput):
         """Encode Embeddings
 
         ```python
@@ -102,25 +118,16 @@ async def _embeddings(data: OpenAIEmbeddingInput) -> OpenAIEmbeddingResult:
             )
 
         try:
+            logger.debug("[📝] Received request with %s inputs ", len(data.input))
             start = time.perf_counter()
 
-            # lengths, usage = await to_thread(
-            #   models.get_lengths_with_tokenize, app.tp, data.input, app.tokenize_len)
-            lengths, usage = models.get_lengths_with_tokenize(
-                data.input  # , app.tokenize_len
-            )
-            logger.debug("[📝] Received request with %s inputs ", len(lengths))
-
-            # emb = await asyncio.gather(
-            #     *[(bh.schedule(s, prio=prio)) for s, prio in zip(data.input, lengths)]
-            # )
-            emb = await bh.schedule(data.input, prios=lengths)
+            embedding, usage = await bh.schedule(data.input)
 
             duration = (time.perf_counter() - start) * 1000
             logger.debug("[✅] Done in %s ms", duration)
 
             res = list_embeddings_to_response(
-                embeddings=emb, model=data.model, usage=usage
+                embeddings=embedding, model=data.model, usage=usage
             )
 
             return res
@@ -165,6 +172,7 @@ def start_uvicorn(
         batch_size=batch_size,
         engine=engine_load,
         verbose=log_level.to_int() <= 10,
+        doc_extra=dict(host=host, port=port),
     )
     uvicorn.run(app, host=host, port=port, log_level=log_level.name)
 
@@ -174,6 +182,7 @@ def cli():
     typer.run(start_uvicorn)
 
 
+# app = create_server()
 if __name__ == "__main__":
     # for debugging
     cli()
diff --git a/libs/infinity_emb/poetry.lock b/libs/infinity_emb/poetry.lock
index c0d16eb8..299fedc6 100644
--- a/libs/infinity_emb/poetry.lock
+++ b/libs/infinity_emb/poetry.lock
@@ -1166,6 +1166,65 @@ files = [
 setuptools = "*"
 wheel = "*"
 
+[[package]]
+name = "orjson"
+version = "3.9.8"
+description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "orjson-3.9.8-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:823525bfb27b804b492acc59a45dc0973ea629d97557eac81dde7b34b5267611"},
+    {file = "orjson-3.9.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be6f2634fe6c88a0e1e785fc0b6845ad75bef6e20f1ee3d62fd81b17e7505cbf"},
+    {file = "orjson-3.9.8-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2c56dd62754e2ee5b7f64d37f3e85685d3bd5bcaa448076e9113be9069078dfc"},
+    {file = "orjson-3.9.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c863c7805a7961428a40431a8f47c3f71c74e6c5ddf1ab023e6e79bc5806e6d5"},
+    {file = "orjson-3.9.8-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d30621cf18a0e16a16fbcf2fa536d800f78514a46f5321130f1b54e88994267"},
+    {file = "orjson-3.9.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5311ce1457a29084146d2599588dc8ad96256feb921af8e365444fa8ad67afac"},
+    {file = "orjson-3.9.8-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f9b070c895fc81c362b1b41dc6d0c81a84ee4abb1193804de15683549aeeb0ee"},
+    {file = "orjson-3.9.8-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:24915b65ac19731a57a5ab7dbf463f91555e10d4ad833513e7d8cc6848487c24"},
+    {file = "orjson-3.9.8-cp310-none-win32.whl", hash = "sha256:2bcc9dc53f9e1d679515349bf299ed5e75310146c755d2ba227a7e37851ab3fb"},
+    {file = "orjson-3.9.8-cp310-none-win_amd64.whl", hash = "sha256:423774c85e73054acfef10fc3328f35c8d3e0193a7247d47308ebfccde70695d"},
+    {file = "orjson-3.9.8-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:8a1c92f467f5fd0f8fb79273006b563364b1e45667b3760423498348dc2e22fa"},
+    {file = "orjson-3.9.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:742d4d16d66579ffff4b2048a8de4a0b03d731847233e92c4edd418a9c582d0f"},
+    {file = "orjson-3.9.8-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6d1aab08b373232f568ea9ae048f9f77e09f389068afee6dd44bb6140e2c3ea3"},
+    {file = "orjson-3.9.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:68ed63273ec4ecdd7865e9d984d65a749c0d780882cf9dde6ab2bc6323f6471a"},
+    {file = "orjson-3.9.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d23edcb32383f3d86b2f4914f9825ce2d67625abd34be6e5ed1f59ec30127b7a"},
+    {file = "orjson-3.9.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9bcd3a48b260d3dfe68b8ce93d11f99a70bd4c908efe22d195a1b1dcfb15ac2"},
+    {file = "orjson-3.9.8-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9ce982f3c1df83f7dc74f3b2690605470ff4790d12558e44359f01e822c5cb08"},
+    {file = "orjson-3.9.8-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4433dd903d5b022a64e9dd1dca94f08ab04d5d928a0ecd33dd46110468960879"},
+    {file = "orjson-3.9.8-cp311-none-win32.whl", hash = "sha256:a119c73520192c2882d0549151b9cdd65e0bb5396bedf8951ba5f70d6a873879"},
+    {file = "orjson-3.9.8-cp311-none-win_amd64.whl", hash = "sha256:764306f6370e6c76cbbf3139dd9b05be9c4481ee0b15966bd1907827a5777216"},
+    {file = "orjson-3.9.8-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:af8e6185516ce0c93d6ce1f4105918504da629c631fd969686f32a1be3ed3c9b"},
+    {file = "orjson-3.9.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e8f5ac250184dcb6b00543f0f82853d7e840e476d0135733e459aee058695e5"},
+    {file = "orjson-3.9.8-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:edafb45fc5b2063abd8a0baf6be21c38497df2d9e0b75cdb053eb0ff100fa26c"},
+    {file = "orjson-3.9.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cc449bff1d4152438615f4a6a003577942908c4e166d64dc46d1f3f0cde72ecd"},
+    {file = "orjson-3.9.8-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ee887aeb8ab0c1d25e9f2b540f9a34b4cbfe8894f95b63a5984441a9f337d2ff"},
+    {file = "orjson-3.9.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:235b4aa46c58ded90c8b368722c1eb941613fe5a6b18bc14cfaae929f0be902e"},
+    {file = "orjson-3.9.8-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ff2e6e429416b6287006ba0556083f62396199299ab85afd3ba1e83be14677e2"},
+    {file = "orjson-3.9.8-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ab9c234bfe89aeba825feb897718c65a80851f367a4a8308d6b5074a80fce6e5"},
+    {file = "orjson-3.9.8-cp312-none-win_amd64.whl", hash = "sha256:5c818f19315251d68954c529f5d8322053f1c35b500b47d008e968bf2d32ed97"},
+    {file = "orjson-3.9.8-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:e6a267c0fc64fc4d0b8fb146e1a060a40f570441a9390ec4bc6de0b5fda148cd"},
+    {file = "orjson-3.9.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3c7c4d60e21b0f10c8214d7ca9f2243019dd1bf9d2750b3b4a9250935977a24"},
+    {file = "orjson-3.9.8-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3be3da93c4d044d2f60de816320087a8494c3e75cdf3369655e014240b1a229d"},
+    {file = "orjson-3.9.8-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0619df2454b87d883f7f9ea95d79fc21fec0b8a4d600b549a1e91f59a3493d6b"},
+    {file = "orjson-3.9.8-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:119a6edcecef4e37d30d6998e9cedd9e0ecdc894fa07216221dc8dd2eb24dd9d"},
+    {file = "orjson-3.9.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e32ac29f9c30cc152e7432a26c665232a382678f2402bf782f73fbc985cfb37e"},
+    {file = "orjson-3.9.8-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:002f7ca314cc8fbed5f00990bf48eda098ba1bba1e0c23be4bb024381e7889d1"},
+    {file = "orjson-3.9.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e538974e2ed20504f3dad0bcdab41cd5e4fa086dabea852a150e4cc98293183d"},
+    {file = "orjson-3.9.8-cp38-none-win32.whl", hash = "sha256:9df23493a72f073b2ab1005e628a963248dc577a2816e9c82caf09ff74908414"},
+    {file = "orjson-3.9.8-cp38-none-win_amd64.whl", hash = "sha256:34eec476141a043d478651d1efbf218162cdd57add24dfa659ac89e1a001477a"},
+    {file = "orjson-3.9.8-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:c9ae634b8a55539c3d5a53813552325733ab3da3601feef8e99f91cef634f3c4"},
+    {file = "orjson-3.9.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ad73fde11117b6b103c1d4071168b0e2875d890556fa8597663a5eca81bb812"},
+    {file = "orjson-3.9.8-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:154f048e4da06275c1f173445dfbd88f038d29f7529a0dae6157293241b7f5bd"},
+    {file = "orjson-3.9.8-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:428fec9497d17ebb5936495bbeaf12b5952bff5f6fde8a0e64030887b8d8cf94"},
+    {file = "orjson-3.9.8-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55ae6509f078eb90d157da7717f2826e55ef08756bc4f5b89448c6b56be4ff2c"},
+    {file = "orjson-3.9.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e26836a11b88f839b6902f92e8dd997c32f49486119a1aa67d714bc288aae172"},
+    {file = "orjson-3.9.8-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0a27e5161b1f23fd1b5e549b38018bbc7a0f0bd3699d3dec04e2e62d271480d3"},
+    {file = "orjson-3.9.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4c836845177d6ee92682d0d9b61346a06b140b5666319905a5b423ebb0ecc5d3"},
+    {file = "orjson-3.9.8-cp39-none-win32.whl", hash = "sha256:ca4f3e15517bdcdb573dfe6c97d4171247ce50ec82e3a7b708941b53d5f4bc29"},
+    {file = "orjson-3.9.8-cp39-none-win_amd64.whl", hash = "sha256:52c0480d5be12697b10b4d748b86acd4999f47e1d8e44e49486d0a550f30fcba"},
+    {file = "orjson-3.9.8.tar.gz", hash = "sha256:ed1adc6db9841974170a5195b827ee4e392b1e8ca385b19fcdc3248489844059"},
+]
+
 [[package]]
 name = "outcome"
 version = "1.2.0"
@@ -2673,4 +2732,4 @@ ct2 = ["ctranslate2"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<4.0"
-content-hash = "238a1a2818488a318c73089d4a0b1281fab8fac29b22230b288736af40b0c462"
+content-hash = "015ca70062293ec1c0e0c60d9d409e32b8965dcda93052f29a2cb567b047c075"
diff --git a/libs/infinity_emb/pyproject.toml b/libs/infinity_emb/pyproject.toml
index 1594f4ec..32491860 100644
--- a/libs/infinity_emb/pyproject.toml
+++ b/libs/infinity_emb/pyproject.toml
@@ -19,6 +19,7 @@ rich = "^13.6.0"
 numpy = "^1"
 ctranslate2 = {version = "^3.20.0", optional=true}
 typer = {extras = ["all"], version = "^0.9.0"}
+orjson = "^3.9.8"
 
 [tool.poetry.scripts]
 infinity_emb = "infinity_emb.infinity_server:cli"
diff --git a/libs/infinity_emb/tests/end_to_end/test_ct2_sentence.py b/libs/infinity_emb/tests/end_to_end/test_ct2_sentence.py
index fb4c9860..6180ca3c 100644
--- a/libs/infinity_emb/tests/end_to_end/test_ct2_sentence.py
+++ b/libs/infinity_emb/tests/end_to_end/test_ct2_sentence.py
@@ -97,9 +97,9 @@ async def _post_batch(inputs):
             f"{PREFIX}/embeddings", json=dict(input=inputs, model=MODEL)
         )
 
-    await _post_batch(inputs=dummy_sentences)
+    response = await _post_batch(inputs=dummy_sentences)
 
-    _request_size = batch_size // 2
+    _request_size = int(batch_size * 1.5)
     tasks = [
         _post_batch(inputs=sentences[sl : sl + _request_size])
         for sl in range(0, len(sentences), _request_size)
diff --git a/libs/infinity_emb/tests/script_live.py b/libs/infinity_emb/tests/script_live.py
new file mode 100644
index 00000000..a59c16cb
--- /dev/null
+++ b/libs/infinity_emb/tests/script_live.py
@@ -0,0 +1,51 @@
+import json
+import timeit
+
+import numpy as np
+import requests
+from sentence_transformers import SentenceTransformer
+
+LIVE_URL = "http://localhost:8001/v1"
+
+
+def embedding_live_performance():
+    sample = ["This is a test sentence" * 128] * 2048
+    json_d = json.dumps({"input": sample, "model": "model"})
+    session = requests.Session()
+    req = session.get(f"{LIVE_URL}/models")
+    assert req.status_code == 200
+
+    batch_size = req.json()["data"]["stats"]["batch_size"]
+    print(f"batch_size is {batch_size}")
+    model = SentenceTransformer(
+        model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"
+    )
+
+    def local(data: str):
+        enc = model.encode(data, batch_size=batch_size)
+        assert len(enc) == len(data)
+        return enc
+
+    def remote(json_data: bytes):
+        req = session.post(f"{LIVE_URL}/embeddings", data=json_data)
+        assert req.status_code == 200
+        return req
+
+    local_resp = local(sample)
+    remote_resp = [d["embedding"] for d in remote(json_d).json()["data"]]
+    np.testing.assert_almost_equal(local_resp, remote_resp, 6)
+
+    print("Measuring latency via SentenceTransformers")
+    latency_st = timeit.timeit("local(sample)", number=10, globals=locals())
+    print("SentenceTransformers latency: ", latency_st)
+    model = None
+
+    print("Measuring latency via requests")
+    latency_request = timeit.timeit("remote(json_d)", number=10, globals=locals())
+    print(f"Request latency: {latency_request}")
+
+    assert latency_st * 1.1 > latency_request
+
+
+if __name__ == "__main__":
+    embedding_live_performance()
diff --git a/libs/infinity_emb/tests/unit_test/inference/test_batch_handler.py b/libs/infinity_emb/tests/unit_test/inference/test_batch_handler.py
index 9fabf5cd..f21aeacc 100644
--- a/libs/infinity_emb/tests/unit_test/inference/test_batch_handler.py
+++ b/libs/infinity_emb/tests/unit_test/inference/test_batch_handler.py
@@ -16,7 +16,7 @@
 
 BATCH_SIZE = 32
 N_TIMINGS = 3
-LIMIT_SLOWDOWN = 1.15 if torch.cuda.is_available() else 1.3
+LIMIT_SLOWDOWN = 1.20 if torch.cuda.is_available() else 1.3
 
 
 @pytest.fixture
@@ -53,12 +53,10 @@ async def test_batch_performance_raw(get_sts_bechmark_dataset, load_patched_bh):
         async def method_batch_handler(_sentences):
             _sentences = copy.deepcopy(_sentences)
             start = time.perf_counter()
-            lengths, _ = get_lengths_with_tokenize(_sentences, model.tokenize_lengths)
             _request_size = BATCH_SIZE * 4
             tasks = [
                 bh.schedule(
                     _sentences[sl : sl + _request_size],
-                    prios=lengths[sl : sl + _request_size],
                 )
                 for sl in range(0, len(_sentences), _request_size)
             ]

From d4c42514a384d1c9c4f053d0da2239a696a84b71 Mon Sep 17 00:00:00 2001
From: michaelfeil <me@michaelfeil.eu>
Date: Thu, 12 Oct 2023 02:44:39 +0200
Subject: [PATCH 2/4] improve tokenization

---
 libs/infinity_emb/infinity_emb/inference/batch_handler.py | 2 +-
 libs/infinity_emb/infinity_emb/inference/models.py        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py
index c0316f81..626876f1 100644
--- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py
+++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py
@@ -142,7 +142,7 @@ async def schedule(self, sentences: List[str]) -> tuple[List[NpEmbeddingType], i
         uuid_event = []
         prioqueue = []
 
-        prios, usage = get_lengths_with_tokenize(sentences, self.model.tokenize_lengths)
+        prios, usage = get_lengths_with_tokenize(sentences) #, self.model.tokenize_lengths)
 
         for s, p in zip(sentences, prios):
             inner = EmbeddingResult(sentence=s, event=EventTS(self._threadpool))
diff --git a/libs/infinity_emb/infinity_emb/inference/models.py b/libs/infinity_emb/infinity_emb/inference/models.py
index 5c93b86f..87225a4b 100644
--- a/libs/infinity_emb/infinity_emb/inference/models.py
+++ b/libs/infinity_emb/infinity_emb/inference/models.py
@@ -112,8 +112,8 @@ def tokenize_lengths(self, sentences: List[str]) -> List[int]:
             return_token_type_ids=False,
             return_attention_mask=False,
             return_length=False,
-            max_length=self._infinity_tokenizer.model_max_length,
-            truncation="longest_first",
+            # max_length=self._infinity_tokenizer.model_max_length,
+            # truncation="longest_first",
         ).encodings
         return [len(t.tokens) for t in tks]
 

From 53a58c92256088bc1766cf8c5b08eec781deb786 Mon Sep 17 00:00:00 2001
From: michaelfeil <me@michaelfeil.eu>
Date: Thu, 12 Oct 2023 03:24:33 +0200
Subject: [PATCH 3/4] update torch: move to torch only if needed.

---
 .../infinity_emb/inference/batch_handler.py   |  4 ++--
 .../infinity_emb/inference/models.py          | 20 +++++++++----------
 libs/infinity_emb/tests/script_live.py        | 10 +++++-----
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py
index 626876f1..b639bbac 100644
--- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py
+++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py
@@ -117,7 +117,7 @@ def __init__(
         self._queue_prio = CustomPrioQueue()
         self._result_store = ResultKVStore()
         self._feature_queue: queue.Queue = queue.Queue(4)
-        self._postprocess_queue: queue.Queue = queue.Queue(5)
+        self._postprocess_queue: queue.Queue = queue.Queue(4)
         self.max_batch_size = max_batch_size
         self.model = model
         self.max_queue_wait = max_queue_wait
@@ -266,7 +266,7 @@ async def _postprocess_batch(self):
                 except queue.Empty:
                     # 7 ms, assuming this is below
                     # 3-50ms for inference on avg.
-                    await asyncio.sleep(7e-3)
+                    await asyncio.sleep(5e-3)
                     continue
                 embed, batch = post_batch
                 embeddings = self.model.encode_post(embed).tolist()
diff --git a/libs/infinity_emb/infinity_emb/inference/models.py b/libs/infinity_emb/infinity_emb/inference/models.py
index 87225a4b..9c869ebc 100644
--- a/libs/infinity_emb/infinity_emb/inference/models.py
+++ b/libs/infinity_emb/infinity_emb/inference/models.py
@@ -77,6 +77,7 @@ def __init__(self, *args, **kwargs):
         self._infinity_tokenizer = copy.deepcopy(self._first_module().tokenizer)
 
     def encode_pre(self, sentences) -> Dict[str, Tensor]:
+        
         features = self.tokenize(sentences)
 
         return features
@@ -85,23 +86,22 @@ def encode_core(self, features: Dict[str, Tensor]) -> Tensor:
         """
         Computes sentence embeddings
         """
-        device = self._target_device
-        features = util.batch_to_device(features, device)
-        # move forward
-
-        with torch.no_grad():
-            out_features = self.forward(features)
+         
+        with torch.inference_mode():
+            device = self._target_device
+            features = util.batch_to_device(features, device)     
+            out_features = self.forward(features)["sentence_embedding"]
 
-        return out_features["sentence_embedding"].detach().cpu()
+        return out_features
 
     def encode_post(
         self, out_features: Tensor, normalize_embeddings: bool = True
     ) -> NpEmbeddingType:
-        with torch.no_grad():
-            embeddings = out_features
+        with torch.inference_mode():
+            embeddings = out_features.detach().cpu()
             if normalize_embeddings:
                 embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
-            embeddings_out: np.ndarray = embeddings.cpu().numpy()
+            embeddings_out: np.ndarray = embeddings.numpy()
 
         return embeddings_out
 
diff --git a/libs/infinity_emb/tests/script_live.py b/libs/infinity_emb/tests/script_live.py
index a59c16cb..dbb99448 100644
--- a/libs/infinity_emb/tests/script_live.py
+++ b/libs/infinity_emb/tests/script_live.py
@@ -35,16 +35,16 @@ def remote(json_data: bytes):
     remote_resp = [d["embedding"] for d in remote(json_d).json()["data"]]
     np.testing.assert_almost_equal(local_resp, remote_resp, 6)
 
-    print("Measuring latency via SentenceTransformers")
-    latency_st = timeit.timeit("local(sample)", number=10, globals=locals())
-    print("SentenceTransformers latency: ", latency_st)
-    model = None
+    # print("Measuring latency via SentenceTransformers")
+    # latency_st = timeit.timeit("local(sample)", number=10, globals=locals())
+    # print("SentenceTransformers latency: ", latency_st)
+    # model = None
 
     print("Measuring latency via requests")
     latency_request = timeit.timeit("remote(json_d)", number=10, globals=locals())
     print(f"Request latency: {latency_request}")
 
-    assert latency_st * 1.1 > latency_request
+    # assert latency_st * 1.1 > latency_request
 
 
 if __name__ == "__main__":

From a7358ee1a3d48b2688d1224ed4e606fda0c26ead Mon Sep 17 00:00:00 2001
From: michaelfeil <me@michaelfeil.eu>
Date: Thu, 12 Oct 2023 03:36:13 +0200
Subject: [PATCH 4/4] format and add uvicorn[standard] extras

---
 README.md                                     |   2 +-
 .../infinity_emb/inference/batch_handler.py   |   4 +-
 .../infinity_emb/inference/models.py          |   5 +-
 libs/infinity_emb/poetry.lock                 | 227 +++++++++++++++++-
 libs/infinity_emb/pyproject.toml              |   2 +-
 .../unit_test/inference/test_batch_handler.py |   1 -
 6 files changed, 233 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index bb1f306f..0f9f01a6 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ Embedding Inference Server - finding TGI for embeddings
 ## Why Infinity:
 Infinity provides the following features:
 - **Fast inference**: The inference server is built on top of [torch](https:) and [ctranslate2](https://github.com/OpenNMT/CTranslate2) under the hood, getting most out of your **CUDA** or **CPU** hardware.
-- **Dynamic, optimal batching**: New embedding requests are queued while GPU is busy with the previous ones. New requests are squeezed intro your GPU/CPU as soon as ready. 
+- **Dynamic batching**: New embedding requests are queued while GPU is busy with the previous ones. New requests are squeezed intro your GPU/CPU as soon as ready. 
 - **Correct and tested implementation**: Unit and end-to-end tested. API embeddings are identical to [sentence-transformers](https://github.com/UKPLab/sentence-transformers/) (up to numerical precision). Lets API users create embeddings till infinity and beyond.
 - **Easy to use**: The API is built on top of [FastAPI](https://fastapi.tiangolo.com/), [Swagger](https://swagger.io/) makes it fully documented. API specs are aligned to OpenAI. See below on how to get started.
 
diff --git a/libs/infinity_emb/infinity_emb/inference/batch_handler.py b/libs/infinity_emb/infinity_emb/inference/batch_handler.py
index b639bbac..bf576810 100644
--- a/libs/infinity_emb/infinity_emb/inference/batch_handler.py
+++ b/libs/infinity_emb/infinity_emb/inference/batch_handler.py
@@ -142,7 +142,9 @@ async def schedule(self, sentences: List[str]) -> tuple[List[NpEmbeddingType], i
         uuid_event = []
         prioqueue = []
 
-        prios, usage = get_lengths_with_tokenize(sentences) #, self.model.tokenize_lengths)
+        prios, usage = get_lengths_with_tokenize(
+            sentences
+        )  # , self.model.tokenize_lengths)
 
         for s, p in zip(sentences, prios):
             inner = EmbeddingResult(sentence=s, event=EventTS(self._threadpool))
diff --git a/libs/infinity_emb/infinity_emb/inference/models.py b/libs/infinity_emb/infinity_emb/inference/models.py
index 9c869ebc..02b9791d 100644
--- a/libs/infinity_emb/infinity_emb/inference/models.py
+++ b/libs/infinity_emb/infinity_emb/inference/models.py
@@ -77,7 +77,6 @@ def __init__(self, *args, **kwargs):
         self._infinity_tokenizer = copy.deepcopy(self._first_module().tokenizer)
 
     def encode_pre(self, sentences) -> Dict[str, Tensor]:
-        
         features = self.tokenize(sentences)
 
         return features
@@ -86,10 +85,10 @@ def encode_core(self, features: Dict[str, Tensor]) -> Tensor:
         """
         Computes sentence embeddings
         """
-         
+
         with torch.inference_mode():
             device = self._target_device
-            features = util.batch_to_device(features, device)     
+            features = util.batch_to_device(features, device)
             out_features = self.forward(features)["sentence_embedding"]
 
         return out_features
diff --git a/libs/infinity_emb/poetry.lock b/libs/infinity_emb/poetry.lock
index 299fedc6..18d4f5f6 100644
--- a/libs/infinity_emb/poetry.lock
+++ b/libs/infinity_emb/poetry.lock
@@ -622,6 +622,53 @@ sniffio = "==1.*"
 http2 = ["h2 (>=3,<5)"]
 socks = ["socksio (==1.*)"]
 
+[[package]]
+name = "httptools"
+version = "0.6.0"
+description = "A collection of framework independent HTTP protocol utils."
+optional = false
+python-versions = ">=3.5.0"
+files = [
+    {file = "httptools-0.6.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:818325afee467d483bfab1647a72054246d29f9053fd17cc4b86cda09cc60339"},
+    {file = "httptools-0.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72205730bf1be875003692ca54a4a7c35fac77b4746008966061d9d41a61b0f5"},
+    {file = "httptools-0.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33eb1d4e609c835966e969a31b1dedf5ba16b38cab356c2ce4f3e33ffa94cad3"},
+    {file = "httptools-0.6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bdc6675ec6cb79d27e0575750ac6e2b47032742e24eed011b8db73f2da9ed40"},
+    {file = "httptools-0.6.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:463c3bc5ef64b9cf091be9ac0e0556199503f6e80456b790a917774a616aff6e"},
+    {file = "httptools-0.6.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:82f228b88b0e8c6099a9c4757ce9fdbb8b45548074f8d0b1f0fc071e35655d1c"},
+    {file = "httptools-0.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:0781fedc610293a2716bc7fa142d4c85e6776bc59d617a807ff91246a95dea35"},
+    {file = "httptools-0.6.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:721e503245d591527cddd0f6fd771d156c509e831caa7a57929b55ac91ee2b51"},
+    {file = "httptools-0.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:274bf20eeb41b0956e34f6a81f84d26ed57c84dd9253f13dcb7174b27ccd8aaf"},
+    {file = "httptools-0.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:259920bbae18740a40236807915def554132ad70af5067e562f4660b62c59b90"},
+    {file = "httptools-0.6.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03bfd2ae8a2d532952ac54445a2fb2504c804135ed28b53fefaf03d3a93eb1fd"},
+    {file = "httptools-0.6.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f959e4770b3fc8ee4dbc3578fd910fab9003e093f20ac8c621452c4d62e517cb"},
+    {file = "httptools-0.6.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e22896b42b95b3237eccc42278cd72c0df6f23247d886b7ded3163452481e38"},
+    {file = "httptools-0.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:38f3cafedd6aa20ae05f81f2e616ea6f92116c8a0f8dcb79dc798df3356836e2"},
+    {file = "httptools-0.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:47043a6e0ea753f006a9d0dd076a8f8c99bc0ecae86a0888448eb3076c43d717"},
+    {file = "httptools-0.6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35a541579bed0270d1ac10245a3e71e5beeb1903b5fbbc8d8b4d4e728d48ff1d"},
+    {file = "httptools-0.6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65d802e7b2538a9756df5acc062300c160907b02e15ed15ba035b02bce43e89c"},
+    {file = "httptools-0.6.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:26326e0a8fe56829f3af483200d914a7cd16d8d398d14e36888b56de30bec81a"},
+    {file = "httptools-0.6.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e41ccac9e77cd045f3e4ee0fc62cbf3d54d7d4b375431eb855561f26ee7a9ec4"},
+    {file = "httptools-0.6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:4e748fc0d5c4a629988ef50ac1aef99dfb5e8996583a73a717fc2cac4ab89932"},
+    {file = "httptools-0.6.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:cf8169e839a0d740f3d3c9c4fa630ac1a5aaf81641a34575ca6773ed7ce041a1"},
+    {file = "httptools-0.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5dcc14c090ab57b35908d4a4585ec5c0715439df07be2913405991dbb37e049d"},
+    {file = "httptools-0.6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d0b0571806a5168013b8c3d180d9f9d6997365a4212cb18ea20df18b938aa0b"},
+    {file = "httptools-0.6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fb4a608c631f7dcbdf986f40af7a030521a10ba6bc3d36b28c1dc9e9035a3c0"},
+    {file = "httptools-0.6.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:93f89975465133619aea8b1952bc6fa0e6bad22a447c6d982fc338fbb4c89649"},
+    {file = "httptools-0.6.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:73e9d66a5a28b2d5d9fbd9e197a31edd02be310186db423b28e6052472dc8201"},
+    {file = "httptools-0.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:22c01fcd53648162730a71c42842f73b50f989daae36534c818b3f5050b54589"},
+    {file = "httptools-0.6.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3f96d2a351b5625a9fd9133c95744e8ca06f7a4f8f0b8231e4bbaae2c485046a"},
+    {file = "httptools-0.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72ec7c70bd9f95ef1083d14a755f321d181f046ca685b6358676737a5fecd26a"},
+    {file = "httptools-0.6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b703d15dbe082cc23266bf5d9448e764c7cb3fcfe7cb358d79d3fd8248673ef9"},
+    {file = "httptools-0.6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82c723ed5982f8ead00f8e7605c53e55ffe47c47465d878305ebe0082b6a1755"},
+    {file = "httptools-0.6.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b0a816bb425c116a160fbc6f34cece097fd22ece15059d68932af686520966bd"},
+    {file = "httptools-0.6.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:dea66d94e5a3f68c5e9d86e0894653b87d952e624845e0b0e3ad1c733c6cc75d"},
+    {file = "httptools-0.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:23b09537086a5a611fad5696fc8963d67c7e7f98cb329d38ee114d588b0b74cd"},
+    {file = "httptools-0.6.0.tar.gz", hash = "sha256:9fc6e409ad38cbd68b177cd5158fc4042c796b82ca88d99ec78f07bed6c6b796"},
+]
+
+[package.extras]
+test = ["Cython (>=0.29.24,<0.30.0)"]
+
 [[package]]
 name = "httpx"
 version = "0.25.0"
@@ -1610,6 +1657,20 @@ pytest = ">=5.0"
 [package.extras]
 dev = ["pre-commit", "pytest-asyncio", "tox"]
 
+[[package]]
+name = "python-dotenv"
+version = "1.0.0"
+description = "Read key-value pairs from a .env file and set them as environment variables"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"},
+    {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"},
+]
+
+[package.extras]
+cli = ["click (>=5.0)"]
+
 [[package]]
 name = "pyyaml"
 version = "6.0.1"
@@ -2705,12 +2766,176 @@ files = [
 
 [package.dependencies]
 click = ">=7.0"
+colorama = {version = ">=0.4", optional = true, markers = "sys_platform == \"win32\" and extra == \"standard\""}
 h11 = ">=0.8"
+httptools = {version = ">=0.5.0", optional = true, markers = "extra == \"standard\""}
+python-dotenv = {version = ">=0.13", optional = true, markers = "extra == \"standard\""}
+pyyaml = {version = ">=5.1", optional = true, markers = "extra == \"standard\""}
 typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""}
+uvloop = {version = ">=0.14.0,<0.15.0 || >0.15.0,<0.15.1 || >0.15.1", optional = true, markers = "(sys_platform != \"win32\" and sys_platform != \"cygwin\") and platform_python_implementation != \"PyPy\" and extra == \"standard\""}
+watchfiles = {version = ">=0.13", optional = true, markers = "extra == \"standard\""}
+websockets = {version = ">=10.4", optional = true, markers = "extra == \"standard\""}
 
 [package.extras]
 standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"]
 
+[[package]]
+name = "uvloop"
+version = "0.17.0"
+description = "Fast implementation of asyncio event loop on top of libuv"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "uvloop-0.17.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ce9f61938d7155f79d3cb2ffa663147d4a76d16e08f65e2c66b77bd41b356718"},
+    {file = "uvloop-0.17.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:68532f4349fd3900b839f588972b3392ee56042e440dd5873dfbbcd2cc67617c"},
+    {file = "uvloop-0.17.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0949caf774b9fcefc7c5756bacbbbd3fc4c05a6b7eebc7c7ad6f825b23998d6d"},
+    {file = "uvloop-0.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff3d00b70ce95adce264462c930fbaecb29718ba6563db354608f37e49e09024"},
+    {file = "uvloop-0.17.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a5abddb3558d3f0a78949c750644a67be31e47936042d4f6c888dd6f3c95f4aa"},
+    {file = "uvloop-0.17.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8efcadc5a0003d3a6e887ccc1fb44dec25594f117a94e3127954c05cf144d811"},
+    {file = "uvloop-0.17.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3378eb62c63bf336ae2070599e49089005771cc651c8769aaad72d1bd9385a7c"},
+    {file = "uvloop-0.17.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6aafa5a78b9e62493539456f8b646f85abc7093dd997f4976bb105537cf2635e"},
+    {file = "uvloop-0.17.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c686a47d57ca910a2572fddfe9912819880b8765e2f01dc0dd12a9bf8573e539"},
+    {file = "uvloop-0.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:864e1197139d651a76c81757db5eb199db8866e13acb0dfe96e6fc5d1cf45fc4"},
+    {file = "uvloop-0.17.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2a6149e1defac0faf505406259561bc14b034cdf1d4711a3ddcdfbaa8d825a05"},
+    {file = "uvloop-0.17.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6708f30db9117f115eadc4f125c2a10c1a50d711461699a0cbfaa45b9a78e376"},
+    {file = "uvloop-0.17.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:23609ca361a7fc587031429fa25ad2ed7242941adec948f9d10c045bfecab06b"},
+    {file = "uvloop-0.17.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2deae0b0fb00a6af41fe60a675cec079615b01d68beb4cc7b722424406b126a8"},
+    {file = "uvloop-0.17.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45cea33b208971e87a31c17622e4b440cac231766ec11e5d22c76fab3bf9df62"},
+    {file = "uvloop-0.17.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:9b09e0f0ac29eee0451d71798878eae5a4e6a91aa275e114037b27f7db72702d"},
+    {file = "uvloop-0.17.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:dbbaf9da2ee98ee2531e0c780455f2841e4675ff580ecf93fe5c48fe733b5667"},
+    {file = "uvloop-0.17.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a4aee22ece20958888eedbad20e4dbb03c37533e010fb824161b4f05e641f738"},
+    {file = "uvloop-0.17.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:307958f9fc5c8bb01fad752d1345168c0abc5d62c1b72a4a8c6c06f042b45b20"},
+    {file = "uvloop-0.17.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ebeeec6a6641d0adb2ea71dcfb76017602ee2bfd8213e3fcc18d8f699c5104f"},
+    {file = "uvloop-0.17.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1436c8673c1563422213ac6907789ecb2b070f5939b9cbff9ef7113f2b531595"},
+    {file = "uvloop-0.17.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8887d675a64cfc59f4ecd34382e5b4f0ef4ae1da37ed665adba0c2badf0d6578"},
+    {file = "uvloop-0.17.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3db8de10ed684995a7f34a001f15b374c230f7655ae840964d51496e2f8a8474"},
+    {file = "uvloop-0.17.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7d37dccc7ae63e61f7b96ee2e19c40f153ba6ce730d8ba4d3b4e9738c1dccc1b"},
+    {file = "uvloop-0.17.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cbbe908fda687e39afd6ea2a2f14c2c3e43f2ca88e3a11964b297822358d0e6c"},
+    {file = "uvloop-0.17.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d97672dc709fa4447ab83276f344a165075fd9f366a97b712bdd3fee05efae8"},
+    {file = "uvloop-0.17.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1e507c9ee39c61bfddd79714e4f85900656db1aec4d40c6de55648e85c2799c"},
+    {file = "uvloop-0.17.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c092a2c1e736086d59ac8e41f9c98f26bbf9b9222a76f21af9dfe949b99b2eb9"},
+    {file = "uvloop-0.17.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:30babd84706115626ea78ea5dbc7dd8d0d01a2e9f9b306d24ca4ed5796c66ded"},
+    {file = "uvloop-0.17.0.tar.gz", hash = "sha256:0ddf6baf9cf11a1a22c71487f39f15b2cf78eb5bde7e5b45fbb99e8a9d91b9e1"},
+]
+
+[package.extras]
+dev = ["Cython (>=0.29.32,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "aiohttp", "flake8 (>=3.9.2,<3.10.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=22.0.0,<22.1.0)", "pycodestyle (>=2.7.0,<2.8.0)", "pytest (>=3.6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
+docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
+test = ["Cython (>=0.29.32,<0.30.0)", "aiohttp", "flake8 (>=3.9.2,<3.10.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=22.0.0,<22.1.0)", "pycodestyle (>=2.7.0,<2.8.0)"]
+
+[[package]]
+name = "watchfiles"
+version = "0.20.0"
+description = "Simple, modern and high performance file watching and code reload in python."
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "watchfiles-0.20.0-cp37-abi3-macosx_10_7_x86_64.whl", hash = "sha256:3796312bd3587e14926013612b23066912cf45a14af71cf2b20db1c12dadf4e9"},
+    {file = "watchfiles-0.20.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:d0002d81c89a662b595645fb684a371b98ff90a9c7d8f8630c82f0fde8310458"},
+    {file = "watchfiles-0.20.0-cp37-abi3-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:570848706440373b4cd8017f3e850ae17f76dbdf1e9045fc79023b11e1afe490"},
+    {file = "watchfiles-0.20.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a0351d20d03c6f7ad6b2e8a226a5efafb924c7755ee1e34f04c77c3682417fa"},
+    {file = "watchfiles-0.20.0-cp37-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:007dcc4a401093010b389c044e81172c8a2520dba257c88f8828b3d460c6bb38"},
+    {file = "watchfiles-0.20.0-cp37-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0d82dbc1832da83e441d112069833eedd4cf583d983fb8dd666fbefbea9d99c0"},
+    {file = "watchfiles-0.20.0-cp37-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:99f4c65fd2fce61a571b2a6fcf747d6868db0bef8a934e8ca235cc8533944d95"},
+    {file = "watchfiles-0.20.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5392dd327a05f538c56edb1c6ebba6af91afc81b40822452342f6da54907bbdf"},
+    {file = "watchfiles-0.20.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:08dc702529bb06a2b23859110c214db245455532da5eaea602921687cfcd23db"},
+    {file = "watchfiles-0.20.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7d4e66a857621584869cfbad87039e65dadd7119f0d9bb9dbc957e089e32c164"},
+    {file = "watchfiles-0.20.0-cp37-abi3-win32.whl", hash = "sha256:a03d1e6feb7966b417f43c3e3783188167fd69c2063e86bad31e62c4ea794cc5"},
+    {file = "watchfiles-0.20.0-cp37-abi3-win_amd64.whl", hash = "sha256:eccc8942bcdc7d638a01435d915b913255bbd66f018f1af051cd8afddb339ea3"},
+    {file = "watchfiles-0.20.0-cp37-abi3-win_arm64.whl", hash = "sha256:b17d4176c49d207865630da5b59a91779468dd3e08692fe943064da260de2c7c"},
+    {file = "watchfiles-0.20.0-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:d97db179f7566dcf145c5179ddb2ae2a4450e3a634eb864b09ea04e68c252e8e"},
+    {file = "watchfiles-0.20.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:835df2da7a5df5464c4a23b2d963e1a9d35afa422c83bf4ff4380b3114603644"},
+    {file = "watchfiles-0.20.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:608cd94a8767f49521901aff9ae0c92cc8f5a24d528db7d6b0295290f9d41193"},
+    {file = "watchfiles-0.20.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89d1de8218874925bce7bb2ae9657efc504411528930d7a83f98b1749864f2ef"},
+    {file = "watchfiles-0.20.0-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:13f995d5152a8ba4ed7c2bbbaeee4e11a5944defc7cacd0ccb4dcbdcfd78029a"},
+    {file = "watchfiles-0.20.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:9b5c8d3be7b502f8c43a33c63166ada8828dbb0c6d49c8f9ce990a96de2f5a49"},
+    {file = "watchfiles-0.20.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e43af4464daa08723c04b43cf978ab86cc55c684c16172622bdac64b34e36af0"},
+    {file = "watchfiles-0.20.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87d9e1f75c4f86c93d73b5bd1ebe667558357548f11b4f8af4e0e272f79413ce"},
+    {file = "watchfiles-0.20.0.tar.gz", hash = "sha256:728575b6b94c90dd531514677201e8851708e6e4b5fe7028ac506a200b622019"},
+]
+
+[package.dependencies]
+anyio = ">=3.0.0"
+
+[[package]]
+name = "websockets"
+version = "11.0.3"
+description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "websockets-11.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3ccc8a0c387629aec40f2fc9fdcb4b9d5431954f934da3eaf16cdc94f67dbfac"},
+    {file = "websockets-11.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d67ac60a307f760c6e65dad586f556dde58e683fab03323221a4e530ead6f74d"},
+    {file = "websockets-11.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84d27a4832cc1a0ee07cdcf2b0629a8a72db73f4cf6de6f0904f6661227f256f"},
+    {file = "websockets-11.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffd7dcaf744f25f82190856bc26ed81721508fc5cbf2a330751e135ff1283564"},
+    {file = "websockets-11.0.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7622a89d696fc87af8e8d280d9b421db5133ef5b29d3f7a1ce9f1a7bf7fcfa11"},
+    {file = "websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bceab846bac555aff6427d060f2fcfff71042dba6f5fca7dc4f75cac815e57ca"},
+    {file = "websockets-11.0.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:54c6e5b3d3a8936a4ab6870d46bdd6ec500ad62bde9e44462c32d18f1e9a8e54"},
+    {file = "websockets-11.0.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:41f696ba95cd92dc047e46b41b26dd24518384749ed0d99bea0a941ca87404c4"},
+    {file = "websockets-11.0.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:86d2a77fd490ae3ff6fae1c6ceaecad063d3cc2320b44377efdde79880e11526"},
+    {file = "websockets-11.0.3-cp310-cp310-win32.whl", hash = "sha256:2d903ad4419f5b472de90cd2d40384573b25da71e33519a67797de17ef849b69"},
+    {file = "websockets-11.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:1d2256283fa4b7f4c7d7d3e84dc2ece74d341bce57d5b9bf385df109c2a1a82f"},
+    {file = "websockets-11.0.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e848f46a58b9fcf3d06061d17be388caf70ea5b8cc3466251963c8345e13f7eb"},
+    {file = "websockets-11.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aa5003845cdd21ac0dc6c9bf661c5beddd01116f6eb9eb3c8e272353d45b3288"},
+    {file = "websockets-11.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b58cbf0697721120866820b89f93659abc31c1e876bf20d0b3d03cef14faf84d"},
+    {file = "websockets-11.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:660e2d9068d2bedc0912af508f30bbeb505bbbf9774d98def45f68278cea20d3"},
+    {file = "websockets-11.0.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c1f0524f203e3bd35149f12157438f406eff2e4fb30f71221c8a5eceb3617b6b"},
+    {file = "websockets-11.0.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:def07915168ac8f7853812cc593c71185a16216e9e4fa886358a17ed0fd9fcf6"},
+    {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b30c6590146e53149f04e85a6e4fcae068df4289e31e4aee1fdf56a0dead8f97"},
+    {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:619d9f06372b3a42bc29d0cd0354c9bb9fb39c2cbc1a9c5025b4538738dbffaf"},
+    {file = "websockets-11.0.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:01f5567d9cf6f502d655151645d4e8b72b453413d3819d2b6f1185abc23e82dd"},
+    {file = "websockets-11.0.3-cp311-cp311-win32.whl", hash = "sha256:e1459677e5d12be8bbc7584c35b992eea142911a6236a3278b9b5ce3326f282c"},
+    {file = "websockets-11.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:e7837cb169eca3b3ae94cc5787c4fed99eef74c0ab9506756eea335e0d6f3ed8"},
+    {file = "websockets-11.0.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9f59a3c656fef341a99e3d63189852be7084c0e54b75734cde571182c087b152"},
+    {file = "websockets-11.0.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2529338a6ff0eb0b50c7be33dc3d0e456381157a31eefc561771ee431134a97f"},
+    {file = "websockets-11.0.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34fd59a4ac42dff6d4681d8843217137f6bc85ed29722f2f7222bd619d15e95b"},
+    {file = "websockets-11.0.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:332d126167ddddec94597c2365537baf9ff62dfcc9db4266f263d455f2f031cb"},
+    {file = "websockets-11.0.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:6505c1b31274723ccaf5f515c1824a4ad2f0d191cec942666b3d0f3aa4cb4007"},
+    {file = "websockets-11.0.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f467ba0050b7de85016b43f5a22b46383ef004c4f672148a8abf32bc999a87f0"},
+    {file = "websockets-11.0.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:9d9acd80072abcc98bd2c86c3c9cd4ac2347b5a5a0cae7ed5c0ee5675f86d9af"},
+    {file = "websockets-11.0.3-cp37-cp37m-win32.whl", hash = "sha256:e590228200fcfc7e9109509e4d9125eace2042fd52b595dd22bbc34bb282307f"},
+    {file = "websockets-11.0.3-cp37-cp37m-win_amd64.whl", hash = "sha256:b16fff62b45eccb9c7abb18e60e7e446998093cdcb50fed33134b9b6878836de"},
+    {file = "websockets-11.0.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fb06eea71a00a7af0ae6aefbb932fb8a7df3cb390cc217d51a9ad7343de1b8d0"},
+    {file = "websockets-11.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8a34e13a62a59c871064dfd8ffb150867e54291e46d4a7cf11d02c94a5275bae"},
+    {file = "websockets-11.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4841ed00f1026dfbced6fca7d963c4e7043aa832648671b5138008dc5a8f6d99"},
+    {file = "websockets-11.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a073fc9ab1c8aff37c99f11f1641e16da517770e31a37265d2755282a5d28aa"},
+    {file = "websockets-11.0.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:68b977f21ce443d6d378dbd5ca38621755f2063d6fdb3335bda981d552cfff86"},
+    {file = "websockets-11.0.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1a99a7a71631f0efe727c10edfba09ea6bee4166a6f9c19aafb6c0b5917d09c"},
+    {file = "websockets-11.0.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:bee9fcb41db2a23bed96c6b6ead6489702c12334ea20a297aa095ce6d31370d0"},
+    {file = "websockets-11.0.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4b253869ea05a5a073ebfdcb5cb3b0266a57c3764cf6fe114e4cd90f4bfa5f5e"},
+    {file = "websockets-11.0.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:1553cb82942b2a74dd9b15a018dce645d4e68674de2ca31ff13ebc2d9f283788"},
+    {file = "websockets-11.0.3-cp38-cp38-win32.whl", hash = "sha256:f61bdb1df43dc9c131791fbc2355535f9024b9a04398d3bd0684fc16ab07df74"},
+    {file = "websockets-11.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:03aae4edc0b1c68498f41a6772d80ac7c1e33c06c6ffa2ac1c27a07653e79d6f"},
+    {file = "websockets-11.0.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:777354ee16f02f643a4c7f2b3eff8027a33c9861edc691a2003531f5da4f6bc8"},
+    {file = "websockets-11.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8c82f11964f010053e13daafdc7154ce7385ecc538989a354ccc7067fd7028fd"},
+    {file = "websockets-11.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3580dd9c1ad0701169e4d6fc41e878ffe05e6bdcaf3c412f9d559389d0c9e016"},
+    {file = "websockets-11.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f1a3f10f836fab6ca6efa97bb952300b20ae56b409414ca85bff2ad241d2a61"},
+    {file = "websockets-11.0.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:df41b9bc27c2c25b486bae7cf42fccdc52ff181c8c387bfd026624a491c2671b"},
+    {file = "websockets-11.0.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:279e5de4671e79a9ac877427f4ac4ce93751b8823f276b681d04b2156713b9dd"},
+    {file = "websockets-11.0.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:1fdf26fa8a6a592f8f9235285b8affa72748dc12e964a5518c6c5e8f916716f7"},
+    {file = "websockets-11.0.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:69269f3a0b472e91125b503d3c0b3566bda26da0a3261c49f0027eb6075086d1"},
+    {file = "websockets-11.0.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:97b52894d948d2f6ea480171a27122d77af14ced35f62e5c892ca2fae9344311"},
+    {file = "websockets-11.0.3-cp39-cp39-win32.whl", hash = "sha256:c7f3cb904cce8e1be667c7e6fef4516b98d1a6a0635a58a57528d577ac18a128"},
+    {file = "websockets-11.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:c792ea4eabc0159535608fc5658a74d1a81020eb35195dd63214dcf07556f67e"},
+    {file = "websockets-11.0.3-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f2e58f2c36cc52d41f2659e4c0cbf7353e28c8c9e63e30d8c6d3494dc9fdedcf"},
+    {file = "websockets-11.0.3-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de36fe9c02995c7e6ae6efe2e205816f5f00c22fd1fbf343d4d18c3d5ceac2f5"},
+    {file = "websockets-11.0.3-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0ac56b661e60edd453585f4bd68eb6a29ae25b5184fd5ba51e97652580458998"},
+    {file = "websockets-11.0.3-pp37-pypy37_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e052b8467dd07d4943936009f46ae5ce7b908ddcac3fda581656b1b19c083d9b"},
+    {file = "websockets-11.0.3-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:42cc5452a54a8e46a032521d7365da775823e21bfba2895fb7b77633cce031bb"},
+    {file = "websockets-11.0.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:e6316827e3e79b7b8e7d8e3b08f4e331af91a48e794d5d8b099928b6f0b85f20"},
+    {file = "websockets-11.0.3-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8531fdcad636d82c517b26a448dcfe62f720e1922b33c81ce695d0edb91eb931"},
+    {file = "websockets-11.0.3-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c114e8da9b475739dde229fd3bc6b05a6537a88a578358bc8eb29b4030fac9c9"},
+    {file = "websockets-11.0.3-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e063b1865974611313a3849d43f2c3f5368093691349cf3c7c8f8f75ad7cb280"},
+    {file = "websockets-11.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:92b2065d642bf8c0a82d59e59053dd2fdde64d4ed44efe4870fa816c1232647b"},
+    {file = "websockets-11.0.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0ee68fe502f9031f19d495dae2c268830df2760c0524cbac5d759921ba8c8e82"},
+    {file = "websockets-11.0.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcacf2c7a6c3a84e720d1bb2b543c675bf6c40e460300b628bab1b1efc7c034c"},
+    {file = "websockets-11.0.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b67c6f5e5a401fc56394f191f00f9b3811fe843ee93f4a70df3c389d1adf857d"},
+    {file = "websockets-11.0.3-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d5023a4b6a5b183dc838808087033ec5df77580485fc533e7dab2567851b0a4"},
+    {file = "websockets-11.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ed058398f55163a79bb9f06a90ef9ccc063b204bb346c4de78efc5d15abfe602"},
+    {file = "websockets-11.0.3-py3-none-any.whl", hash = "sha256:6681ba9e7f8f3b19440921e99efbb40fc89f26cd71bf539e45d8c8a25c976dc6"},
+    {file = "websockets-11.0.3.tar.gz", hash = "sha256:88fc51d9a26b10fc331be344f1781224a375b78488fc343620184e95a4b27016"},
+]
+
 [[package]]
 name = "wheel"
 version = "0.41.2"
@@ -2732,4 +2957,4 @@ ct2 = ["ctranslate2"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<4.0"
-content-hash = "015ca70062293ec1c0e0c60d9d409e32b8965dcda93052f29a2cb567b047c075"
+content-hash = "2db5df6bd4130cfad8cf4dc719e09414150b877301d86751c8206aead10f9869"
diff --git a/libs/infinity_emb/pyproject.toml b/libs/infinity_emb/pyproject.toml
index 32491860..969b7821 100644
--- a/libs/infinity_emb/pyproject.toml
+++ b/libs/infinity_emb/pyproject.toml
@@ -14,7 +14,7 @@ pydantic = ">=2.4.2,<3"
 torch = ">=2.0.0, !=2.0.1, !=2.1.0"
 sentence-transformers = "^2.2.2"
 prometheus-fastapi-instrumentator = "^6.1.0"
-uvicorn = "^0.23.2"
+uvicorn = {extras = ["standard"], version = "^0.23.2"}
 rich = "^13.6.0"
 numpy = "^1"
 ctranslate2 = {version = "^3.20.0", optional=true}
diff --git a/libs/infinity_emb/tests/unit_test/inference/test_batch_handler.py b/libs/infinity_emb/tests/unit_test/inference/test_batch_handler.py
index f21aeacc..8aecafc2 100644
--- a/libs/infinity_emb/tests/unit_test/inference/test_batch_handler.py
+++ b/libs/infinity_emb/tests/unit_test/inference/test_batch_handler.py
@@ -11,7 +11,6 @@
 from infinity_emb.inference import BatchHandler
 from infinity_emb.inference.models import (
     SentenceTransformerPatched,
-    get_lengths_with_tokenize,
 )
 
 BATCH_SIZE = 32