
format and add uvicorn
michaelfeil committed Oct 12, 2023
1 parent 53a58c9 · commit a7358ee
Showing 6 changed files with 233 additions and 8 deletions.
README.md (2 changes: 1 addition & 1 deletion)
```diff
@@ -4,7 +4,7 @@ Embedding Inference Server - finding TGI for embeddings
 ## Why Infinity:
 Infinity provides the following features:
 - **Fast inference**: The inference server is built on top of [torch](https://pytorch.org) and [ctranslate2](https://github.com/OpenNMT/CTranslate2) under the hood, getting the most out of your **CUDA** or **CPU** hardware.
-- **Dynamic, optimal batching**: New embedding requests are queued while the GPU is busy with the previous ones. New requests are squeezed into your GPU/CPU as soon as it is ready.
+- **Dynamic batching**: New embedding requests are queued while the GPU is busy with the previous ones. New requests are squeezed into your GPU/CPU as soon as it is ready.
 - **Correct and tested implementation**: Unit and end-to-end tested. API embeddings are identical to [sentence-transformers](https://github.com/UKPLab/sentence-transformers/) (up to numerical precision). Lets API users create embeddings till infinity and beyond.
 - **Easy to use**: The API is built on top of [FastAPI](https://fastapi.tiangolo.com/) and fully documented via [Swagger](https://swagger.io/). API specs are aligned to OpenAI. See below on how to get started.
```
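The commit title also mentions adding uvicorn, the ASGI server typically used to run a FastAPI app like the one the README describes. A minimal launch sketch; the import string `infinity_emb.server:app`, host, and port below are illustrative assumptions, not taken from this commit:

```python
# Minimal sketch: serving a FastAPI/ASGI app with uvicorn.
import uvicorn

if __name__ == "__main__":
    # "infinity_emb.server:app" is a hypothetical import string; point it
    # at the package's actual FastAPI instance.
    uvicorn.run("infinity_emb.server:app", host="0.0.0.0", port=8080)
```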
libs/infinity_emb/infinity_emb/inference/batch_handler.py (4 changes: 3 additions & 1 deletion)
```diff
@@ -142,7 +142,9 @@ async def schedule(self, sentences: List[str]) -> tuple[List[NpEmbeddingType], int]:
         uuid_event = []
         prioqueue = []

-        prios, usage = get_lengths_with_tokenize(sentences) #, self.model.tokenize_lengths)
+        prios, usage = get_lengths_with_tokenize(
+            sentences
+        )  # , self.model.tokenize_lengths)

         for s, p in zip(sentences, prios):
             inner = EmbeddingResult(sentence=s, event=EventTS(self._threadpool))
```
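The reformatted call still passes only `sentences`, with the `self.model.tokenize_lengths` argument left commented out. A sketch of what such a helper could return, assuming one length per sentence (used as the priority `p` pushed into `prioqueue`) plus the batch's total usage; the character-count fallback is an assumption:

```python
from typing import List, Tuple


def get_lengths_with_tokenize(sentences: List[str]) -> Tuple[List[int], int]:
    """Return per-sentence lengths (queue priorities) and the total usage."""
    # Assumption: with no tokenize_lengths function wired in, character
    # counts serve as a cheap proxy for token counts.
    lengths = [len(s) for s in sentences]
    return lengths, sum(lengths)
```

Scheduling by length lets the batcher group similar-length sentences, which keeps padding per forward pass low.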
libs/infinity_emb/infinity_emb/inference/models.py (5 changes: 2 additions & 3 deletions)
```diff
@@ -77,7 +77,6 @@ def __init__(self, *args, **kwargs):
         self._infinity_tokenizer = copy.deepcopy(self._first_module().tokenizer)

     def encode_pre(self, sentences) -> Dict[str, Tensor]:
-
         features = self.tokenize(sentences)

         return features
@@ -86,10 +85,10 @@ def encode_core(self, features: Dict[str, Tensor]) -> Tensor:
         """
         Computes sentence embeddings
         """
-
         with torch.inference_mode():
             device = self._target_device
-            features = util.batch_to_device(features, device)
+            features = util.batch_to_device(features, device)
             out_features = self.forward(features)["sentence_embedding"]
+
         return out_features
```
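To see how the two methods above fit together: `encode_pre` tokenizes on the calling thread, while `encode_core` moves the batch to the target device and runs the forward pass under `torch.inference_mode()`. A self-contained sketch in the same spirit; the class name `PatchedST` is hypothetical, but the method bodies mirror the diff:

```python
from typing import Dict

import torch
from sentence_transformers import SentenceTransformer, util
from torch import Tensor


class PatchedST(SentenceTransformer):
    """Hypothetical stand-in for the patched model in models.py."""

    def encode_pre(self, sentences) -> Dict[str, Tensor]:
        # Tokenization is cheap and thread-friendly; keep it outside
        # the inference section.
        return self.tokenize(sentences)

    def encode_core(self, features: Dict[str, Tensor]) -> Tensor:
        # Disable autograd bookkeeping for the forward pass.
        with torch.inference_mode():
            features = util.batch_to_device(features, self._target_device)
            return self.forward(features)["sentence_embedding"]


model = PatchedST("sentence-transformers/all-MiniLM-L6-v2")
features = model.encode_pre(["Hello world", "Embed me"])
embeddings = model.encode_core(features)  # shape: (2, embedding_dim)
```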