Merge pull request #39 from michaelfeil/device-feature
add `device` to the CLI and Python API to force device usage. The default is `auto`, which selects `cuda` if available, else `cpu`
michaelfeil authored Dec 4, 2023
2 parents 234b6d7 + c310a32 commit db2060d
Showing 11 changed files with 752 additions and 673 deletions.
4 changes: 4 additions & 0 deletions libs/infinity_emb/infinity_emb/inference/__init__.py
@@ -1,5 +1,7 @@
from infinity_emb.inference.batch_handler import BatchHandler
from infinity_emb.inference.primitives import (
Device,
DeviceTypeHint,
EmbeddingResult,
NpEmbeddingType,
PrioritizedQueueItem,
@@ -10,6 +12,8 @@
"EmbeddingResult",
"NpEmbeddingType",
"PrioritizedQueueItem",
"Device",
"DeviceTypeHint",
"BatchHandler",
"select_model_to_functional",
]
13 changes: 12 additions & 1 deletion libs/infinity_emb/infinity_emb/inference/primitives.py
@@ -1,14 +1,25 @@
import asyncio
import enum
import time
from dataclasses import dataclass, field
from typing import Optional
from typing import Dict, Optional
from uuid import uuid4

import numpy as np

NpEmbeddingType = np.ndarray


class Device(enum.Enum):
cpu = "cpu"
cuda = "cuda"
auto = None


_devices: Dict[str, str] = {e.name: e.name for e in Device}
DeviceTypeHint = enum.Enum("DeviceTypeHint", _devices) # type: ignore


@dataclass(order=True)
class EmbeddingResult:
sentence: str = field(compare=False)
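The new `Device` enum carries the real runtime values (`None` for `auto`), while `DeviceTypeHint` is a string-valued mirror of the member names, following the same pattern as `InferenceEngineTypeHint` in transformer/utils.py below; typer can present such a string enum as CLI choices. Both are re-exported from `infinity_emb.inference` (see the __init__.py hunk above). A minimal, self-contained sketch of the pattern (the `parse_device` helper is illustrative only and not part of this diff):

```python
import enum
from typing import Dict


class Device(enum.Enum):
    cpu = "cpu"
    cuda = "cuda"
    auto = None  # resolved to a concrete device by the selected engine


# String-valued mirror of the member names, usable as CLI choices.
_devices: Dict[str, str] = {e.name: e.name for e in Device}
DeviceTypeHint = enum.Enum("DeviceTypeHint", _devices)  # type: ignore


def parse_device(hint) -> Device:
    # Illustrative helper: map the CLI-facing hint back to the real enum,
    # mirroring `Device[device.name]` in `_start_uvicorn` further down.
    return Device[hint.name]


assert parse_device(DeviceTypeHint.cuda) is Device.cuda
assert parse_device(DeviceTypeHint.auto) is Device.auto
```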
15 changes: 11 additions & 4 deletions libs/infinity_emb/infinity_emb/inference/select_model.py
@@ -1,17 +1,24 @@
from time import perf_counter
from typing import List, Optional, Tuple

from infinity_emb.inference.primitives import EmbeddingResult, NpEmbeddingType
from infinity_emb.inference.primitives import Device, EmbeddingResult, NpEmbeddingType
from infinity_emb.log_handler import logger
from infinity_emb.transformer.abstract import BaseTransformer
from infinity_emb.transformer.utils import InferenceEngine


def select_model_to_functional(
model_name_or_path: str, batch_size: int, engine: InferenceEngine, model_warmup=True
model_name_or_path: str,
batch_size: int,
engine: InferenceEngine,
model_warmup=True,
device: Device = Device.auto,
):
logger.info(f"model=`{model_name_or_path}` selected, using engine=`{engine.value}`")
init_engine = engine.value(model_name_or_path)
logger.info(
f"model=`{model_name_or_path}` selected, using engine=`{engine.value}`"
f" and device=`{device.value}`"
)
init_engine = engine.value(model_name_or_path, device=device.value)

min_inference_t = 4e-3
if model_warmup:
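`select_model_to_functional` now forwards `device.value` to the engine constructor, so `Device.auto` arrives as `None` and each engine is free to resolve it. Per the commit message, `auto` selects `cuda` when available and falls back to `cpu`; a minimal sketch of that resolution, assuming a torch-based check (the `resolve_device` name is hypothetical and not part of this diff):

```python
from typing import Optional

import torch


def resolve_device(device: Optional[str]) -> str:
    # Hypothetical illustration of the `auto` behaviour described in the
    # commit message: None (i.e. Device.auto) picks cuda when available.
    if device is None:
        return "cuda" if torch.cuda.is_available() else "cpu"
    return device


print(resolve_device(None))   # "cuda" on a GPU machine, otherwise "cpu"
print(resolve_device("cpu"))  # always "cpu"
```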
26 changes: 20 additions & 6 deletions libs/infinity_emb/infinity_emb/infinity_server.py
@@ -1,5 +1,5 @@
import time
from typing import List
from typing import List, Union

# prometheus
import infinity_emb
@@ -10,7 +12,7 @@
OpenAIEmbeddingResult,
OpenAIModelInfo,
)
from infinity_emb.inference import BatchHandler, select_model_to_functional
from infinity_emb.inference import (
BatchHandler,
Device,
DeviceTypeHint,
select_model_to_functional,
)
from infinity_emb.inference.caching_layer import INFINITY_CACHE_VECTORS
from infinity_emb.log_handler import UVICORN_LOG_LEVELS, logger
from infinity_emb.transformer.utils import InferenceEngine, InferenceEngineTypeHint
@@ -24,6 +29,7 @@ def __init__(
engine: InferenceEngine = InferenceEngine.torch,
model_warmup=True,
vector_disk_cache_path: str = "",
device: Union[Device, str] = Device.auto,
) -> None:
"""Creating a Async EmbeddingEngine object.
@@ -35,6 +41,7 @@ def __init__(
model_warmup, bool: decide if warmup with max batch size . Defaults to True.
vector_disk_cache_path, str: file path to folder of cache.
Defaults to "" - default no caching.
device, Device: device to use for inference. Defaults to Device.auto
Example:
```python
@@ -49,12 +56,13 @@ def __init__(
"""
self.batch_size = batch_size
self.running = False
self._vector_disk_cache_path=vector_disk_cache_path,
self._vector_disk_cache_path = vector_disk_cache_path
self._model, self._min_inference_t = select_model_to_functional(
model_name_or_path=model_name_or_path,
batch_size=batch_size,
engine=engine,
model_warmup=model_warmup
model_warmup=model_warmup,
device=Device[device] if isinstance(device, str) else device,
)

async def astart(self):
@@ -129,6 +137,7 @@ def create_server(
verbose: bool = False,
model_warmup=True,
vector_disk_cache=INFINITY_CACHE_VECTORS,
device: Device = Device.auto,
doc_extra: dict = {},
):
"""
@@ -164,6 +173,7 @@ async def _startup():
batch_size=batch_size,
engine=engine,
model_warmup=model_warmup,
device=device,
)

app.batch_handler = BatchHandler(
@@ -258,7 +268,7 @@ async def _embeddings(data: OpenAIEmbeddingInput):
return app


def start_uvicorn(
def _start_uvicorn(
model_name_or_path: str = "BAAI/bge-small-en-v1.5",
batch_size: int = 64,
url_prefix: str = "/v1",
@@ -268,6 +278,7 @@ def start_uvicorn(
engine: InferenceEngineTypeHint = InferenceEngineTypeHint.torch.name, # type: ignore # noqa
model_warmup: bool = True,
vector_disk_cache: bool = INFINITY_CACHE_VECTORS,
device: DeviceTypeHint = DeviceTypeHint.auto.name,
):
"""Infinity Embedding API ♾️ cli to start a uvicorn-server instance;
MIT License; Copyright (c) 2023 Michael Feil
@@ -286,10 +297,12 @@ def start_uvicorn(
Defaults to True.
vector_disk_cache, bool: cache past embeddings in SQL.
Defaults to False or env-INFINITY_CACHE_VECTORS if set
device, Device: device to use for inference. Defaults to Device.auto or "auto"
"""
import uvicorn

engine_load: InferenceEngine = InferenceEngine[engine.name]
device: Device = Device[device.name]
logger.setLevel(log_level.to_int())

app = create_server(
@@ -301,6 +314,7 @@ def start_uvicorn(
doc_extra=dict(host=host, port=port),
model_warmup=model_warmup,
vector_disk_cache=vector_disk_cache,
device=device,
)
uvicorn.run(app, host=host, port=port, log_level=log_level.name)

@@ -309,7 +323,7 @@ def cli():
"""fires the command line using Python `typer.run()`"""
import typer

typer.run(start_uvicorn)
typer.run(_start_uvicorn)


# app = create_server()
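Taken together, `device` is now configurable both from Python and from the CLI, where typer should expose a `--device` option with choices `cpu`, `cuda`, and `auto` derived from the `DeviceTypeHint` parameter on `_start_uvicorn`. A usage sketch of the Python API, assuming the engine class documented above is exported as `AsyncEmbeddingEngine` from `infinity_emb` (the import path is an assumption, not shown in this diff):

```python
import asyncio

from infinity_emb import AsyncEmbeddingEngine  # assumed export location


async def main() -> None:
    # `device` accepts either a Device member or its string name;
    # plain strings are converted via `Device[device]` in __init__ above.
    engine = AsyncEmbeddingEngine(
        model_name_or_path="BAAI/bge-small-en-v1.5",
        batch_size=64,
        device="cuda",  # or "cpu" / "auto"
    )
    await engine.astart()
    # ... submit embedding requests, then shut the engine down
    # (the shutdown call is not shown in this diff) ...


asyncio.run(main())
```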
8 changes: 5 additions & 3 deletions libs/infinity_emb/infinity_emb/transformer/fastembed.py
@@ -24,15 +24,17 @@ def __init__(self, *args, **kwargs):
raise ImportError(
"fastembed is not installed." "`pip install infinity-emb[fastembed]`"
)
providers = ["CPUExecutionProvider"]

if not kwargs.get("cache_dir"):
from infinity_emb.transformer.utils import infinity_cache_dir

kwargs["cache_dir"] = infinity_cache_dir()
if kwargs.pop("device", None) != "cpu":
providers = ["CUDAExecutionProvider"] + providers
super(DefaultEmbedding, self).__init__(*args, **kwargs)
self._infinity_tokenizer = copy.deepcopy(self.model.tokenizer)
self.model.model.set_providers(
["CUDAExecutionProvider", "CPUExecutionProvider"]
)
self.model.model.set_providers(providers)

def encode_pre(self, sentences: List[str]) -> Dict[str, np.ndarray[int]]:
encoded = self.model.tokenizer.encode_batch(sentences)
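In the fastembed backend, the ONNX provider list is now built from the requested device: CPU execution stays available as a fallback, and CUDA is only prepended when the device is not explicitly `"cpu"`, instead of always setting both providers as before. Restated as a standalone helper for clarity (the function name is illustrative, not part of this diff):

```python
from typing import List, Optional


def onnx_providers_for(device: Optional[str]) -> List[str]:
    # CPU execution is always kept as a fallback; CUDA is prepended
    # unless the caller explicitly asked for "cpu".
    providers = ["CPUExecutionProvider"]
    if device != "cpu":
        providers = ["CUDAExecutionProvider"] + providers
    return providers


assert onnx_providers_for("cpu") == ["CPUExecutionProvider"]
assert onnx_providers_for(None) == ["CUDAExecutionProvider", "CPUExecutionProvider"]
```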
Original file line number Diff line number Diff line change
@@ -57,6 +57,8 @@ def __init__(self, *args, **kwargs):
# without corrupting the original.
fm = self._first_module()
self._infinity_tokenizer = copy.deepcopy(fm.tokenizer)
self.eval()

if OPTIMUM_AVAILABLE and not os.environ.get("INFINITY_DISABLE_OPTIMUM", False):
logger.info(
"Adding optimizations via Huggingface optimum. "
@@ -78,7 +80,6 @@ def __init__(self, *args, **kwargs):
"install `pip install infinity-emb[optimum]`"
)

self.eval()
if self._target_device.type == "cuda" and os.environ.get(
"INFINITY_TORCH_ENABLE_HALF", False
):
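In the torch sentence-transformers backend (its filename is not shown in this view), `self.eval()` now runs before the optimum block, so the model is switched to inference mode whether or not optimum optimizations are applied; half precision stays opt-in behind the `INFINITY_TORCH_ENABLE_HALF` environment variable and a CUDA target device. A small standalone illustration of that gate, using a placeholder module rather than the actual model class:

```python
import os

import torch

# Placeholder module standing in for the loaded sentence-transformer.
model = torch.nn.Linear(4, 4)
target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.eval()  # inference mode is set unconditionally, before any optimization
if target_device.type == "cuda" and os.environ.get("INFINITY_TORCH_ENABLE_HALF", False):
    # Opt-in half precision on GPU, mirroring the gate in the hunk above.
    model = model.half()
```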
4 changes: 2 additions & 2 deletions libs/infinity_emb/infinity_emb/transformer/utils.py
@@ -26,8 +26,8 @@ class InferenceEngine(Enum):
debugengine = DummyTransformer


types: Dict[str, str] = {e.name: e.name for e in InferenceEngine}
InferenceEngineTypeHint = Enum("InferenceEngineTypeHint", types) # type: ignore
_types: Dict[str, str] = {e.name: e.name for e in InferenceEngine}
InferenceEngineTypeHint = Enum("InferenceEngineTypeHint", _types) # type: ignore


def length_tokenizer(