Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update tokenizer.py to support HF #42

Merged
merged 1 commit into from
Jan 26, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 66 additions & 27 deletions moatless/utils/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,83 @@
import os

# Per-model caches so clients/encoders/tokenizers are constructed once
# and reused across calls.
_voyage_clients = {}
_tiktoken_encoders = {}
_hf_tokenizers = {}


def count_tokens(content: str, model: str = "gpt-3.5-turbo") -> int:
    """
    Count tokens in `content` based on the model name.

    Resolution order:
      1) If `model` starts with "voyage", use a cached VoyageAI client.
      2) Otherwise try `tiktoken`. If tiktoken does not recognize the
         model name (raises KeyError), fall back to a Hugging Face
         tokenizer loaded via `transformers.AutoTokenizer`.

    Raises:
        ImportError: if the package required for the chosen path
            (`voyageai`, `tiktoken`, or `transformers`) is not installed.
    """
    if model.startswith("voyage"):
        if model not in _voyage_clients:
            # Lazy-import VoyageAI & create a client
            try:
                import voyageai
            except ImportError as e:
                raise ImportError(
                    "`voyageai` package not found, please run `pip install voyageai`"
                ) from e

            _voyage_clients[model] = voyageai.Client()

        # VoyageAI expects a list of texts
        return _voyage_clients[model].count_tokens([content])

    # Import tiktoken outside the KeyError-guarded block below so that a
    # missing package is reported as ImportError and never mistaken for an
    # unknown-model fallback.
    try:
        import tiktoken
    except ImportError as e:
        raise ImportError(
            "`tiktoken` package not found, please run `pip install tiktoken`"
        ) from e

    try:
        # If we don't already have a tiktoken encoder for this model,
        # try to fetch it. This will raise KeyError if the model name
        # is unknown (e.g. a Hugging Face model).
        if model not in _tiktoken_encoders:
            # Temporarily point tiktoken at a bundled cache directory if the
            # caller has not configured one.
            should_revert = False
            if "TIKTOKEN_CACHE_DIR" not in os.environ:
                should_revert = True
                os.environ["TIKTOKEN_CACHE_DIR"] = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)),
                    "_static",
                    "tiktoken_cache",
                )

            # try/finally guarantees the environment variable is removed even
            # when encoding_for_model raises (e.g. KeyError for non-OpenAI
            # model names) — otherwise the temporary setting would leak.
            try:
                _tiktoken_encoders[model] = tiktoken.encoding_for_model(model)
            finally:
                if should_revert:
                    del os.environ["TIKTOKEN_CACHE_DIR"]

        # Now we can encode
        encoder = _tiktoken_encoders[model]
        return len(encoder.encode(content, allowed_special="all"))

    except KeyError:
        # tiktoken raised KeyError, meaning it does not recognize this
        # `model` as a valid OpenAI model. Fall back to a Hugging Face
        # tokenizer, treating `model` as a HF model id.
        if model not in _hf_tokenizers:
            try:
                from transformers import AutoTokenizer
            except ImportError as e:
                raise ImportError(
                    "Hugging Face `transformers` not found. "
                    "Please install via `pip install transformers`."
                ) from e

            _hf_tokenizers[model] = AutoTokenizer.from_pretrained(model)

        hf_tokenizer = _hf_tokenizers[model]
        # For HF, simply return the length of the encoded IDs
        return len(hf_tokenizer.encode(content))
Loading