Add verify API #27

Open
wants to merge 31 commits into base: main

31 commits
81ce78c
Add verify API
CL-ModelCloud Feb 11, 2025
3cd941f
code clean up
CL-ModelCloud Feb 11, 2025
ae82929
add test code
CL-ModelCloud Feb 11, 2025
224e1d7
move verify logic to validate
CL-ModelCloud Feb 11, 2025
9d56dd8
Add config & verify datasets
CL-ModelCloud Feb 11, 2025
38e220a
code review
CL-ModelCloud Feb 11, 2025
5236029
update verify datasets
CL-ModelCloud Feb 11, 2025
4207fde
update verify datasets
CL-ModelCloud Feb 11, 2025
2ee9c7a
verify datasets opt
CL-ModelCloud Feb 11, 2025
e87c8b2
update save_verify() and verify() APi
CL-ModelCloud Feb 11, 2025
082b3eb
add special char to verify datasets
CL-ModelCloud Feb 11, 2025
b55ecf7
Updated VERIFY_DATASETS
CL-ModelCloud Feb 12, 2025
4f90aa3
code clean up
CL-ModelCloud Feb 12, 2025
abcb657
Update Verify API
CL-ModelCloud Feb 12, 2025
1ae1ffc
Add exception
CL-ModelCloud Feb 12, 2025
c461346
code review
CL-ModelCloud Feb 12, 2025
5fed595
update api
CL-ModelCloud Feb 12, 2025
2805e46
update code
CL-ModelCloud Feb 12, 2025
7dca9be
code clean up
CL-ModelCloud Feb 12, 2025
215c159
code review
CL-ModelCloud Feb 12, 2025
db56d57
code opt
CL-ModelCloud Feb 12, 2025
09db128
code review
CL-ModelCloud Feb 12, 2025
24c4cd5
fix bug
CL-ModelCloud Feb 12, 2025
4a6379f
Update config.py
Qubitium Feb 12, 2025
d994a2d
code clean up
CL-ModelCloud Feb 12, 2025
3693c30
code opt & code clean up
CL-ModelCloud Feb 12, 2025
a0cea36
add copyright
CL-ModelCloud Feb 12, 2025
5cc3201
code opt
CL-ModelCloud Feb 12, 2025
20d18d7
fix bug
CL-ModelCloud Feb 12, 2025
3e8f5ef
code review
CL-ModelCloud Feb 12, 2025
8e86b18
ruff check
CL-ModelCloud Feb 13, 2025
6 changes: 4 additions & 2 deletions setup.py
@@ -17,14 +17,16 @@
from setuptools import setup, find_packages
from pathlib import Path

__version__ = "0.1.0-dev"
version_vars = {}
exec("exec(open('tokenicer/version.py').read()); version=__version__", {}, version_vars)
tokenicer_version = version_vars['version']

with open("requirements.txt") as f:
requirements = f.read().splitlines()

setup(
name="tokenicer",
version=__version__,
version=tokenicer_version,
author="ModelCloud",
author_email="[email protected]",
description="A (nicer) tokenizer you want to use for model `inference` and `training`: with all known peventable `gotchas` normalized or auto-fixed.",
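The setup.py change above swaps the hardcoded `__version__` string for one read out of `tokenicer/version.py` at build time, without importing the (possibly not-yet-installed) package. The same pattern can be exercised standalone; the temporary file below is a stand-in for `tokenicer/version.py`, and the version string is the one the diff removes:

```python
import os
import tempfile

# Stand-in for tokenicer/version.py so the snippet is self-contained.
with tempfile.TemporaryDirectory() as tmpdir:
    version_file = os.path.join(tmpdir, "version.py")
    with open(version_file, "w") as f:
        f.write('__version__ = "0.1.0-dev"\n')

    # Same pattern as the setup.py diff: execute version.py inside an
    # isolated namespace, then copy __version__ into a local dict.
    version_vars = {}
    exec(
        f"exec(open({version_file!r}).read()); version=__version__",
        {},
        version_vars,
    )
    print(version_vars["version"])  # 0.1.0-dev
```

The nested `exec` runs the file's assignments in the `version_vars` locals dict, so `setup(version=...)` can use the value without polluting the build script's own namespace.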
7 changes: 3 additions & 4 deletions tests/test_model_config.py
@@ -19,7 +19,6 @@


class TestModelConfig(unittest.TestCase):

def test_model_config(self):
model_path = "/monster/data/model/mpt-7b-instruct"
tokenicer = Tokenicer.load(model_path)
@@ -30,11 +29,11 @@ def test_model_config(self):
self.assertEqual(
tokenicer.model_config.bos_token_id,
expect_bos_token_id,
msg=f"Expected bos_token_id: `{expect_bos_token_id}`, actual=`{tokenicer.model_config.bos_token_id}`."
msg=f"Expected bos_token_id: `{expect_bos_token_id}`, actual=`{tokenicer.model_config.bos_token_id}`.",
)

self.assertEqual(
tokenicer.model_config.eos_token_id,
expect_eos_token_id,
msg=f"Expected eos_token_id: `{expect_eos_token_id}`, actual=`{tokenicer.model_config.eos_token_id}`."
)
msg=f"Expected eos_token_id: `{expect_eos_token_id}`, actual=`{tokenicer.model_config.eos_token_id}`.",
)
44 changes: 26 additions & 18 deletions tests/test_pad_token.py
@@ -25,24 +25,32 @@
class TestPadToken(unittest.TestCase):
@parameterized.expand(
[
('/monster/data/model/Llama-3.2-1B-Instruct', '<|reserved_special_token_0|>', ['<|reserved_special_token_0|>']),
('/monster/data/model/Phi-3-mini-4k-instruct', '<unk>'),
('/monster/data/model/Llama-3.2-1B-Instruct', '<|finetune_right_pad_id|>'),
('/monster/data/model/Qwen2.5-0.5B-Instruct', '<|fim_pad|>'),
('/monster/data/model/Qwen2-VL-2B-Instruct', '<|vision_pad|>'),
('/monster/data/model/gemma-2-9b', '<pad>'),
('/monster/data/model/Hymba-1.5B-Instruct', '<unk>', None, True),
('/monster/data/model/Mistral-7B-Instruct-v0.2', '<unk>'),
('/monster/data/model/Yi-Coder-1.5B-Chat', '<unk>'),
(AutoTokenizer.from_pretrained('/monster/data/model/glm-4-9b-chat-hf'), '<|endoftext|>')
(
"/monster/data/model/Llama-3.2-1B-Instruct",
"<|reserved_special_token_0|>",
["<|reserved_special_token_0|>"],
),
("/monster/data/model/Phi-3-mini-4k-instruct", "<unk>"),
("/monster/data/model/Llama-3.2-1B-Instruct", "<|finetune_right_pad_id|>"),
("/monster/data/model/Qwen2.5-0.5B-Instruct", "<|fim_pad|>"),
("/monster/data/model/Qwen2-VL-2B-Instruct", "<|vision_pad|>"),
("/monster/data/model/gemma-2-9b", "<pad>"),
("/monster/data/model/Hymba-1.5B-Instruct", "<unk>", None, True),
("/monster/data/model/Mistral-7B-Instruct-v0.2", "<unk>"),
("/monster/data/model/Yi-Coder-1.5B-Chat", "<unk>"),
(
AutoTokenizer.from_pretrained("/monster/data/model/glm-4-9b-chat-hf"),
"<|endoftext|>",
),
]
)
def test_pad_token(self,
tokenizer_or_path: str,
expect_pad_token: str,
pad_tokens: Optional[List[Union[str, int]]] = None,
trust_remote: bool = False
):
def test_pad_token(
self,
tokenizer_or_path: str,
expect_pad_token: str,
pad_tokens: Optional[List[Union[str, int]]] = None,
trust_remote: bool = False,
):
tokenicer = Tokenicer.load(tokenizer_or_path, trust_remote_code=trust_remote)

if pad_tokens is not None:
@@ -51,5 +59,5 @@ def test_pad_token(self,
self.assertEqual(
tokenicer.tokenizer.pad_token,
expect_pad_token,
msg=f"Expected pad_token: `{expect_pad_token}`, actual=`{tokenicer.tokenizer.pad_token}`."
)
msg=f"Expected pad_token: `{expect_pad_token}`, actual=`{tokenicer.tokenizer.pad_token}`.",
)
38 changes: 19 additions & 19 deletions tests/test_tokenicer_forward.py
@@ -18,56 +18,56 @@
from parameterized import parameterized
import unittest

class TestTokenicer(unittest.TestCase):

class TestTokenicer(unittest.TestCase):
@classmethod
def setUpClass(self):
self.pretrained_model_id = "/monster/data/model/Qwen2.5-0.5B-Instruct/"
self.tokenizer = Tokenicer.load(self.pretrained_model_id)
self.example = 'Test Case String'
self.example = "Test Case String"
self.expect_input_ids = [2271, 11538, 923]

def test_tokenicer_func(self):
input_ids = self.tokenizer(self.example)['input_ids']
input_ids = self.tokenizer(self.example)["input_ids"]
self.assertEqual(
input_ids,
self.expect_input_ids,
msg=f"Expected input_ids=`{self.expect_input_ids}`, actual=`{input_ids}`."
msg=f"Expected input_ids=`{self.expect_input_ids}`, actual=`{input_ids}`.",
)

@parameterized.expand(
[
('eos_token', "<|im_end|>"),
('pad_token', "<|fim_pad|>"),
('vocab_size', 151643)
("eos_token", "<|im_end|>"),
("pad_token", "<|fim_pad|>"),
("vocab_size", 151643),
]
)
def test_tokenicer_property(self, property, expect_token):
if property == 'eos_token':
if property == "eos_token":
result = self.tokenizer.eos_token
elif property == 'pad_token':
elif property == "pad_token":
result = self.tokenizer.pad_token
elif property == 'vocab_size':
elif property == "vocab_size":
result = self.tokenizer.vocab_size

self.assertEqual(
result,
expect_token,
msg=f"Expected {property}: `{expect_token}`, actual=`{result}`."
msg=f"Expected {property}: `{expect_token}`, actual=`{result}`.",
)

def test_tokenicer_encode(self):
input_ids = self.tokenizer.encode(self.example, add_special_tokens=False)
self.assertEqual(
input_ids,
self.expect_input_ids,
msg=f"Expected input_ids: `{self.expect_input_ids}`, actual=`{input_ids}`."
)
input_ids = self.tokenizer.encode(self.example, add_special_tokens=False)
self.assertEqual(
input_ids,
self.expect_input_ids,
msg=f"Expected input_ids: `{self.expect_input_ids}`, actual=`{input_ids}`.",
)

def test_tokenicer_decode(self):
example = self.tokenizer.decode(self.expect_input_ids, skip_special_tokens=True)
self.assertEqual(
self.example,
example,
msg=f"Expected example: `{self.example}`, actual=`{example}`."
)
msg=f"Expected example: `{self.example}`, actual=`{example}`.",
)
39 changes: 39 additions & 0 deletions tests/test_validate.py
@@ -0,0 +1,39 @@
# Copyright 2025 ModelCloud.ai
# Copyright 2025 [email protected]
# Contact: [email protected], x.com/qubitium
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import unittest
from tokenicer import Tokenicer
from tokenicer.const import VALIDATE_JSON_FILE_NAME
import tempfile


class TestValidate(unittest.TestCase):
def test_validate(self):
model_path = "/monster/data/model/Qwen2.5-0.5B-Instruct"
tokenicer = Tokenicer.load(model_path)

with tempfile.TemporaryDirectory() as tmpdir:
tokenicer.save(tmpdir)
validate_json_path = os.path.join(tmpdir, VALIDATE_JSON_FILE_NAME)
result = os.path.isfile(validate_json_path)
self.assertTrue(
result,
f"Save validate file failed: {validate_json_path} does not exist.",
)

validate = tokenicer.validate(tmpdir)
self.assertTrue(validate, f"Expected validate='True' but got '{validate}'.")
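The test above checks that `Tokenicer.save()` writes a `tokenizer_validate.json` file and that `validate()` accepts it afterwards. A minimal sketch of that round trip without a real model follows; the JSON layout is an assumption based on `ValidateConfig.to_dict()` in this PR, and the input/output pair is the one used by `tests/test_tokenicer_forward.py`:

```python
import json
import os
import tempfile

# File name matches VALIDATE_JSON_FILE_NAME from tokenicer/const.py.
VALIDATE_JSON_FILE_NAME = "tokenizer_validate.json"

# Assumed shape of the saved validate file: metadata plus input/output pairs.
payload = {
    "meta": {
        "validator": "tokenicer:0.1.0",
        "uri": "https://github.com/ModelCloud/Tokenicer",
    },
    "data": [
        # Token ids for "Test Case String" per tests/test_tokenicer_forward.py
        # (Qwen2.5-0.5B-Instruct tokenizer).
        {"format": "simple", "input": "Test Case String", "output": [2271, 11538, 923]},
    ],
}

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, VALIDATE_JSON_FILE_NAME)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False)

    # validate() would re-encode each stored input and compare token ids;
    # here we only check the file survives a save/load round trip intact.
    with open(path, encoding="utf-8") as f:
        loaded = json.load(f)

ok = loaded == payload
print(ok)  # True
```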
87 changes: 87 additions & 0 deletions tokenicer/config.py
@@ -0,0 +1,87 @@
# Copyright 2025 ModelCloud.ai
# Copyright 2025 [email protected]
# Contact: [email protected], x.com/qubitium
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Union, Any, Dict
from dataclasses import dataclass
from enum import Enum


class ValidateDataFormat(Enum):
SIMPLE = "simple"


@dataclass
class ValidateData:
format: ValidateDataFormat = ValidateDataFormat.SIMPLE
input: Union[str, Any] = None
output: List[int] = None

def __post_init__(self):
if self.input is None:
self.input = []

if self.output is None:
self.output = []


@dataclass
class ValidateMeta:
validator: str = None
uri: str = None

def __post_init__(self):
if self.validator is None:
from .version import __version__

self.validator = f"tokenicer:{__version__}"

if self.uri is None:
self.uri = "https://github.com/ModelCloud/Tokenicer"


@dataclass
class ValidateConfig:
meta: Optional[ValidateMeta] = None
data: List[ValidateData] = None

def __post_init__(self):
if self.meta is None:
self.meta = ValidateMeta()

if self.data is None:
self.data = []

def to_dict(self):
dataset_dict = [
{
"format": data.format.value,
"input": data.input,
"output": data.output,
}
for data in self.data
]

meta_dict = {"validator": self.meta.validator, "uri": self.meta.uri}

return {"meta": meta_dict, "data": dataset_dict}

@classmethod
def from_dict(cls, data: Dict):
meta_data = data.get("meta", {})
data_list = data.get("data", [])
meta = ValidateMeta(**meta_data) if meta_data else None
validate_data = [ValidateData(**item) for item in data_list]
return cls(meta=meta, data=validate_data)
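The `to_dict()`/`from_dict()` pair above serializes the validate config for the JSON file. A minimal standalone copy of the schema (with `ValidateMeta` simplified to a plain dict so the snippet runs on its own) shows the round trip; note that, as in the PR's `from_dict`, the `format` field comes back as the raw string rather than the enum:

```python
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Optional, Union


class ValidateDataFormat(Enum):
    SIMPLE = "simple"


@dataclass
class ValidateData:
    format: ValidateDataFormat = ValidateDataFormat.SIMPLE
    input: Union[str, Any] = None
    output: List[int] = None


@dataclass
class ValidateConfig:
    meta: Optional[Dict] = None  # simplified stand-in for ValidateMeta
    data: List[ValidateData] = None

    def to_dict(self):
        return {
            "meta": self.meta or {"validator": "tokenicer:0.1.0"},
            "data": [
                {"format": d.format.value, "input": d.input, "output": d.output}
                for d in self.data or []
            ],
        }

    @classmethod
    def from_dict(cls, raw: Dict):
        # As in the PR: items are rehydrated verbatim, so "format" stays a str.
        data = [ValidateData(**item) for item in raw.get("data", [])]
        return cls(meta=raw.get("meta"), data=data)


cfg = ValidateConfig(data=[ValidateData(input="hello", output=[1, 2, 3])])
raw = cfg.to_dict()
restored = ValidateConfig.from_dict(raw)
print(restored.data[0].output)  # [1, 2, 3]
```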
66 changes: 54 additions & 12 deletions tokenicer/const.py
@@ -17,20 +17,62 @@
from collections import namedtuple

DEFAULT_PAD_TOKENS = [
"<|finetune_right_pad_id|>",
"<|pad|>",
"<pad>",
"<|unk|>",
"<unk>"
"<|finetune_right_pad_id|>",
"<|pad|>",
"<pad>",
"<|unk|>",
"<unk>",
]

TOKEN_TUPLE = namedtuple("TokenTuple", ["token", "token_id"])

MODEL_PAD_TOKEN_MAP = {
"llama": TOKEN_TUPLE(token='<|finetune_right_pad_id|>', token_id=128004),
"qwen2_5_vl": TOKEN_TUPLE(token='<|vision_pad|>', token_id=151654),
"qwen2_vl": TOKEN_TUPLE(token='<|vision_pad|>', token_id=151654),
"qwen2": TOKEN_TUPLE(token='<|fim_pad|>', token_id=151662),
"deepseek_v3": TOKEN_TUPLE(token='<|▁pad▁|>', token_id=2),
"mpt": TOKEN_TUPLE(token='<|padding|>', token_id=1)
}
"llama": TOKEN_TUPLE(token="<|finetune_right_pad_id|>", token_id=128004),
"qwen2_5_vl": TOKEN_TUPLE(token="<|vision_pad|>", token_id=151654),
"qwen2_vl": TOKEN_TUPLE(token="<|vision_pad|>", token_id=151654),
"qwen2": TOKEN_TUPLE(token="<|fim_pad|>", token_id=151662),
"deepseek_v3": TOKEN_TUPLE(token="<|▁pad▁|>", token_id=2),
"mpt": TOKEN_TUPLE(token="<|padding|>", token_id=1),
}
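The keys of `MODEL_PAD_TOKEN_MAP` correspond to `model_type` values as they appear in a Hugging Face `config.json`. A lookup sketch using a subset of the map from the diff (the `model_type` value here is illustrative):

```python
from collections import namedtuple

TOKEN_TUPLE = namedtuple("TokenTuple", ["token", "token_id"])

# Subset of MODEL_PAD_TOKEN_MAP from tokenicer/const.py: known-good pad
# tokens per model family, keyed by the config.json model_type string.
MODEL_PAD_TOKEN_MAP = {
    "llama": TOKEN_TUPLE(token="<|finetune_right_pad_id|>", token_id=128004),
    "qwen2": TOKEN_TUPLE(token="<|fim_pad|>", token_id=151662),
    "mpt": TOKEN_TUPLE(token="<|padding|>", token_id=1),
}

model_type = "qwen2"  # illustrative: would come from the loaded model config
pad = MODEL_PAD_TOKEN_MAP.get(model_type)
if pad is not None:
    print(pad.token, pad.token_id)  # <|fim_pad|> 151662
```

Using `.get()` keeps unknown model types falling through to the generic `DEFAULT_PAD_TOKENS` candidates listed above.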

VALIDATE_JSON_FILE_NAME = "tokenizer_validate.json"
VALIDATE_ENCODE_PARAMS = {"return_tensors": "pt", "add_special_tokens": False}

VALIDATE_DATASETS = [
# English
"Sure! I'd be happy to help. What kind of writing prompt are you looking for?",
"Certainly! A comma (,) is used to separate items in a list, e.g., 'I bought apples, bananas, and oranges.' A semicolon (;) links related independent clauses, e.g., 'I have a meeting tomorrow; I need to prepare.' A colon (:) introduces a list or explanation, e.g., 'Here are the items you need: pen, paper, and ink.'",
"Let's do it:\n\n1. 3.14159265359 + 2.71828182846 = 5.85987448205\n2. 5.6 * 2.3 = 12.88\n3. The square root of 123.456 is approximately 11.1111047355\n\nWould you like to explore more complex calculations? I can also work with exponents (e.g., 2^10 or 5.1^3.2).",
"Let's break it down:\n\n1. **Balancing the chemical equation:** The unbalanced equation is: \n H₂ + O₂ → H₂O. To balance it, we need 2 molecules of H₂ and 1 molecule of O₂ to form 2 molecules of H₂O: \n **2H₂ + O₂ → 2H₂O.**\n\n2. **Area of a circle:** The formula for the area of a circle is \( A = \pi r^2 \). With a radius of 5.7 cm, the area is approximately: \n \( A = 3.14159 \times (5.7)^2 = 102.041 \, \text{cm}^2.\)\n\n3. **Molar mass of NaCl:** Sodium chloride (NaCl) consists of one sodium (Na) atom and one chlorine (Cl) atom. The atomic masses are approximately: \n Na = 22.99 g/mol, Cl = 35.45 g/mol. So, the molar mass of NaCl is: \n **22.99 g/mol + 35.45 g/mol = 58.44 g/mol.**",
# Simplified Chinese
"在一个清晨,阳光透过窗帘缝隙洒在床单上,空气里弥漫着刚煮好的咖啡香。街道还很安静,偶尔有几只鸟儿在枝头跳跃。",
"2025年,科技的发展速度令人惊叹!\n量子计算机的计算能力已达到10¹⁰次操作每秒,\n而ChatGPT模型的推理速度是传统计算机的100倍以上。\n公式E=mc²揭示了质量和能量的关系。\n今天的任务包括:\n1. 完成项目报告\n2. 参加9:00的会议\n3. 下午2:00开始的代码审查\n别忘了,创新与效率是成功的关键!",
# Traditional Chinese
"2025年,科技的發展速度讓人驚訝!\n量子電腦的計算能力已達到 10¹⁰ 次操作每秒,\n而ChatGPT模型的推理速度是傳統電腦的100倍以上。\n例如,愛因斯坦的著名公式 E = mc²,\n揭示了質量和能量之間的關係。\n化學中,水的化學式 H₂O 代表著每個分子包含兩個氫原子和一個氧原子。\n今日的工作清單如下:\n1. 完成數學模型的推導:x² + 3x - 4 = 0\n2. 實驗室研究化學反應:2H₂ + O₂ → 2H₂O\n3. 進行下午3:00的會議\n每一步,都是知識積累的過程。",
# French
"Le matin, lorsque le soleil se lève lentement à l'horizon, la ville semble encore endormie. Les rues sont calmes, seules quelques personnes marchent rapidement pour commencer leur journée. L'air est frais, et les arbres, bien que dépouillés de leurs feuilles en hiver, semblent toujours veiller sur la ville. J'aime prendre un moment pour observer ce silence paisible avant que le bruit de la journée ne commence à envahir l'espace. Parfois, il suffit de quelques instants pour se reconnecter à soi-même et à l'instant présent.",
# German
"In der modernen Softwareentwicklung ist es wichtig, effizienten Code zu schreiben. Zum Beispiel kann ein einfacher `for`-Loop in Python wie folgt aussehen: ```python\nfor i in range(10):\n print(i)\n``` Dieser Code gibt die Zahlen von 0 bis 9 aus. Es ist auch entscheidend, den Code so zu optimieren, dass er sowohl lesbar als auch schnell ist. Ein gut strukturierter Code trägt zu einer besseren Wartbarkeit bei und reduziert die Wahrscheinlichkeit von Fehlern.",
# Spanish
'# Este es un ejemplo de código en Python\ndef saludar(nombre):\n print(f"¡Hola, {nombre}!")\n\n# Llamada a la función\nsaludar("Juan")',
# Arabic
"الكيمياء هي دراسة المادة وتفاعلاتها. وتشمل العديد من الفروع مثل الكيمياء العضوية وغير العضوية، والكيمياء التحليلية والكيمياء الفيزيائية. تلعب الكيمياء دوراً مهماً في العديد من الصناعات مثل صناعة الأدوية، والبترول، والطاقة.",
# Russian
"Привет! Как дела? Я рад познакомиться с тобой. Надеюсь, у тебя хороший день!",
# Danish
"Danmark er et smukt land med en rig kultur og historie. Det er kendt for sine maleriske landskaber, hyggelige byer og venlige mennesker. København, hovedstaden, er en moderne metropol, der samtidig bevarer sin historiske charme. Danmark har også en stærk tradition for bæredygtighed og innovation.",
# Portuguese
"Hoje está um dia lindo, perfeito para um passeio no parque.",
# Indonesian
"Selamat pagi! Apa kabar? Saya harap hari Anda menyenankan. Jika ada sesuatu yang bisa saya bantu, silakan beri tahu saya.",
# Italian
"La cucina italiana è famosa in tutto il mondo per la sua varietà e i suoi sapori deliziosi. Ogni regione ha le sue specialità uniche, ma piatti come la pasta, la pizza e il gelato sono amati da tutti. Mangiare in Italia non è solo un pasto, ma un'esperienza sociale che coinvolge amici e familiari.",
# Vietnamese
"Chào bạn! Tôi là một trí tuệ nhân tạo, rất vui được gặp bạn. Bạn cần giúp đỡ gì hôm nay?",
# Polish
"Cześć! Jak się masz? Mam nadzieję, że wszystko u Ciebie w porządku. Jeśli chcesz porozmawiać lub masz jakieś pytania, śmiało pisz!",
# Japanese
"今日はとても良い天気ですね。朝から青空が広がっていて、散歩に出かけるのにぴったりな日です。最近、忙しくてなかなか外に出る時間がなかったので、今日はゆっくりと自然の中でリラックスしたいと思っています。",
# Korean
"오늘은 정말 좋은 날씨네요. 아침부터 맑은 하늘이 펼쳐져 있고, 산책을 하기에 딱 좋은 날이에요. 요즘 바빠서 밖에 나갈 시간이 없었는데, 오늘은 자연 속에서 여유롭게 시간을 보내고 싶어요.",
]