Feat/1672 feature request chunker module enabling custom chunking strategy #1727

Draft: wants to merge 7 commits into master
22 changes: 13 additions & 9 deletions camel/retrievers/vector_retriever.py
@@ -27,6 +27,7 @@
VectorRecord,
)
from camel.utils import Constants
from camel.utils.chunker import BaseChunker, UioChunker

if TYPE_CHECKING:
from unstructured.documents.elements import Element
@@ -78,6 +79,7 @@ def process(
should_chunk: bool = True,
extra_info: Optional[dict] = None,
metadata_filename: Optional[str] = None,
chunker: Optional[BaseChunker] = None,
**kwargs: Any,
) -> None:
r"""Processes content from local file path, remote URL, string
@@ -101,6 +103,12 @@
used for storing metadata. Defaults to None.
chunker (Optional[BaseChunker], optional): A custom chunker used to
    split the content. If None, a default UioChunker is created from
    chunk_type, max_characters, and metadata_filename.
    (default: :obj:`None`)
**kwargs (Any): Additional keyword arguments for content parsing.
"""
if chunker is None:
chunker = UioChunker(
chunk_type=chunk_type,
max_characters=max_characters,
metadata_filename=metadata_filename,
)
from unstructured.documents.elements import Element

if isinstance(content, Element):
@@ -139,15 +147,8 @@
)
else:
# Chunk the content if required
chunks = (
    chunker.chunk(content=elements) if should_chunk else elements
)

# Process chunks in batches and store embeddings
for i in range(0, len(chunks), embed_batch):
@@ -157,6 +158,7 @@
)

records = []
offset = 0  # position of the chunk within the current batch
# Prepare the payload for each vector record, includes the
# content path, chunk metadata, and chunk text
for vector, chunk in zip(batch_vectors, batch_chunks):
@@ -178,6 +180,7 @@
chunk_metadata["metadata"].pop("orig_elements", "")
chunk_metadata["extra_info"] = extra_info or {}
chunk_text = {"text": str(chunk)}
chunk_metadata["metadata"]["piece_num"] = i + offset + 1
combined_dict = {
**content_path_info,
**chunk_metadata,
@@ -187,6 +190,7 @@
records.append(
VectorRecord(vector=vector, payload=combined_dict)
)
offset += 1

self.storage.add(records=records)

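Taken together, this change lets callers inject any BaseChunker implementation into the retrieval pipeline. A minimal usage sketch, assuming VectorRetriever's defaults supply an embedding model and vector storage (the file path is illustrative):

from camel.retrievers import VectorRetriever
from camel.utils.chunker import UioChunker

retriever = VectorRetriever()

# Pass a chunker explicitly; omitting the argument falls back to a
# default UioChunker built from chunk_type and max_characters.
chunker = UioChunker(chunk_type="chunk_by_title", max_characters=500)
retriever.process(content="local_data/example.md", chunker=chunker)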
22 changes: 22 additions & 0 deletions camel/utils/chunker/__init__.py
@@ -0,0 +1,22 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from .base import BaseChunker
from .code_chunker import CodeChunker
from .uio_chunker import UioChunker

__all__ = [
"BaseChunker",
"CodeChunker",
"UioChunker",
]
24 changes: 24 additions & 0 deletions camel/utils/chunker/base.py
@@ -0,0 +1,24 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from abc import ABC, abstractmethod
from typing import Any


class BaseChunker(ABC):
r"""An abstract base class for all CAMEL chunkers."""

@abstractmethod
def chunk(self, content: Any) -> Any:
r"""Chunk the given content"""
pass
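Since the contract is a single chunk method, a custom strategy plugs in with little code. Below is a minimal sketch of a hypothetical FixedSizeChunker, not part of this PR; note that a chunker meant for VectorRetriever should return Element-like objects carrying metadata, as UioChunker and CodeChunker do:

from typing import List

from unstructured.documents.elements import Element, ElementMetadata

from camel.utils.chunker import BaseChunker


class FixedSizeChunker(BaseChunker):
    r"""Illustrative chunker that cuts text into fixed-size character
    windows."""

    def __init__(self, size: int = 500):
        self.size = size

    def chunk(self, content: List[Element]) -> List[Element]:
        # Join the incoming elements, then slice into equal windows.
        text = "\n".join(str(el) for el in content)
        elements = []
        for i in range(0, len(text), self.size):
            element = Element(metadata=ElementMetadata())
            element.text = text[i : i + self.size]
            elements.append(element)
        return elements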
186 changes: 186 additions & 0 deletions camel/utils/chunker/code_chunker.py
@@ -0,0 +1,186 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import re
from typing import List

from unstructured.documents.elements import Element, ElementMetadata

from camel.utils import get_model_encoding

from .base import BaseChunker


class CodeChunker(BaseChunker):
r"""A class for chunking code or text while respecting structure
and token limits.

This class ensures that structured elements such as functions,
classes, and regions are not arbitrarily split across chunks.
It also handles oversized lines and Base64-encoded images.

Args:
    chunk_size (int, optional): The maximum token size per chunk.
        (default: :obj:`8192`)
    model_name (str, optional): The tokenizer model name used
        for token counting. (default: :obj:`"cl100k_base"`)
    remove_image (bool, optional): Whether the chunker should skip
        images. (default: :obj:`True`)
"""

def __init__(
self,
chunk_size: int = 8192,
model_name: str = "cl100k_base",
remove_image: bool = True,
):
self.chunk_size = chunk_size
self.tokenizer = get_model_encoding(model_name)
self.remove_image = remove_image
self.struct_pattern = re.compile(
r'^\s*(?:(def|class|function)\s+\w+|'
r'(public|private|protected)\s+[\w<>]+\s+\w+\s*\(|'
r'\b(interface|enum|namespace)\s+\w+|'
r'#\s*(region|endregion)\b)'
)
self.image_pattern = re.compile(
r'!\[.*?\]\((?:data:image/[^;]+;base64,[a-zA-Z0-9+/]+=*|[^)]+)\)'
)

def count_tokens(self, text: str) -> int:
r"""Counts the number of tokens in the given text.

Args:
text (str): The input text to be tokenized.

Returns:
int: The number of tokens in the input text.
"""
return len(self.tokenizer.encode(text, disallowed_special=()))

def _split_oversized(self, line: str) -> List[str]:
    r"""Splits an oversized line into multiple chunks based on token
    limits.

Args:
line (str): The oversized line to be split.

Returns:
List[str]: A list of smaller chunks after splitting the
oversized line.
"""
tokens = self.tokenizer.encode(line, disallowed_special=())
chunks = []
buffer = []
current_count = 0

for token in tokens:
buffer.append(token)
current_count += 1

if current_count >= self.chunk_size:
chunks.append(self.tokenizer.decode(buffer).strip())
buffer = []
current_count = 0

if buffer:
chunks.append(self.tokenizer.decode(buffer))
return chunks

def chunk(self, content: List[str]) -> List[Element]:
r"""Splits the content into smaller chunks while preserving
structure and adhering to token constraints.

Args:
content (List[str]): The content to be chunked.

Returns:
List[Element]: A list of chunked elements.
"""
content = "\n".join(map(str, content))
chunks = []
current_chunk: list[str] = []
current_tokens = 0
struct_buffer: list[str] = []
struct_tokens = 0

for line in content.splitlines(keepends=True):
if self.remove_image and self.image_pattern.match(line):
    continue

line_tokens = self.count_tokens(line)

if line_tokens > self.chunk_size:
if current_chunk:
chunks.append("".join(current_chunk))
current_chunk = []
current_tokens = 0
chunks.extend(self._split_oversized(line))
continue

if self.struct_pattern.match(line):
if struct_buffer:
if current_tokens + struct_tokens <= self.chunk_size:
current_chunk.extend(struct_buffer)
current_tokens += struct_tokens
else:
if current_chunk:
chunks.append("".join(current_chunk))
current_chunk = struct_buffer.copy()
current_tokens = struct_tokens
struct_buffer = []
struct_tokens = 0

struct_buffer.append(line)
struct_tokens += line_tokens
else:
if struct_buffer:
struct_buffer.append(line)
struct_tokens += line_tokens
else:
if current_tokens + line_tokens > self.chunk_size:
chunks.append("".join(current_chunk))
current_chunk = [line]
current_tokens = line_tokens
else:
current_chunk.append(line)
current_tokens += line_tokens

if struct_buffer:
if current_tokens + struct_tokens <= self.chunk_size:
current_chunk.extend(struct_buffer)
else:
if current_chunk:
chunks.append("".join(current_chunk))
current_chunk = struct_buffer

if current_chunk:
chunks.append("".join(current_chunk))

final_chunks = []
for chunk in chunks:
chunk_token = self.count_tokens(chunk)
if chunk_token > self.chunk_size:
final_chunks.extend(self._split_oversized(chunk))
else:
final_chunks.append(chunk)

# TODO: reconsider how to correctly form metadata (may need to
# decouple from UnstructuredIO)
chunked_elements = []
for chunk in final_chunks:
element = Element(
metadata=ElementMetadata()
)
element.text = chunk
chunked_elements.append(element)
return chunked_elements
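A short usage sketch for CodeChunker (the source snippet is illustrative; token counting relies on the cl100k_base encoding loaded via get_model_encoding):

from camel.utils.chunker import CodeChunker

chunker = CodeChunker(chunk_size=4096, remove_image=True)

# content is a list of strings; the chunker joins them before splitting.
source = [
    "def greet(name):",
    "    return f'Hello, {name}!'",
]
for element in chunker.chunk(source):
    print(element.text)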
66 changes: 66 additions & 0 deletions camel/utils/chunker/uio_chunker.py
@@ -0,0 +1,66 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from typing import List, Optional

from unstructured.documents.elements import Element

from camel.loaders import UnstructuredIO
from .base import BaseChunker


class UioChunker(BaseChunker):
r"""A class for chunking text while respecting structure and
character limits.

This class ensures that structured elements, such as document sections
and titles, are not arbitrarily split across chunks. It utilizes the
`UnstructuredIO` class to process and segment elements while maintaining
readability and coherence. The chunking method can be adjusted based on
the provided `chunk_type` parameter.

Args:
chunk_type (str, optional): The method used for chunking text.
(default: :obj:`"chunk_by_title"`)
max_characters (int, optional): The maximum number of characters
allowed per chunk. (default: :obj:`500`)
metadata_filename (Optional[str], optional): An optional filename
for storing metadata related to chunking. (default: :obj:`None`)
"""

def __init__(
self,
chunk_type: str = "chunk_by_title",
max_characters: int = 500,
metadata_filename: Optional[str] = None,
):
self.uio = UnstructuredIO()
self.chunk_type = chunk_type
self.max_characters = max_characters
self.metadata_filename = metadata_filename

def chunk(self, content: List[Element]) -> List[Element]:
r"""Splits the content into smaller chunks while preserving
structure and adhering to token constraints.

Args:
content (List[Element]): The content to be chunked.

Returns:
List[Element]: A list of chunked text segments.
"""
return self.uio.chunk_elements(
chunk_type=self.chunk_type,
elements=content,
max_characters=self.max_characters,
)
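Finally, a sketch of running UioChunker on elements parsed by UnstructuredIO, assuming its parse_file_or_url helper (the path is a placeholder, and the parse result may be None, hence the guard):

from camel.loaders import UnstructuredIO
from camel.utils.chunker import UioChunker

uio = UnstructuredIO()
elements = uio.parse_file_or_url("local_data/report.pdf")  # placeholder path

if elements:
    chunker = UioChunker(chunk_type="chunk_by_title", max_characters=300)
    chunks = chunker.chunk(content=elements)
    print(f"{len(chunks)} chunks produced")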