diff --git a/camel/retrievers/vector_retriever.py b/camel/retrievers/vector_retriever.py index d51aef3fd4..6b330c6f53 100644 --- a/camel/retrievers/vector_retriever.py +++ b/camel/retrievers/vector_retriever.py @@ -27,6 +27,7 @@ VectorRecord, ) from camel.utils import Constants +from camel.utils.chunker import BaseChunker, UioChunker if TYPE_CHECKING: from unstructured.documents.elements import Element @@ -78,6 +79,7 @@ def process( should_chunk: bool = True, extra_info: Optional[dict] = None, metadata_filename: Optional[str] = None, + chunker: Optional[BaseChunker] = None, **kwargs: Any, ) -> None: r"""Processes content from local file path, remote URL, string @@ -101,6 +103,12 @@ def process( used for storing metadata. Defaults to None. **kwargs (Any): Additional keyword arguments for content parsing. """ + if chunker is None: + chunker = UioChunker( + chunk_type=chunk_type, + max_characters=max_characters, + metadata_filename=metadata_filename, + ) from unstructured.documents.elements import Element if isinstance(content, Element): @@ -139,15 +147,8 @@ def process( ) else: # Chunk the content if required - chunks = ( - self.uio.chunk_elements( - chunk_type=chunk_type, - elements=elements, - max_characters=max_characters, - ) - if should_chunk - else elements - ) + chunks = chunker.chunk(content=elements) if should_chunk else ( + elements) # Process chunks in batches and store embeddings for i in range(0, len(chunks), embed_batch): @@ -157,6 +158,7 @@ def process( ) records = [] + offset = 0 # Prepare the payload for each vector record, includes the # content path, chunk metadata, and chunk text for vector, chunk in zip(batch_vectors, batch_chunks): @@ -178,6 +180,7 @@ def process( chunk_metadata["metadata"].pop("orig_elements", "") chunk_metadata["extra_info"] = extra_info or {} chunk_text = {"text": str(chunk)} + chunk_metadata["metadata"]["piece_num"] = i + offset + 1 combined_dict = { **content_path_info, **chunk_metadata, @@ -187,6 +190,7 @@ def process( 
class BaseChunker(ABC):
    r"""Abstract interface shared by every CAMEL chunker.

    Concrete subclasses implement :meth:`chunk`, which splits the given
    content into smaller pieces.
    """

    @abstractmethod
    def chunk(self, content: Any) -> Any:
        r"""Split ``content`` into chunks.

        Args:
            content (Any): The content to be chunked.

        Returns:
            Any: The chunked result; its concrete type is defined by the
                subclass.
        """
        ...
class CodeChunker(BaseChunker):
    r"""A class for chunking code or text while respecting structure
    and token limits.

    This class ensures that structured elements such as functions,
    classes, and regions are not arbitrarily split across chunks.
    It also handles oversized lines and Base64-encoded images.

    Args:
        chunk_size (int, optional): The maximum token size per chunk.
            (default: :obj:`8192`)
        model_name (str, optional): The tokenizer model name used
            for token counting. (default: :obj:`"cl100k_base"`)
        remove_image (bool, optional): Whether the chunker should skip
            image links while chunking. (default: :obj:`True`)
    """

    def __init__(
        self,
        chunk_size: int = 8192,
        model_name: str = "cl100k_base",
        remove_image: Optional[bool] = True,
    ):
        self.chunk_size = chunk_size
        self.tokenizer = get_model_encoding(model_name)
        self.remove_image = remove_image
        # Matches the opening line of structural constructs: functions and
        # classes, access-modified members, interfaces/enums/namespaces,
        # and #region / #endregion markers.
        self.struct_pattern = re.compile(
            r'^\s*(?:(def|class|function)\s+\w+|'
            r'(public|private|protected)\s+[\w<>]+\s+\w+\s*\(|'
            r'\b(interface|enum|namespace)\s+\w+|'
            r'#\s*(region|endregion)\b)'
        )
        # Matches Markdown image links, including inline Base64 payloads.
        self.image_pattern = re.compile(
            r'!\[.*?\]\((?:data:image/[^;]+;base64,[a-zA-Z0-9+/]+=*|[^)]+)\)'
        )

    def count_tokens(self, text: str) -> int:
        r"""Counts the number of tokens in the given text.

        Args:
            text (str): The input text to be tokenized.

        Returns:
            int: The number of tokens in the input text.
        """
        return len(self.tokenizer.encode(text, disallowed_special=()))

    def _split_oversized(self, line: str) -> List[str]:
        r"""Splits an oversized line into multiple chunks based on token
        limits.

        Args:
            line (str): The oversized line to be split.

        Returns:
            List[str]: A list of smaller chunks after splitting the
                oversized line.
        """
        tokens = self.tokenizer.encode(line, disallowed_special=())
        chunks = []
        buffer = []
        current_count = 0

        for token in tokens:
            buffer.append(token)
            current_count += 1

            if current_count >= self.chunk_size:
                chunks.append(self.tokenizer.decode(buffer).strip())
                buffer = []
                current_count = 0

        # NOTE(review): the trailing buffer is intentionally left
        # unstripped, unlike the full chunks above — confirm whether the
        # asymmetry is deliberate.
        if buffer:
            chunks.append(self.tokenizer.decode(buffer))
        return chunks

    def chunk(self, content: List[str]) -> List[Element]:
        r"""Splits the content into smaller chunks while preserving
        structure and adhering to token constraints.

        Args:
            content (List[str]): The content to be chunked. Items are
                ``str()``-converted and joined with newlines, so a list of
                ``Element`` objects is also accepted.

        Returns:
            List[Element]: A list of chunked text segments wrapped in
                ``Element`` objects.
        """
        text = "\n".join(map(str, content))
        chunks = []
        current_chunk: list[str] = []
        current_tokens = 0
        struct_buffer: list[str] = []
        struct_tokens = 0

        for line in text.splitlines(keepends=True):
            if self.remove_image:
                if self.image_pattern.match(line):
                    continue

            line_tokens = self.count_tokens(line)

            # A single line over the budget is flushed and split on its own.
            if line_tokens > self.chunk_size:
                if current_chunk:
                    chunks.append("".join(current_chunk))
                    current_chunk = []
                    current_tokens = 0
                chunks.extend(self._split_oversized(line))
                continue

            if self.struct_pattern.match(line):
                # A new structural unit starts: place the buffered unit
                # either into the current chunk or into a fresh one.
                if struct_buffer:
                    if current_tokens + struct_tokens <= self.chunk_size:
                        current_chunk.extend(struct_buffer)
                        current_tokens += struct_tokens
                    else:
                        if current_chunk:
                            chunks.append("".join(current_chunk))
                        current_chunk = struct_buffer.copy()
                        current_tokens = struct_tokens
                    struct_buffer = []
                    struct_tokens = 0

                struct_buffer.append(line)
                struct_tokens += line_tokens
            else:
                if struct_buffer:
                    # Inside a structural unit: keep accumulating so the
                    # unit is not split across chunks.
                    struct_buffer.append(line)
                    struct_tokens += line_tokens
                else:
                    if current_tokens + line_tokens > self.chunk_size:
                        chunks.append("".join(current_chunk))
                        current_chunk = [line]
                        current_tokens = line_tokens
                    else:
                        current_chunk.append(line)
                        current_tokens += line_tokens

        # Flush any structural unit still buffered at EOF.
        if struct_buffer:
            if current_tokens + struct_tokens <= self.chunk_size:
                current_chunk.extend(struct_buffer)
            else:
                if current_chunk:
                    chunks.append("".join(current_chunk))
                current_chunk = struct_buffer

        if current_chunk:
            chunks.append("".join(current_chunk))

        # Re-split any chunk that still exceeds the token budget (can
        # happen when a structural unit alone is oversized).
        final_chunks = []
        for piece in chunks:
            if self.count_tokens(piece) > self.chunk_size:
                final_chunks.extend(self._split_oversized(piece))
            else:
                final_chunks.append(piece)

        # TODO: need to reconsider how to correctly form metadata (maybe
        # need to decouple the connection with unstructuredIO)
        chunked_elements = []
        for piece in final_chunks:
            element = Element(
                metadata=ElementMetadata()
            )
            element.text = piece
            chunked_elements.append(element)
        return chunked_elements
class UioChunker(BaseChunker):
    r"""A class for chunking text while respecting structure and
    character limits.

    This class ensures that structured elements, such as document sections
    and titles, are not arbitrarily split across chunks. It utilizes the
    `UnstructuredIO` class to process and segment elements while maintaining
    readability and coherence. The chunking method can be adjusted based on
    the provided `chunk_type` parameter.

    Args:
        chunk_type (str, optional): The method used for chunking text.
            (default: :obj:`"chunk_by_title"`)
        max_characters (int, optional): The maximum number of characters
            allowed per chunk. (default: :obj:`500`)
        metadata_filename (Optional[str], optional): An optional filename
            for storing metadata related to chunking. (default: :obj:`None`)
    """

    def __init__(
        self,
        chunk_type: str = "chunk_by_title",
        max_characters: int = 500,
        metadata_filename: Optional[str] = None,
    ):
        self.uio = UnstructuredIO()
        self.chunk_type = chunk_type
        self.max_characters = max_characters
        # NOTE(review): stored for interface parity, but chunk() does not
        # read it — confirm whether UnstructuredIO should receive it.
        self.metadata_filename = metadata_filename

    def chunk(self, content: List[Element]) -> List[Element]:
        r"""Splits the content into smaller chunks while preserving
        structure and adhering to the configured character limit.

        Args:
            content (List[Element]): The content to be chunked.

        Returns:
            List[Element]: A list of chunked text segments.
        """
        return self.uio.chunk_elements(
            chunk_type=self.chunk_type,
            elements=content,
            max_characters=self.max_characters,
        )