Feat/1672 feature request chunker module enabling custom chunking strategy #1727

Draft: wants to merge 7 commits into master
22 changes: 13 additions & 9 deletions camel/retrievers/vector_retriever.py
@@ -27,6 +27,7 @@
VectorRecord,
)
from camel.utils import Constants
from camel.utils.chunker import BaseChunker, UioChunker

if TYPE_CHECKING:
from unstructured.documents.elements import Element
@@ -78,6 +79,7 @@ def process(
should_chunk: bool = True,
extra_info: Optional[dict] = None,
metadata_filename: Optional[str] = None,
chunker: Optional[BaseChunker] = None,
**kwargs: Any,
) -> None:
r"""Processes content from local file path, remote URL, string
@@ -101,6 +103,12 @@
used for storing metadata. Defaults to None.
chunker (Optional[BaseChunker], optional): A custom chunker used to
    split the content. If None, a default UioChunker is created from
    chunk_type, max_characters, and metadata_filename.
    (default: :obj:`None`)
**kwargs (Any): Additional keyword arguments for content parsing.
"""
if chunker is None:
chunker = UioChunker(
chunk_type=chunk_type,
max_characters=max_characters,
metadata_filename=metadata_filename,
)
from unstructured.documents.elements import Element

if isinstance(content, Element):
@@ -139,15 +147,8 @@
)
else:
# Chunk the content if required
chunks = (
    chunker.chunk(content=elements) if should_chunk else elements
)

# Process chunks in batches and store embeddings
for i in range(0, len(chunks), embed_batch):
@@ -157,6 +158,7 @@
)

records = []
offset = 0  # position of the chunk within the current batch
# Prepare the payload for each vector record, includes the
# content path, chunk metadata, and chunk text
for vector, chunk in zip(batch_vectors, batch_chunks):
@@ -178,6 +180,7 @@
chunk_metadata["metadata"].pop("orig_elements", "")
chunk_metadata["extra_info"] = extra_info or {}
chunk_text = {"text": str(chunk)}
chunk_metadata["metadata"]["piece_num"] = i + offset + 1
combined_dict = {
**content_path_info,
**chunk_metadata,
@@ -187,6 +190,7 @@
records.append(
VectorRecord(vector=vector, payload=combined_dict)
)
offset += 1

self.storage.add(records=records)

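Taken together, this change lets callers inject any BaseChunker implementation into the retrieval pipeline. A minimal usage sketch, assuming VectorRetriever's defaults supply an embedding model and vector storage (the file path is illustrative):

from camel.retrievers import VectorRetriever
from camel.utils.chunker import UioChunker

retriever = VectorRetriever()

# Pass a chunker explicitly; omitting the argument falls back to a
# default UioChunker built from chunk_type and max_characters.
chunker = UioChunker(chunk_type="chunk_by_title", max_characters=500)
retriever.process(content="local_data/example.md", chunker=chunker)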
22 changes: 22 additions & 0 deletions camel/utils/chunker/__init__.py
@@ -0,0 +1,22 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from .base import BaseChunker
from .code_chunker import CodeChunker
from .uio_chunker import UioChunker

__all__ = [
"BaseChunker",
"CodeChunker",
"UioChunker",
]
24 changes: 24 additions & 0 deletions camel/utils/chunker/base.py
@@ -0,0 +1,24 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from abc import ABC, abstractmethod
from typing import Any


class BaseChunker(ABC):
r"""An abstract base class for all CAMEL chunkers."""

@abstractmethod
def chunk(self, content: Any) -> Any:
r"""Chunk the given content"""
pass
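Since the contract is a single chunk method, a custom strategy plugs in with little code. Below is a minimal sketch of a hypothetical FixedSizeChunker, not part of this PR; note that a chunker meant for VectorRetriever should return Element-like objects carrying metadata, as UioChunker and CodeChunker do:

from typing import List

from unstructured.documents.elements import Element, ElementMetadata

from camel.utils.chunker import BaseChunker


class FixedSizeChunker(BaseChunker):
    r"""Illustrative chunker that cuts text into fixed-size character
    windows."""

    def __init__(self, size: int = 500):
        self.size = size

    def chunk(self, content: List[Element]) -> List[Element]:
        # Join the incoming elements, then slice into equal windows.
        text = "\n".join(str(el) for el in content)
        elements = []
        for i in range(0, len(text), self.size):
            element = Element(metadata=ElementMetadata())
            element.text = text[i : i + self.size]
            elements.append(element)
        return elements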
186 changes: 186 additions & 0 deletions camel/utils/chunker/code_chunker.py
@@ -0,0 +1,186 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
import re
from typing import List

from unstructured.documents.elements import Element, ElementMetadata

from camel.utils import get_model_encoding

from .base import BaseChunker


class CodeChunker(BaseChunker):
r"""A class for chunking code or text while respecting structure
and token limits.

This class ensures that structured elements such as functions,
classes, and regions are not arbitrarily split across chunks.
It also handles oversized lines and Base64-encoded images.

Args:
    chunk_size (int, optional): The maximum token size per chunk.
        (default: :obj:`8192`)
    model_name (str, optional): The tokenizer model name used
        for token counting. (default: :obj:`"cl100k_base"`)
    remove_image (bool, optional): Whether the chunker should skip
        images. (default: :obj:`True`)
"""

def __init__(
self,
chunk_size: int = 8192,
model_name: str = "cl100k_base",
remove_image: bool = True,
):
self.chunk_size = chunk_size
self.tokenizer = get_model_encoding(model_name)
self.remove_image = remove_image
self.struct_pattern = re.compile(
r'^\s*(?:(def|class|function)\s+\w+|'
r'(public|private|protected)\s+[\w<>]+\s+\w+\s*\(|'
r'\b(interface|enum|namespace)\s+\w+|'
r'#\s*(region|endregion)\b)'
)
self.image_pattern = re.compile(
r'!\[.*?\]\((?:data:image/[^;]+;base64,[a-zA-Z0-9+/]+=*|[^)]+)\)'
)

def count_tokens(self, text: str) -> int:
r"""Counts the number of tokens in the given text.

Args:
text (str): The input text to be tokenized.

Returns:
int: The number of tokens in the input text.
"""
return len(self.tokenizer.encode(text, disallowed_special=()))

def _split_oversized(self, line: str) -> List[str]:
    r"""Splits an oversized line into multiple chunks based on token
    limits.

Args:
line (str): The oversized line to be split.

Returns:
List[str]: A list of smaller chunks after splitting the
oversized line.
"""
tokens = self.tokenizer.encode(line, disallowed_special=())
chunks = []
buffer = []
current_count = 0

for token in tokens:
buffer.append(token)
current_count += 1

if current_count >= self.chunk_size:
chunks.append(self.tokenizer.decode(buffer).strip())
buffer = []
current_count = 0

if buffer:
chunks.append(self.tokenizer.decode(buffer))
return chunks

def chunk(self, content: List[str]) -> List[Element]:
r"""Splits the content into smaller chunks while preserving
structure and adhering to token constraints.

Args:
content (List[str]): The content to be chunked.

Returns:
List[Element]: A list of chunked elements.
"""
content = "\n".join(map(str, content))
chunks = []
current_chunk: list[str] = []
current_tokens = 0
struct_buffer: list[str] = []
struct_tokens = 0

for line in content.splitlines(keepends=True):
if self.remove_image and self.image_pattern.match(line):
    continue

line_tokens = self.count_tokens(line)

if line_tokens > self.chunk_size:
if current_chunk:
chunks.append("".join(current_chunk))
current_chunk = []
current_tokens = 0
chunks.extend(self._split_oversized(line))
continue

if self.struct_pattern.match(line):
if struct_buffer:
if current_tokens + struct_tokens <= self.chunk_size:
current_chunk.extend(struct_buffer)
current_tokens += struct_tokens
else:
if current_chunk:
chunks.append("".join(current_chunk))
current_chunk = struct_buffer.copy()
current_tokens = struct_tokens
struct_buffer = []
struct_tokens = 0

struct_buffer.append(line)
struct_tokens += line_tokens
else:
if struct_buffer:
struct_buffer.append(line)
struct_tokens += line_tokens
else:
if current_tokens + line_tokens > self.chunk_size:
chunks.append("".join(current_chunk))
current_chunk = [line]
current_tokens = line_tokens
else:
current_chunk.append(line)
current_tokens += line_tokens

if struct_buffer:
if current_tokens + struct_tokens <= self.chunk_size:
current_chunk.extend(struct_buffer)
else:
if current_chunk:
chunks.append("".join(current_chunk))
current_chunk = struct_buffer

if current_chunk:
chunks.append("".join(current_chunk))

final_chunks = []
for chunk in chunks:
chunk_token = self.count_tokens(chunk)
if chunk_token > self.chunk_size:
final_chunks.extend(self._split_oversized(chunk))
else:
final_chunks.append(chunk)

# TODO: reconsider how to correctly form metadata (may need to
# decouple from UnstructuredIO)
chunked_elements = []
for chunk in final_chunks:
element = Element(
metadata=ElementMetadata()
)
element.text = chunk
chunked_elements.append(element)
return chunked_elements
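A short usage sketch for CodeChunker (the source snippet is illustrative; token counting relies on the cl100k_base encoding loaded via get_model_encoding):

from camel.utils.chunker import CodeChunker

chunker = CodeChunker(chunk_size=4096, remove_image=True)

# content is a list of strings; the chunker joins them before splitting.
source = [
    "def greet(name):",
    "    return f'Hello, {name}!'",
]
for element in chunker.chunk(source):
    print(element.text)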
66 changes: 66 additions & 0 deletions camel/utils/chunker/uio_chunker.py
@@ -0,0 +1,66 @@
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from typing import List, Optional

from unstructured.documents.elements import Element

from camel.loaders import UnstructuredIO
from .base import BaseChunker


class UioChunker(BaseChunker):
r"""A class for chunking text while respecting structure and
character limits.

This class ensures that structured elements, such as document sections
and titles, are not arbitrarily split across chunks. It utilizes the
`UnstructuredIO` class to process and segment elements while maintaining
readability and coherence. The chunking method can be adjusted based on
the provided `chunk_type` parameter.

Args:
chunk_type (str, optional): The method used for chunking text.
(default: :obj:`"chunk_by_title"`)
max_characters (int, optional): The maximum number of characters
allowed per chunk. (default: :obj:`500`)
metadata_filename (Optional[str], optional): An optional filename
for storing metadata related to chunking. (default: :obj:`None`)
"""

def __init__(
self,
chunk_type: str = "chunk_by_title",
max_characters: int = 500,
metadata_filename: Optional[str] = None,
):
self.uio = UnstructuredIO()
self.chunk_type = chunk_type
self.max_characters = max_characters
self.metadata_filename = metadata_filename

def chunk(self, content: List[Element]) -> List[Element]:
r"""Splits the content into smaller chunks while preserving
structure and adhering to token constraints.

Args:
content (List[Element]): The content to be chunked.

Returns:
List[Element]: A list of chunked text segments.
"""
return self.uio.chunk_elements(
chunk_type=self.chunk_type,
elements=content,
max_characters=self.max_characters,
)
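Finally, a sketch of running UioChunker on elements parsed by UnstructuredIO, assuming its parse_file_or_url helper (the path is a placeholder, and the parse result may be None, hence the guard):

from camel.loaders import UnstructuredIO
from camel.utils.chunker import UioChunker

uio = UnstructuredIO()
elements = uio.parse_file_or_url("local_data/report.pdf")  # placeholder path

if elements:
    chunker = UioChunker(chunk_type="chunk_by_title", max_characters=300)
    chunks = chunker.chunk(content=elements)
    print(f"{len(chunks)} chunks produced")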