From 82eb5931842077b89cfdfc00726c49cf4402f1c4 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Fri, 7 Mar 2025 13:48:55 +0100 Subject: [PATCH 01/16] feat: Refactor Seed Dataset to be possible to be initialized from HF/Pytorch/JSON/list of Dicts, remove the need for setup call and subsequently cleanup --- camel/datasets/base.py | 183 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 167 insertions(+), 16 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index e085eeb462..1cb71d50a0 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -12,6 +12,7 @@ # limitations under the License. # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= +import json import os import random from typing import ( @@ -322,26 +323,31 @@ def to_pytorch_dataset( return dataset -class SeedDataset(BaseDataset): +class SeedDataset(Dataset): r"""A dataset containing validated seed examples for data generation. Ensures that all items adhere to the DataPoint schema. - This class is used to initialize a dataset from a list of dictionary items, - validating each against the DataPoint schema. + This class can initialize from Hugging Face Datasets, + PyTorch Datasets, JSON file paths, or lists of dictionaries, + converting them into a consistent internal format. """ def __init__( self, - data: List[Dict[str, str]], + data: Union[HFDataset, Dataset, str, List[Dict[str, Any]]], cache_dir: Optional[str] = None, min_samples: int = 1, **kwargs, ): - r"""Initialize the seed dataset. + r"""Initialize the seed dataset and validate integrity. Args: - data (List[Dict[str, str]]): List of dictionary items to create the - dataset from. + data (Union[HFDataset, Dataset, str, List[Dict[str, Any]]]): + Input data, which can be: + - A Hugging Face Dataset (HFDataset) + - A PyTorch Dataset (torch.utils.data.Dataset) + - A string path to a JSON file + - A list of dictionaries with DataPoint-compatible fields cache_dir (Optional[str]): Directory to cache dataset files. (default: :obj:`None`) min_samples (int): Minimum number of samples required. @@ -349,19 +355,164 @@ def __init__( **kwargs: Additional dataset parameters. Raises: - ValueError: If dataset size is less than min_samples or if sample - validation fails. + TypeError: If the data type is not supported. + ValueError: If dataset size is less than min_samples or + if sample validation fails. + FileNotFoundError: If the JSON file path doesn't exist. + json.JSONDecodeError: If the JSON file is invalid. 
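+
+        Example:
+            A minimal sketch of the list-of-dicts path (illustrative,
+            not part of the original patch):
+
+                dataset = SeedDataset(
+                    data=[{
+                        "question": "What is 2+2?",
+                        "rationale": "Addition",
+                        "final_answer": "4",
+                    }],
+                    min_samples=1,
+                )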
""" - if len(data) < min_samples: + + # Store all parameters in metadata dict for compatibility + self._cache_dir = str(cache_dir) if cache_dir is not None else None + self._metadata = { + 'cache_dir': self._cache_dir, + **kwargs, + } + + # Type checking and conversion into list of dicts + + if isinstance(data, HFDataset): + self._raw_data = [dict(item) for item in data] + elif isinstance(data, Dataset): + try: + self._raw_data = [dict(data[i]) for i in range(len(data))] + except (TypeError, KeyError, AttributeError) as e: + raise TypeError(f"Unsupported PyTorch Dataset: {e}") + elif isinstance(data, str): + if not os.path.exists(data): + raise FileNotFoundError(f"JSON file not found: {data}") + with open(data, 'r') as f: + self._raw_data = json.load(f) + if not isinstance(self._raw_data, list): + raise ValueError( + "JSON file must contain a list of dictionaries" + ) + elif isinstance(data, list): + self._raw_data = data if data is not None else [] + else: + raise TypeError("Unsupported data type") + + self.data: List[DataPoint] = [] + self._setup(min_samples) + + def sample(self) -> DataPoint: + r"""Sample a random datapoint from the dataset. + + Returns: + DataPoint: A randomly sampled DataPoint. + + Raises: + RuntimeError: If the dataset is empty. + """ + if not self.data: + raise RuntimeError("Dataset is empty, cannot sample.") + idx = random.randint(0, len(self) - 1) + return self[idx] + + def _setup(self, min_samples: int) -> None: + r"""Set up the dataset by validating and processing raw data. + + This method: + 1. Checks if the dataset meets the minimum sample requirement. + 2. Creates the cache directory if specified. + 3. Processes raw data into DataPoint objects + for validation and consistency. + + Args: + min_samples (int): Minimum number of samples required. + + Raises: + ValueError: If the dataset size is less than + min_samples or if validation fails. + OSError: If cache directory creation fails. + """ + if len(self._raw_data) < min_samples: raise ValueError( - f"Seed dataset must contain at least {min_samples} samples." 
+ f"Dataset must have at least {min_samples} samples," + f"got {len(self._raw_data)}" ) - super().__init__( - data=data, - cache_dir=cache_dir, - **kwargs, - ) + if self._cache_dir: + try: + os.makedirs(self._cache_dir, exist_ok=True) + logger.debug(f"Created cache directory: {self._cache_dir}") + except OSError as e: + logger.error( + f"Failed to create cache directory {self._cache_dir}: {e}" + ) + raise + + # Process raw data into DataPoint objects for validation purposes + if not self._raw_data: + if min_samples > 0: + raise ValueError("No data provided, but min_samples > 0") + logger.debug("No raw data to process") + return + + if self._cache_dir: + try: + os.makedirs(self._cache_dir, exist_ok=True) + logger.debug(f"Created cache directory: {self._cache_dir}") + except OSError as e: + logger.error( + f"Failed to create cache directory {self._cache_dir}: {e}" + ) + raise + + if not self._raw_data: + if min_samples > 0: + raise ValueError("No data provided, but min_samples > 0") + logger.debug("No raw data to process") + return + + def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: + try: + return DataPoint( + question=item.get('question', ''), + rationale=item.get('rationale', ''), + final_answer=item.get('final_answer', ''), + metadata=item.get('metadata', {}) + if isinstance(item.get('metadata'), dict) + else {}, + difficulty=item.get('difficulty', ''), # Match BaseDataset + # raw_markdown='' if DataPoint supports it + ) + except ValidationError as e: + raise ValueError( + f"Sample at index {idx} validation error: {e}" + ) + + self.data = [ + create_datapoint(item, i) for i, item in enumerate(self._raw_data) + ] + logger.debug(f"Processed {len(self.data)} data points") + + def __len__(self) -> int: + r"""Return the size of the dataset.""" + return len(self.data) + + def __getitem__(self, idx: int) -> DataPoint: + r"""Get an item from the dataset. + + Args: + idx (int): Index of the item to get. + + Returns: + DataPoint: DataPoint from the dataset with the given index. + + Raises: + IndexError: If idx is out of bounds. 
+ """ + if idx < 0 or idx >= len(self): + raise IndexError( + f"Index {idx} out of bounds for dataset of size {len(self)}" + ) + return self.data[idx] + + @property + def metadata(self) -> Dict[str, Any]: + r"""Get dataset metadata.""" + return self._metadata.copy() class SyntheticDataset(BaseDataset): From 52f012a4d26b558b7fa45cae68f3e2f6913c2440 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Fri, 7 Mar 2025 14:09:00 +0100 Subject: [PATCH 02/16] fix: Update Seed Dataset tests according to the changes --- test/datasets/test_base_dataset.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/test/datasets/test_base_dataset.py b/test/datasets/test_base_dataset.py index 83db9ad60a..4b6aead2e4 100644 --- a/test/datasets/test_base_dataset.py +++ b/test/datasets/test_base_dataset.py @@ -192,12 +192,22 @@ def test_base_dataset_metadata(): def test_seed_dataset_init(sample_data): - r"""Test SeedDataset initialization.""" + r"""Test SeedDataset initialization with various input types.""" + # Test with list of dictionaries dataset = SeedDataset(data=sample_data, min_samples=1) - assert dataset._raw_data == sample_data + assert dataset._raw_data == sample_data, "Raw data should match input list" + assert len(dataset.data) == 2, "Processed data should have 2 items" + assert isinstance(dataset.data[0], DataPoint), "Items should be DataPoint instances" + assert dataset.data[0].question == 'What is 2+2?', "DataPoint content should match input" - with pytest.raises(ValueError): + # Test min_samples validation + with pytest.raises(ValueError) as exc_info: SeedDataset(data=sample_data, min_samples=3) + assert "must have at least 3 samples, got 2" in str(exc_info.value), "Should raise ValueError for insufficient samples" + + # Test with empty data and min_samples=0 + dataset_empty = SeedDataset(data=[], min_samples=0) + assert len(dataset_empty.data) == 0, "Empty dataset should have no items" def test_synthetic_dataset_init(): From 93a114cb2476ceb754c35eb519c05cde01cb2a85 Mon Sep 17 00:00:00 2001 From: hallerite Date: Fri, 7 Mar 2025 15:27:35 +0100 Subject: [PATCH 03/16] fix: fix precommit and missing space for assertion --- camel/datasets/base.py | 2 +- test/datasets/test_base_dataset.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index 1cb71d50a0..480ca94b03 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -428,7 +428,7 @@ def _setup(self, min_samples: int) -> None: """ if len(self._raw_data) < min_samples: raise ValueError( - f"Dataset must have at least {min_samples} samples," + f"Dataset must have at least {min_samples} samples, " f"got {len(self._raw_data)}" ) diff --git a/test/datasets/test_base_dataset.py b/test/datasets/test_base_dataset.py index 4b6aead2e4..318618c894 100644 --- a/test/datasets/test_base_dataset.py +++ b/test/datasets/test_base_dataset.py @@ -197,13 +197,19 @@ def test_seed_dataset_init(sample_data): dataset = SeedDataset(data=sample_data, min_samples=1) assert dataset._raw_data == sample_data, "Raw data should match input list" assert len(dataset.data) == 2, "Processed data should have 2 items" - assert isinstance(dataset.data[0], DataPoint), "Items should be DataPoint instances" - assert dataset.data[0].question == 'What is 2+2?', "DataPoint content should match input" + assert isinstance( + dataset.data[0], DataPoint + ), "Items should be DataPoint instances" + assert ( + dataset.data[0].question == 'What is 2+2?' 
+ ), "DataPoint content should match input" # Test min_samples validation with pytest.raises(ValueError) as exc_info: SeedDataset(data=sample_data, min_samples=3) - assert "must have at least 3 samples, got 2" in str(exc_info.value), "Should raise ValueError for insufficient samples" + assert "must have at least 3 samples, got 2" in str( + exc_info.value + ), "Should raise ValueError for insufficient samples" # Test with empty data and min_samples=0 dataset_empty = SeedDataset(data=[], min_samples=0) From 7b08d62679b429d461d763c5fe231ccc997c22c7 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Fri, 7 Mar 2025 19:10:25 +0100 Subject: [PATCH 04/16] feat: Extend test coverage to include all possible conversions --- test/datasets/test_base_dataset.py | 158 ++++++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 1 deletion(-) diff --git a/test/datasets/test_base_dataset.py b/test/datasets/test_base_dataset.py index 318618c894..021b472e67 100644 --- a/test/datasets/test_base_dataset.py +++ b/test/datasets/test_base_dataset.py @@ -12,12 +12,14 @@ # limitations under the License. # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= +import json import tempfile from unittest.mock import AsyncMock, MagicMock import pytest import torch from datasets import Dataset as HFDataset +from datasets import load_dataset from pydantic import ValidationError from camel.datasets.base import ( @@ -211,11 +213,165 @@ def test_seed_dataset_init(sample_data): exc_info.value ), "Should raise ValueError for insufficient samples" - # Test with empty data and min_samples=0 dataset_empty = SeedDataset(data=[], min_samples=0) assert len(dataset_empty.data) == 0, "Empty dataset should have no items" +# Test the conversion of different dataset formats + + +def test_seed_dataset_init_hf_dataset(): + r"""Test SeedDataset initialization with a Hugging Face Dataset.""" + + hf_dataset = load_dataset("emotion", split="train[:2]") + + mapped_dataset = hf_dataset.map( + lambda example: { + "question": example["text"], + # Dummy value since "emotion" lacks this + "rationale": "Sample rationale", + "final_answer": str(example["label"]), + } + ) + + dataset = SeedDataset(data=mapped_dataset, min_samples=1) + + assert len(dataset.data) == 2, "Should have 2 items from the HF dataset" + assert isinstance( + dataset.data[0], DataPoint + ), "Items should be DataPoint instances" + assert ( + dataset.data[0].question == mapped_dataset[0]["question"] + ), "Question should match the mapped dataset" + assert ( + dataset.data[0].rationale == "Sample rationale" + ), "Rationale should be set correctly" + assert dataset.data[0].final_answer == str( + mapped_dataset[0]["label"] + ), "Final answer should match the label" + + +def test_seed_dataset_init_pytorch_dataset(): + r"""Test SeedDataset initialization with a real PyTorch Dataset (MNIST).""" + import tempfile + + import torchvision + import torchvision.transforms as transforms + + with tempfile.TemporaryDirectory() as temp_dir: + transform = transforms.Compose([transforms.ToTensor()]) + + # Load the MNIST dataset + mnist_dataset = torchvision.datasets.MNIST( + root=temp_dir, + train=True, + download=True, + transform=transform, + ) + + def map_mnist_to_datapoint(dataset, idx): + image, label = dataset[idx] + return { + "question": f"What is the digit in this image?" + f"(MNIST sample {idx})", + "rationale": "Analyzing the image to identify the digit." 
+ f"(MNIST sample {idx})", + "final_answer": str(label), + } + + mapped_data = [ + map_mnist_to_datapoint(mnist_dataset, i) for i in range(2) + ] + + dataset = SeedDataset(data=mapped_data, min_samples=1) + + assert ( + len(dataset.data) == 2 + ), "Should have 2 items from the mapped MNIST dataset" + assert isinstance( + dataset.data[0], DataPoint + ), "Items should be DataPoint instances" + assert dataset.data[0].question.startswith( + "What is the digit in this image?" + ), "Question should be set correctly" + assert dataset.data[0].rationale.startswith( + "Analyzing the image to identify the digit." + ), "Rationale should be set correctly" + assert dataset.data[ + 0 + ].final_answer.isdigit(), "Final answer should be a digit string" + + +def test_seed_dataset_init_list_extended(sample_data): + r"""Test SeedDataset initialization with a list of dictionaries, + including optional fields and invalid data.""" + data_with_optional = [ + *sample_data, + { + "question": "What is 5-3?", + "rationale": "Subtraction", + "final_answer": "2", + "difficulty": "easy", + "metadata": {"topic": "math"}, + }, + ] + dataset = SeedDataset(data=data_with_optional, min_samples=1) + assert len(dataset.data) == 3, "Should have 3 items from the list" + assert ( + dataset.data[2].difficulty == "easy" + ), "Optional difficulty field should be preserved" + assert dataset.data[2].metadata == { + "topic": "math" + }, "Optional metadata field should be preserved" + + # Test with invalid data (missing required field) + invalid_data = [{"question": "What is 2+2?", "rationale": "Addition"}] + with pytest.raises(ValueError) as exc_info: + SeedDataset(data=invalid_data, min_samples=1) + assert "Sample at index 0 validation error" in str( + exc_info.value + ), "Should raise ValueError for invalid data" + + +def test_seed_dataset_init_json_file(): + r"""Test SeedDataset initialization with a JSON file path.""" + # Create temp JSON file + sample_data = [ + { + "question": "What is 2+2?", + "rationale": "Addition", + "final_answer": "4", + }, + { + "question": "What is 3×3?", + "rationale": "Multiplication", + "final_answer": "9", + }, + ] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump(sample_data, temp_file) + temp_file.flush() + + dataset = SeedDataset(data=temp_file.name, min_samples=1) + + assert len(dataset.data) == 2, "Should have 2 items from the JSON file" + assert isinstance( + dataset.data[0], DataPoint + ), "Items should be DataPoint instances" + assert ( + dataset.data[0].question == "What is 2+2?" 
+ ), "Question should match the JSON data" + assert ( + dataset.data[1].final_answer == "9" + ), "Final answer should match the JSON data" + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as invalid_file: + invalid_file.write("Invalid JSON") + invalid_file.flush() + with pytest.raises(json.JSONDecodeError): + SeedDataset(data=invalid_file.name, min_samples=1) + + def test_synthetic_dataset_init(): r"""Test SyntheticDataset initialization.""" dataset = SyntheticDataset() From 3f1861bb0daaf79d1fdf1915fa79e14b587d8c2f Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 10:08:55 +0100 Subject: [PATCH 05/16] fix: Choose more suitable mock data and enhance test coverage --- test/datasets/test_base_dataset.py | 396 +++++++++++++++++++++++------ 1 file changed, 316 insertions(+), 80 deletions(-) diff --git a/test/datasets/test_base_dataset.py b/test/datasets/test_base_dataset.py index 021b472e67..ac36d7dfdd 100644 --- a/test/datasets/test_base_dataset.py +++ b/test/datasets/test_base_dataset.py @@ -19,8 +19,8 @@ import pytest import torch from datasets import Dataset as HFDataset -from datasets import load_dataset from pydantic import ValidationError +from torch.utils.data import Dataset from camel.datasets.base import ( BaseDataset, @@ -217,125 +217,289 @@ def test_seed_dataset_init(sample_data): assert len(dataset_empty.data) == 0, "Empty dataset should have no items" -# Test the conversion of different dataset formats - - def test_seed_dataset_init_hf_dataset(): - r"""Test SeedDataset initialization with a Hugging Face Dataset.""" + r"""Test SeedDataset initialization with a fake IMDB-style + Hugging Face Dataset.""" + # Mock IMDB-style data + mock_imdb_data = [ + { + "text": "This movie was absolutely fantastic, " + "a real joy to watch!", + "label": 1, + "rationale": "The reviewer uses positive adjectives " + "like 'fantastic' and 'joy'.", + }, + { + "text": "Terrible acting and a boring plot ruined this film.", + "label": 0, + "rationale": "Negative terms like 'terrible' and " + "'boring' suggest dissatisfaction.", + }, + { + "text": "An incredible cast made this a thrilling experience.", + "label": 1, + "rationale": "Words like 'incredible' and 'thrilling' " + "reflect a positive reaction.", + }, + ] - hf_dataset = load_dataset("emotion", split="train[:2]") + hf_dataset = HFDataset.from_list(mock_imdb_data) mapped_dataset = hf_dataset.map( lambda example: { - "question": example["text"], - # Dummy value since "emotion" lacks this - "rationale": "Sample rationale", - "final_answer": str(example["label"]), + "question": "What is the sentiment of this review? 
" + f"{example['text'][:30]}...", + "rationale": example["rationale"], + "final_answer": "positive" + if example["label"] == 1 + else "negative", } ) dataset = SeedDataset(data=mapped_dataset, min_samples=1) + assert len(dataset.data) == 3 + assert isinstance(dataset.data[0], DataPoint) + assert dataset.data[0].question == mapped_dataset[0]["question"] + assert dataset.data[0].rationale == mapped_dataset[0]["rationale"] + assert dataset.data[0].final_answer == mapped_dataset[0]["final_answer"] - assert len(dataset.data) == 2, "Should have 2 items from the HF dataset" - assert isinstance( - dataset.data[0], DataPoint - ), "Items should be DataPoint instances" - assert ( - dataset.data[0].question == mapped_dataset[0]["question"] - ), "Question should match the mapped dataset" - assert ( - dataset.data[0].rationale == "Sample rationale" - ), "Rationale should be set correctly" - assert dataset.data[0].final_answer == str( - mapped_dataset[0]["label"] - ), "Final answer should match the label" + invalid_data_missing = [ + { + "question": "What is the sentiment of this review? " + "Missing rationale...", + "final_answer": "positive", + # Missing "rationale" + } + ] + hf_invalid_missing = HFDataset.from_list(invalid_data_missing) + with pytest.raises(ValueError, match="Sample at index 0 validation error"): + SeedDataset(data=hf_invalid_missing, min_samples=1) + + # Test with empty dataset and min_samples=1 + empty_data = [] + hf_empty = HFDataset.from_list(empty_data) + with pytest.raises( + ValueError, match="Dataset must have at least 1 samples, got 0" + ): + SeedDataset(data=hf_empty, min_samples=1) + + # Test with empty dataset and min_samples=0 + dataset_empty = SeedDataset(data=hf_empty, min_samples=0) + assert len(dataset_empty.data) == 0 + + non_dict_data = [ + "Not a dictionary", + { + "question": "Valid question", + "rationale": "Valid rationale", + "final_answer": "positive", + }, + ] + with pytest.raises(TypeError, match="Unsupported data type"): + SeedDataset(data=non_dict_data, min_samples=1) + + data_with_optional = [ + { + "question": "What is the sentiment of this review? " + "This movie was awesome!...", + "rationale": "Positive sentiment detected.", + "final_answer": "positive", + "difficulty": "medium", + "metadata": {"source": "imdb"}, + } + ] + hf_optional = HFDataset.from_list(data_with_optional) + dataset_optional = SeedDataset(data=hf_optional, min_samples=1) + assert dataset_optional.data[0].difficulty == "medium" + assert dataset_optional.data[0].metadata == {"source": "imdb"} def test_seed_dataset_init_pytorch_dataset(): - r"""Test SeedDataset initialization with a real PyTorch Dataset (MNIST).""" - import tempfile + r"""Test SeedDataset initialization with a + mock IMDB-style PyTorch Dataset.""" - import torchvision - import torchvision.transforms as transforms + # Define a reusable PyTorch Dataset class + class MockIMDBDataset(Dataset): + def __init__(self, data_list): + self.data = data_list - with tempfile.TemporaryDirectory() as temp_dir: - transform = transforms.Compose([transforms.ToTensor()]) - - # Load the MNIST dataset - mnist_dataset = torchvision.datasets.MNIST( - root=temp_dir, - train=True, - download=True, - transform=transform, - ) + def __len__(self): + return len(self.data) - def map_mnist_to_datapoint(dataset, idx): - image, label = dataset[idx] - return { - "question": f"What is the digit in this image?" - f"(MNIST sample {idx})", - "rationale": "Analyzing the image to identify the digit." 
- f"(MNIST sample {idx})", - "final_answer": str(label), - } + def __getitem__(self, idx): + return self.data[idx] - mapped_data = [ - map_mnist_to_datapoint(mnist_dataset, i) for i in range(2) - ] + valid_data = [ + { + "text": "This movie was absolutely fantastic, " + "a real joy to watch!", + "label": 1, + "rationale": "The reviewer uses positive adjectives like " + "'fantastic' and 'joy'.", + }, + { + "text": "Terrible acting and a boring plot ruined this film.", + "label": 0, + "rationale": "Negative terms like 'terrible' and 'boring' " + "suggest dissatisfaction.", + }, + { + "text": "An incredible cast made this a thrilling experience.", + "label": 1, + "rationale": "Words like 'incredible' and 'thrilling' " + "reflect a positive reaction.", + }, + ] - dataset = SeedDataset(data=mapped_data, min_samples=1) + mapped_data = [ + { + "question": "What is the sentiment of this review? " + f"{item['text'][:30]}...", + "rationale": item["rationale"], + "final_answer": "positive" if item["label"] == 1 else "negative", + } + for item in valid_data + ] - assert ( - len(dataset.data) == 2 - ), "Should have 2 items from the mapped MNIST dataset" - assert isinstance( - dataset.data[0], DataPoint - ), "Items should be DataPoint instances" - assert dataset.data[0].question.startswith( - "What is the digit in this image?" - ), "Question should be set correctly" - assert dataset.data[0].rationale.startswith( - "Analyzing the image to identify the digit." - ), "Rationale should be set correctly" - assert dataset.data[ - 0 - ].final_answer.isdigit(), "Final answer should be a digit string" + pytorch_dataset = MockIMDBDataset(mapped_data) + dataset = SeedDataset(data=pytorch_dataset, min_samples=1) + assert len(dataset.data) == 3 + assert isinstance(dataset.data[0], DataPoint) + assert dataset.data[0].question == mapped_data[0]["question"] + assert dataset.data[0].rationale == mapped_data[0]["rationale"] + assert dataset.data[0].final_answer == mapped_data[0]["final_answer"] + + invalid_data_missing = [ + { + "question": "What is the sentiment of this review? " + "Missing rationale...", + "final_answer": "positive", + # Missing "rationale" + } + ] + pytorch_invalid_missing = MockIMDBDataset(invalid_data_missing) + with pytest.raises(ValueError, match="Sample at index 0 validation error"): + SeedDataset(data=pytorch_invalid_missing, min_samples=1) + + empty_data = [] + pytorch_empty = MockIMDBDataset(empty_data) + with pytest.raises( + ValueError, match="Dataset must have at least 1 samples, got 0" + ): + SeedDataset(data=pytorch_empty, min_samples=1) + + dataset_empty = SeedDataset(data=pytorch_empty, min_samples=0) + assert len(dataset_empty.data) == 0 + + non_dict_data = [ + "Not a dictionary", + { + "question": "Valid question", + "rationale": "Valid rationale", + "final_answer": "positive", + }, + ] + pytorch_non_dict = MockIMDBDataset(non_dict_data) + with pytest.raises(TypeError, match="Unsupported data type"): + SeedDataset(data=pytorch_non_dict, min_samples=1) + + data_with_optional = [ + { + "question": "What is the sentiment of this review? 
" + "This movie was awesome!...", + "rationale": "Positive sentiment detected.", + "final_answer": "positive", + "difficulty": "medium", + "metadata": {"source": "imdb"}, + } + ] + pytorch_optional = MockIMDBDataset(data_with_optional) + dataset_optional = SeedDataset(data=pytorch_optional, min_samples=1) + assert dataset_optional.data[0].difficulty == "medium" + assert dataset_optional.data[0].metadata == {"source": "imdb"} def test_seed_dataset_init_list_extended(sample_data): - r"""Test SeedDataset initialization with a list of dictionaries, - including optional fields and invalid data.""" + r"""Test SeedDataset initialization with a list of dictionaries.""" + data_with_optional = [ *sample_data, { "question": "What is 5-3?", "rationale": "Subtraction", "final_answer": "2", - "difficulty": "easy", - "metadata": {"topic": "math"}, + "difficulty": "easy", # Optional field + "metadata": {"topic": "math"}, # Optional field }, ] dataset = SeedDataset(data=data_with_optional, min_samples=1) - assert len(dataset.data) == 3, "Should have 3 items from the list" + assert len(dataset.data) == 3, "Dataset should contain 3 items" assert ( dataset.data[2].difficulty == "easy" ), "Optional difficulty field should be preserved" assert dataset.data[2].metadata == { "topic": "math" }, "Optional metadata field should be preserved" + assert ( + dataset.data[0].question == sample_data[0]["question"] + ), "First item question should match" + assert ( + dataset.data[1].final_answer == sample_data[1]["final_answer"] + ), "Second item final_answer should match" - # Test with invalid data (missing required field) - invalid_data = [{"question": "What is 2+2?", "rationale": "Addition"}] - with pytest.raises(ValueError) as exc_info: - SeedDataset(data=invalid_data, min_samples=1) - assert "Sample at index 0 validation error" in str( - exc_info.value - ), "Should raise ValueError for invalid data" + invalid_data_missing = [ + {"question": "What is 2+2?", "rationale": "Addition"} + ] + with pytest.raises(ValueError, match="Sample at index 0 validation error"): + SeedDataset(data=invalid_data_missing, min_samples=1) + + invalid_data_type = [ + { + "question": "What is 3+3?", + "rationale": "Addition", + "final_answer": 6, + } + ] + with pytest.raises(ValueError, match="Sample at index 0 validation error"): + SeedDataset(data=invalid_data_type, min_samples=1) + + empty_data = [] + with pytest.raises( + ValueError, match="Dataset must have at least 1 samples, got 0" + ): + SeedDataset(data=empty_data, min_samples=1) + + dataset_empty = SeedDataset(data=empty_data, min_samples=0) + assert ( + len(dataset_empty.data) == 0 + ), "Empty dataset with min_samples=0 should have no items" + + non_dict_data = [ + "Not a dictionary", + { + "question": "What is 4+4?", + "rationale": "Addition", + "final_answer": "8", + }, + ] + with pytest.raises(TypeError, match="Unsupported data type"): + SeedDataset(data=non_dict_data, min_samples=1) + + mixed_data = [ + { + "question": "What is 1+1?", + "rationale": "Addition", + "final_answer": "2", + }, + {"question": "What is 2+2?"}, + ] + with pytest.raises(ValueError, match="Sample at index 1 validation error"): + SeedDataset(data=mixed_data, min_samples=1) def test_seed_dataset_init_json_file(): r"""Test SeedDataset initialization with a JSON file path.""" - # Create temp JSON file + sample_data = [ { "question": "What is 2+2?", @@ -351,9 +515,7 @@ def test_seed_dataset_init_json_file(): with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: json.dump(sample_data, 
temp_file) temp_file.flush() - dataset = SeedDataset(data=temp_file.name, min_samples=1) - assert len(dataset.data) == 2, "Should have 2 items from the JSON file" assert isinstance( dataset.data[0], DataPoint @@ -371,6 +533,80 @@ def test_seed_dataset_init_json_file(): with pytest.raises(json.JSONDecodeError): SeedDataset(data=invalid_file.name, min_samples=1) + invalid_data_missing = [ + { + "question": "What is 2+2?", + "rationale": "Addition", # Missing "final_answer" + } + ] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump(invalid_data_missing, temp_file) + temp_file.flush() + with pytest.raises( + ValueError, match="Sample at index 0 validation error" + ): + SeedDataset(data=temp_file.name, min_samples=1) + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump([], temp_file) + temp_file.flush() + with pytest.raises( + ValueError, match="Dataset must have at least 1 samples, got 0" + ): + SeedDataset(data=temp_file.name, min_samples=1) + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump([], temp_file) + temp_file.flush() + dataset_empty = SeedDataset(data=temp_file.name, min_samples=0) + assert ( + len(dataset_empty.data) == 0 + ), "Empty dataset with min_samples=0 should have no items" + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump({"not": "a list"}, temp_file) + temp_file.flush() + with pytest.raises( + ValueError, match="JSON file must contain a list of dictionaries" + ): + SeedDataset(data=temp_file.name, min_samples=1) + + data_with_optional = [ + { + "question": "What is 5-3?", + "rationale": "Subtraction", + "final_answer": "2", + "difficulty": "easy", + "metadata": {"topic": "math"}, + } + ] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump(data_with_optional, temp_file) + temp_file.flush() + dataset_optional = SeedDataset(data=temp_file.name, min_samples=1) + assert ( + dataset_optional.data[0].difficulty == "easy" + ), "Optional difficulty field should be preserved" + assert dataset_optional.data[0].metadata == { + "topic": "math" + }, "Optional metadata field should be preserved" + + data_with_extra = [ + { + "question": "What is 4+4?", + "rationale": "Addition", + "final_answer": "8", + "extra_field": "should be ignored", + } + ] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump(data_with_extra, temp_file) + temp_file.flush() + dataset_extra = SeedDataset(data=temp_file.name, min_samples=1) + assert ( + "extra_field" not in dataset_extra.data[0].__dict__ + ), "Extra fields should be ignored" + def test_synthetic_dataset_init(): r"""Test SyntheticDataset initialization.""" From ad1949bf0b5f430ffa975076972f2231cd86e392 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 10:32:51 +0100 Subject: [PATCH 06/16] fix: Change json path handling as Path objects instead of strings and add seed for reproducibility --- camel/datasets/base.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index 480ca94b03..d6561b5f57 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -15,6 +15,7 @@ import json import os import random +from pathlib import Path from typing import ( Any, Callable, @@ -334,8 +335,9 @@ class SeedDataset(Dataset): def __init__( self, - data: Union[HFDataset, Dataset, str, List[Dict[str, Any]]], + data: Union[HFDataset, Dataset, Path, 
List[Dict[str, Any]]], cache_dir: Optional[str] = None, + seed: Optional[int] = None, min_samples: int = 1, **kwargs, ): @@ -346,10 +348,12 @@ def __init__( Input data, which can be: - A Hugging Face Dataset (HFDataset) - A PyTorch Dataset (torch.utils.data.Dataset) - - A string path to a JSON file + - A Path object representing the path to a JSON file - A list of dictionaries with DataPoint-compatible fields cache_dir (Optional[str]): Directory to cache dataset files. (default: :obj:`None`) + seed (Optional[int]): Seed for reproducibility. + (default: :obj:`1`) min_samples (int): Minimum number of samples required. (default: :obj:`1`) **kwargs: Additional dataset parameters. @@ -368,7 +372,7 @@ def __init__( 'cache_dir': self._cache_dir, **kwargs, } - + self._rng = random.Random(seed) # Type checking and conversion into list of dicts if isinstance(data, HFDataset): @@ -378,15 +382,17 @@ def __init__( self._raw_data = [dict(data[i]) for i in range(len(data))] except (TypeError, KeyError, AttributeError) as e: raise TypeError(f"Unsupported PyTorch Dataset: {e}") - elif isinstance(data, str): - if not os.path.exists(data): + + elif isinstance(data, Path): + if not data.exists(): raise FileNotFoundError(f"JSON file not found: {data}") - with open(data, 'r') as f: + with data.open('r') as f: self._raw_data = json.load(f) if not isinstance(self._raw_data, list): raise ValueError( "JSON file must contain a list of dictionaries" ) + elif isinstance(data, list): self._raw_data = data if data is not None else [] else: From 50d079531bc3bc3cc1f13d870642401bb89103d9 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 10:55:05 +0100 Subject: [PATCH 07/16] fix: Move length to init and change sample method to use seed --- camel/datasets/base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index d6561b5f57..a2e1f48da0 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -400,6 +400,7 @@ def __init__( self.data: List[DataPoint] = [] self._setup(min_samples) + self._length = len(self.data) def sample(self) -> DataPoint: r"""Sample a random datapoint from the dataset. @@ -410,9 +411,9 @@ def sample(self) -> DataPoint: Raises: RuntimeError: If the dataset is empty. """ - if not self.data: + if self._length == 0: raise RuntimeError("Dataset is empty, cannot sample.") - idx = random.randint(0, len(self) - 1) + idx = self._rng.randint(0, self._length - 1) return self[idx] def _setup(self, min_samples: int) -> None: @@ -495,7 +496,7 @@ def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: def __len__(self) -> int: r"""Return the size of the dataset.""" - return len(self.data) + return self._length def __getitem__(self, idx: int) -> DataPoint: r"""Get an item from the dataset. @@ -509,7 +510,7 @@ def __getitem__(self, idx: int) -> DataPoint: Raises: IndexError: If idx is out of bounds. 
""" - if idx < 0 or idx >= len(self): + if idx < 0 or idx >= self._length: raise IndexError( f"Index {idx} out of bounds for dataset of size {len(self)}" ) From 79d142900f2f016bf4c3223b2c971880deb1323e Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 11:14:40 +0100 Subject: [PATCH 08/16] feat: Implement strict flag to let user chose between simply skipping invalid datapoints in a seed dataset and throwing an exception --- camel/datasets/base.py | 47 ++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index a2e1f48da0..8187f81f40 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -339,6 +339,7 @@ def __init__( cache_dir: Optional[str] = None, seed: Optional[int] = None, min_samples: int = 1, + strict: bool = False, **kwargs, ): r"""Initialize the seed dataset and validate integrity. @@ -356,6 +357,8 @@ def __init__( (default: :obj:`1`) min_samples (int): Minimum number of samples required. (default: :obj:`1`) + strict (bool): Whether to raise an error on invalid datapoints + (True) or skip/filter them (False). (default: False) **kwargs: Additional dataset parameters. Raises: @@ -373,6 +376,8 @@ def __init__( **kwargs, } self._rng = random.Random(seed) + self._strict = strict + # Type checking and conversion into list of dicts if isinstance(data, HFDataset): @@ -423,14 +428,20 @@ def _setup(self, min_samples: int) -> None: 1. Checks if the dataset meets the minimum sample requirement. 2. Creates the cache directory if specified. 3. Processes raw data into DataPoint objects - for validation and consistency. + for validation and consistency. + + In non-strict mode, invalid datapoints are filtered out + rather than raising an error. Args: min_samples (int): Minimum number of samples required. Raises: - ValueError: If the dataset size is less than - min_samples or if validation fails. + ValueError: If the dataset size is less than min_samples or + if sample validation fails (in strict mode), + or if the dataset size is smaller than + min_samples after filtering invalid datapoints + (in non-strict mode). OSError: If cache directory creation fails. 
""" if len(self._raw_data) < min_samples: @@ -456,22 +467,6 @@ def _setup(self, min_samples: int) -> None: logger.debug("No raw data to process") return - if self._cache_dir: - try: - os.makedirs(self._cache_dir, exist_ok=True) - logger.debug(f"Created cache directory: {self._cache_dir}") - except OSError as e: - logger.error( - f"Failed to create cache directory {self._cache_dir}: {e}" - ) - raise - - if not self._raw_data: - if min_samples > 0: - raise ValueError("No data provided, but min_samples > 0") - logger.debug("No raw data to process") - return - def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: try: return DataPoint( @@ -484,10 +479,18 @@ def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: difficulty=item.get('difficulty', ''), # Match BaseDataset # raw_markdown='' if DataPoint supports it ) + except ValidationError as e: - raise ValueError( - f"Sample at index {idx} validation error: {e}" - ) + if self._strict: + raise ValueError( + f"Sample at index {idx} validation error: {e}" + ) + else: + logger.warning( + f"Skipping invalid sample at index {idx} " + f"due to validation error: {e}" + ) + return None self.data = [ create_datapoint(item, i) for i, item in enumerate(self._raw_data) From 9bd18d4009a8d658bb08b8aa19b57359402d760f Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 11:22:44 +0100 Subject: [PATCH 09/16] fix: Put __len__ and __getitem__ to the top of seed dataset to ensure they are defined before the other functions are --- camel/datasets/base.py | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index 8187f81f40..c6a00f36f3 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -407,6 +407,28 @@ def __init__( self._setup(min_samples) self._length = len(self.data) + def __len__(self) -> int: + r"""Return the size of the dataset.""" + return self._length + + def __getitem__(self, idx: int) -> DataPoint: + r"""Get an item from the dataset. + + Args: + idx (int): Index of the item to get. + + Returns: + DataPoint: DataPoint from the dataset with the given index. + + Raises: + IndexError: If idx is out of bounds. + """ + if idx < 0 or idx >= self._length: + raise IndexError( + f"Index {idx} out of bounds for dataset of size {len(self)}" + ) + return self.data[idx] + def sample(self) -> DataPoint: r"""Sample a random datapoint from the dataset. @@ -497,28 +519,6 @@ def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: ] logger.debug(f"Processed {len(self.data)} data points") - def __len__(self) -> int: - r"""Return the size of the dataset.""" - return self._length - - def __getitem__(self, idx: int) -> DataPoint: - r"""Get an item from the dataset. - - Args: - idx (int): Index of the item to get. - - Returns: - DataPoint: DataPoint from the dataset with the given index. - - Raises: - IndexError: If idx is out of bounds. 
- """ - if idx < 0 or idx >= self._length: - raise IndexError( - f"Index {idx} out of bounds for dataset of size {len(self)}" - ) - return self.data[idx] - @property def metadata(self) -> Dict[str, Any]: r"""Get dataset metadata.""" From 447bf07a5bc6f69f4301071afafbae35efd1dd08 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 18:26:44 +0100 Subject: [PATCH 10/16] fix: Add explanation as to why use list of dicts to store data in Seed Dataset --- camel/datasets/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index c6a00f36f3..59e308934b 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -378,7 +378,9 @@ def __init__( self._rng = random.Random(seed) self._strict = strict - # Type checking and conversion into list of dicts + # Type checking and conversion into list of dicts to have a + # consistent internal format. Since Seed Dataset should be + # small, we can load it entirely into memmory if isinstance(data, HFDataset): self._raw_data = [dict(item) for item in data] From 521f5110abb4209a3abc30f502e98d3526c48981 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 18:30:32 +0100 Subject: [PATCH 11/16] style: Fix code style to adhere to checks --- camel/datasets/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index 59e308934b..b5d074da3c 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -379,7 +379,7 @@ def __init__( self._strict = strict # Type checking and conversion into list of dicts to have a - # consistent internal format. Since Seed Dataset should be + # consistent internal format. Since Seed Dataset should be # small, we can load it entirely into memmory if isinstance(data, HFDataset): From 4162a9aa15bb65a553e69867c9afe713b6393349 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 23:04:32 +0100 Subject: [PATCH 12/16] fix: Update seed init and hf dataset tests to changes in base.py --- test/datasets/test_base_dataset.py | 106 +++++++++++++++++------------ 1 file changed, 63 insertions(+), 43 deletions(-) diff --git a/test/datasets/test_base_dataset.py b/test/datasets/test_base_dataset.py index ac36d7dfdd..ddbc8e93cc 100644 --- a/test/datasets/test_base_dataset.py +++ b/test/datasets/test_base_dataset.py @@ -194,8 +194,7 @@ def test_base_dataset_metadata(): def test_seed_dataset_init(sample_data): - r"""Test SeedDataset initialization with various input types.""" - # Test with list of dictionaries + r"""Test SeedDataset initialization with valid input data.""" dataset = SeedDataset(data=sample_data, min_samples=1) assert dataset._raw_data == sample_data, "Raw data should match input list" assert len(dataset.data) == 2, "Processed data should have 2 items" @@ -205,41 +204,68 @@ def test_seed_dataset_init(sample_data): assert ( dataset.data[0].question == 'What is 2+2?' 
), "DataPoint content should match input" - - # Test min_samples validation with pytest.raises(ValueError) as exc_info: SeedDataset(data=sample_data, min_samples=3) - assert "must have at least 3 samples, got 2" in str( + assert "must have at least 3 samples" in str( exc_info.value ), "Should raise ValueError for insufficient samples" + # Test with an empty dataset when min_samples is 0 dataset_empty = SeedDataset(data=[], min_samples=0) assert len(dataset_empty.data) == 0, "Empty dataset should have no items" +def test_seed_dataset_strict_mode(): + r"""Test SeedDataset in strict mode where + invalid datapoints raise errors.""" + invalid_data = [ + { + "question": "Incomplete sample", + "rationale": "Some reasoning", + } # Missing 'final_answer' + ] + with pytest.raises(ValueError) as exc_info: + # strict=True should raise an error on the first invalid datapoint + SeedDataset(data=invalid_data, min_samples=1, strict=True) + assert "validation error" in str( + exc_info.value + ), "Strict mode should raise ValueError for invalid datapoint" + + +def test_seed_dataset_non_strict_mode(): + r"""Test SeedDataset in non-strict mode where + invalid datapoints are skipped.""" + + invalid_data = [ + {"question": "Incomplete sample", "rationale": "Some reasoning"} + ] + # strict=False should filter out invalid samples + dataset = SeedDataset(data=invalid_data, min_samples=0, strict=False) + # Expect that the invalid sample is skipped, so + # dataset.data should be empty + assert ( + len(dataset.data) == 0 + ), "Non-strict mode should filter out invalid samples" + + def test_seed_dataset_init_hf_dataset(): - r"""Test SeedDataset initialization with a fake IMDB-style - Hugging Face Dataset.""" + r"""Test SeedDataset initialization with a fake IMDB-style Hugging Face Dataset.""" # Mock IMDB-style data mock_imdb_data = [ { - "text": "This movie was absolutely fantastic, " - "a real joy to watch!", + "text": "This movie was absolutely fantastic, a real joy to watch!", "label": 1, - "rationale": "The reviewer uses positive adjectives " - "like 'fantastic' and 'joy'.", + "rationale": "The reviewer uses positive adjectives like 'fantastic' and 'joy'.", }, { "text": "Terrible acting and a boring plot ruined this film.", "label": 0, - "rationale": "Negative terms like 'terrible' and " - "'boring' suggest dissatisfaction.", + "rationale": "Negative terms like 'terrible' and 'boring' suggest dissatisfaction.", }, { "text": "An incredible cast made this a thrilling experience.", "label": 1, - "rationale": "Words like 'incredible' and 'thrilling' " - "reflect a positive reaction.", + "rationale": "Words like 'incredible' and 'thrilling' reflect a positive reaction.", }, ] @@ -247,45 +273,39 @@ def test_seed_dataset_init_hf_dataset(): mapped_dataset = hf_dataset.map( lambda example: { - "question": "What is the sentiment of this review? " - f"{example['text'][:30]}...", + "question": "What is the sentiment of this review? 
" f"{example['text'][:30]}...", "rationale": example["rationale"], - "final_answer": "positive" - if example["label"] == 1 - else "negative", + "final_answer": "positive" if example["label"] == 1 else "negative", } ) - dataset = SeedDataset(data=mapped_dataset, min_samples=1) - assert len(dataset.data) == 3 - assert isinstance(dataset.data[0], DataPoint) - assert dataset.data[0].question == mapped_dataset[0]["question"] - assert dataset.data[0].rationale == mapped_dataset[0]["rationale"] - assert dataset.data[0].final_answer == mapped_dataset[0]["final_answer"] + # Valid data + dataset = SeedDataset(data=mapped_dataset, min_samples=1, strict=True) + assert len(dataset.data) == 3, "There should be 3 valid data points." + assert isinstance(dataset.data[0], DataPoint), "Items should be DataPoint instances." + assert dataset.data[0].question == mapped_dataset[0]["question"], "Question should match input." + assert dataset.data[0].rationale == mapped_dataset[0]["rationale"], "Rationale should match input." + assert dataset.data[0].final_answer == mapped_dataset[0]["final_answer"], "Final answer should match input." + # Invalid data invalid_data_missing = [ { - "question": "What is the sentiment of this review? " - "Missing rationale...", + "question": "What is the sentiment of this review? Missing rationale...", "final_answer": "positive", # Missing "rationale" } ] hf_invalid_missing = HFDataset.from_list(invalid_data_missing) with pytest.raises(ValueError, match="Sample at index 0 validation error"): - SeedDataset(data=hf_invalid_missing, min_samples=1) + SeedDataset(data=hf_invalid_missing, min_samples=1, strict=True) - # Test with empty dataset and min_samples=1 empty_data = [] hf_empty = HFDataset.from_list(empty_data) - with pytest.raises( - ValueError, match="Dataset must have at least 1 samples, got 0" - ): - SeedDataset(data=hf_empty, min_samples=1) + with pytest.raises(ValueError, match="Dataset must have at least 1 samples, got 0"): + SeedDataset(data=hf_empty, min_samples=1, strict=True) - # Test with empty dataset and min_samples=0 - dataset_empty = SeedDataset(data=hf_empty, min_samples=0) - assert len(dataset_empty.data) == 0 + dataset_empty = SeedDataset(data=hf_empty, min_samples=0, strict=True) + assert len(dataset_empty.data) == 0, "Empty dataset should have no valid items." non_dict_data = [ "Not a dictionary", @@ -296,12 +316,11 @@ def test_seed_dataset_init_hf_dataset(): }, ] with pytest.raises(TypeError, match="Unsupported data type"): - SeedDataset(data=non_dict_data, min_samples=1) + SeedDataset(data=non_dict_data, min_samples=1, strict=True) data_with_optional = [ { - "question": "What is the sentiment of this review? " - "This movie was awesome!...", + "question": "What is the sentiment of this review? This movie was awesome!...", "rationale": "Positive sentiment detected.", "final_answer": "positive", "difficulty": "medium", @@ -309,9 +328,10 @@ def test_seed_dataset_init_hf_dataset(): } ] hf_optional = HFDataset.from_list(data_with_optional) - dataset_optional = SeedDataset(data=hf_optional, min_samples=1) - assert dataset_optional.data[0].difficulty == "medium" - assert dataset_optional.data[0].metadata == {"source": "imdb"} + dataset_optional = SeedDataset(data=hf_optional, min_samples=1, strict=True) + assert dataset_optional.data[0].difficulty == "medium", "Difficulty field should be 'medium'." + assert dataset_optional.data[0].metadata == {"source": "imdb"}, "Metadata should match input." 
+ def test_seed_dataset_init_pytorch_dataset(): From 094762ec7df76e2141f16d1d96404b4246839457 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 23:23:05 +0100 Subject: [PATCH 13/16] style: Fix code style to adhere to code style requirements --- test/datasets/test_base_dataset.py | 71 +++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/test/datasets/test_base_dataset.py b/test/datasets/test_base_dataset.py index ddbc8e93cc..0b38635d84 100644 --- a/test/datasets/test_base_dataset.py +++ b/test/datasets/test_base_dataset.py @@ -216,7 +216,7 @@ def test_seed_dataset_init(sample_data): def test_seed_dataset_strict_mode(): - r"""Test SeedDataset in strict mode where + r"""Test SeedDataset in strict mode where invalid datapoints raise errors.""" invalid_data = [ { @@ -233,7 +233,7 @@ def test_seed_dataset_strict_mode(): def test_seed_dataset_non_strict_mode(): - r"""Test SeedDataset in non-strict mode where + r"""Test SeedDataset in non-strict mode where invalid datapoints are skipped.""" invalid_data = [ @@ -241,7 +241,7 @@ def test_seed_dataset_non_strict_mode(): ] # strict=False should filter out invalid samples dataset = SeedDataset(data=invalid_data, min_samples=0, strict=False) - # Expect that the invalid sample is skipped, so + # Expect that the invalid sample is skipped, so # dataset.data should be empty assert ( len(dataset.data) == 0 @@ -249,23 +249,28 @@ def test_seed_dataset_non_strict_mode(): def test_seed_dataset_init_hf_dataset(): - r"""Test SeedDataset initialization with a fake IMDB-style Hugging Face Dataset.""" + r"""Test SeedDataset initialization with a mock + IMDB-style Hugging Face Dataset.""" # Mock IMDB-style data mock_imdb_data = [ { - "text": "This movie was absolutely fantastic, a real joy to watch!", + "text": "This movie was absolutely fantastic, " + "a real joy to watch!", "label": 1, - "rationale": "The reviewer uses positive adjectives like 'fantastic' and 'joy'.", + "rationale": "The reviewer uses positive adjectives like " + "'fantastic' and 'joy'.", }, { "text": "Terrible acting and a boring plot ruined this film.", "label": 0, - "rationale": "Negative terms like 'terrible' and 'boring' suggest dissatisfaction.", + "rationale": "Negative terms like 'terrible' and 'boring' " + "suggest dissatisfaction.", }, { "text": "An incredible cast made this a thrilling experience.", "label": 1, - "rationale": "Words like 'incredible' and 'thrilling' reflect a positive reaction.", + "rationale": "Words like 'incredible' and 'thrilling' reflect " + "a positive reaction.", }, ] @@ -273,24 +278,36 @@ def test_seed_dataset_init_hf_dataset(): mapped_dataset = hf_dataset.map( lambda example: { - "question": "What is the sentiment of this review? " f"{example['text'][:30]}...", + "question": "What is the sentiment of this review? " + f"{example['text'][:30]}...", "rationale": example["rationale"], - "final_answer": "positive" if example["label"] == 1 else "negative", + "final_answer": "positive" + if example["label"] == 1 + else "negative", } ) # Valid data dataset = SeedDataset(data=mapped_dataset, min_samples=1, strict=True) assert len(dataset.data) == 3, "There should be 3 valid data points." - assert isinstance(dataset.data[0], DataPoint), "Items should be DataPoint instances." - assert dataset.data[0].question == mapped_dataset[0]["question"], "Question should match input." - assert dataset.data[0].rationale == mapped_dataset[0]["rationale"], "Rationale should match input." 
- assert dataset.data[0].final_answer == mapped_dataset[0]["final_answer"], "Final answer should match input." + assert isinstance( + dataset.data[0], DataPoint + ), "Items should be DataPoint instances." + assert ( + dataset.data[0].question == mapped_dataset[0]["question"] + ), "Question should match input." + assert ( + dataset.data[0].rationale == mapped_dataset[0]["rationale"] + ), "Rationale should match input." + assert ( + dataset.data[0].final_answer == mapped_dataset[0]["final_answer"] + ), "Final answer should match input." # Invalid data invalid_data_missing = [ { - "question": "What is the sentiment of this review? Missing rationale...", + "question": "What is the sentiment of this review? " + "Missing rationale...", "final_answer": "positive", # Missing "rationale" } @@ -301,11 +318,15 @@ def test_seed_dataset_init_hf_dataset(): empty_data = [] hf_empty = HFDataset.from_list(empty_data) - with pytest.raises(ValueError, match="Dataset must have at least 1 samples, got 0"): + with pytest.raises( + ValueError, match="Dataset must have at least 1 samples, got 0" + ): SeedDataset(data=hf_empty, min_samples=1, strict=True) dataset_empty = SeedDataset(data=hf_empty, min_samples=0, strict=True) - assert len(dataset_empty.data) == 0, "Empty dataset should have no valid items." + assert ( + len(dataset_empty.data) == 0 + ), "Empty dataset should have no valid items." non_dict_data = [ "Not a dictionary", @@ -320,7 +341,8 @@ def test_seed_dataset_init_hf_dataset(): data_with_optional = [ { - "question": "What is the sentiment of this review? This movie was awesome!...", + "question": "What is the sentiment of this review? " + "This movie was awesome!...", "rationale": "Positive sentiment detected.", "final_answer": "positive", "difficulty": "medium", @@ -328,10 +350,15 @@ def test_seed_dataset_init_hf_dataset(): } ] hf_optional = HFDataset.from_list(data_with_optional) - dataset_optional = SeedDataset(data=hf_optional, min_samples=1, strict=True) - assert dataset_optional.data[0].difficulty == "medium", "Difficulty field should be 'medium'." - assert dataset_optional.data[0].metadata == {"source": "imdb"}, "Metadata should match input." - + dataset_optional = SeedDataset( + data=hf_optional, min_samples=1, strict=True + ) + assert ( + dataset_optional.data[0].difficulty == "medium" + ), "Difficulty field should be 'medium'." + assert dataset_optional.data[0].metadata == { + "source": "imdb" + }, "Metadata should match input." 
def test_seed_dataset_init_pytorch_dataset(): From 5d222c9af764ce321aa57b4c183d54ef7d518ece Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 23:42:39 +0100 Subject: [PATCH 14/16] fix: Adjust code to utilize self._length in getitem and cast len(data) to a Sized to pass mypy tests --- camel/datasets/base.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index b5d074da3c..df49bfdc5e 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -22,8 +22,10 @@ Dict, List, Optional, + Sized, TypeVar, Union, + cast, ) import torch @@ -386,7 +388,9 @@ def __init__( self._raw_data = [dict(item) for item in data] elif isinstance(data, Dataset): try: - self._raw_data = [dict(data[i]) for i in range(len(data))] + self._raw_data = [ + dict(data[i]) for i in range(len(cast(Sized, data))) + ] except (TypeError, KeyError, AttributeError) as e: raise TypeError(f"Unsupported PyTorch Dataset: {e}") @@ -427,7 +431,7 @@ def __getitem__(self, idx: int) -> DataPoint: """ if idx < 0 or idx >= self._length: raise IndexError( - f"Index {idx} out of bounds for dataset of size {len(self)}" + f"Index {idx} out of bounds for dataset of size {self._length}" ) return self.data[idx] From a7f2f02bcbef961466ae60aaafe8c808efbb58de Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Mon, 10 Mar 2025 00:51:32 +0100 Subject: [PATCH 15/16] fix: Adjust create datapoint in seed dataset to properly work with strict mode --- camel/datasets/base.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index df49bfdc5e..dc2bfbef52 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -495,7 +495,9 @@ def _setup(self, min_samples: int) -> None: logger.debug("No raw data to process") return - def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: + def create_datapoint( + item: Dict[str, Any], idx: int + ) -> Optional[DataPoint]: try: return DataPoint( question=item.get('question', ''), @@ -520,10 +522,14 @@ def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: ) return None - self.data = [ + raw_data = [ create_datapoint(item, i) for i, item in enumerate(self._raw_data) ] - logger.debug(f"Processed {len(self.data)} data points") + self.data = [dp for dp in raw_data if dp is not None] + logger.debug( + f"Processed {len(raw_data)} data points, of which " + f"{len(self.data)} were valid." + ) @property def metadata(self) -> Dict[str, Any]: From 3f6aec08bcbefa68954c126596fa4cfbc4fd2369 Mon Sep 17 00:00:00 2001 From: hallerite Date: Mon, 10 Mar 2025 00:58:40 +0100 Subject: [PATCH 16/16] fix: remove casting --- camel/datasets/base.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index dc2bfbef52..30093cc2dc 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -25,7 +25,6 @@ Sized, TypeVar, Union, - cast, ) import torch @@ -387,13 +386,17 @@ def __init__( if isinstance(data, HFDataset): self._raw_data = [dict(item) for item in data] elif isinstance(data, Dataset): - try: - self._raw_data = [ - dict(data[i]) for i in range(len(cast(Sized, data))) - ] - except (TypeError, KeyError, AttributeError) as e: - raise TypeError(f"Unsupported PyTorch Dataset: {e}") + if not isinstance(data, Sized): + raise TypeError( + f"{type(data).__name__} does not implement `__len__()`." 
+ ) + # Make MyPy happy by ensuring indexability + assert callable( + getattr(data, "__getitem__", None) + ), "Dataset does not support indexing." + + self._raw_data = [dict(data[i]) for i in range(len(data))] elif isinstance(data, Path): if not data.exists(): raise FileNotFoundError(f"JSON file not found: {data}") @@ -403,7 +406,6 @@ def __init__( raise ValueError( "JSON file must contain a list of dictionaries" ) - elif isinstance(data, list): self._raw_data = data if data is not None else [] else:
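
With the series applied, sampling is reproducible whenever a seed is supplied, since the constructor seeds its own random.Random. A closing sketch (illustrative, not part of any patch; it assumes DataPoint, as a pydantic model, compares by field values):

    from camel.datasets.base import SeedDataset

    rows = [
        {"question": f"Q{i}", "rationale": f"R{i}", "final_answer": str(i)}
        for i in range(5)
    ]

    # Identical seeds draw identical sequences of datapoints
    a = SeedDataset(data=rows, seed=42)
    b = SeedDataset(data=rows, seed=42)
    assert a.sample() == b.sample()
    assert a.sample() == b.sample()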