From 82eb5931842077b89cfdfc00726c49cf4402f1c4 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Fri, 7 Mar 2025 13:48:55 +0100 Subject: [PATCH 01/16] feat: Refactor Seed Dataset to be possible to be initialized from HF/Pytorch/JSON/list of Dicts, remove the need for setup call and subsequently cleanup --- camel/datasets/base.py | 183 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 167 insertions(+), 16 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index e085eeb462..1cb71d50a0 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -12,6 +12,7 @@ # limitations under the License. # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= +import json import os import random from typing import ( @@ -322,26 +323,31 @@ def to_pytorch_dataset( return dataset -class SeedDataset(BaseDataset): +class SeedDataset(Dataset): r"""A dataset containing validated seed examples for data generation. Ensures that all items adhere to the DataPoint schema. - This class is used to initialize a dataset from a list of dictionary items, - validating each against the DataPoint schema. + This class can initialize from Hugging Face Datasets, + PyTorch Datasets, JSON file paths, or lists of dictionaries, + converting them into a consistent internal format. """ def __init__( self, - data: List[Dict[str, str]], + data: Union[HFDataset, Dataset, str, List[Dict[str, Any]]], cache_dir: Optional[str] = None, min_samples: int = 1, **kwargs, ): - r"""Initialize the seed dataset. + r"""Initialize the seed dataset and validate integrity. Args: - data (List[Dict[str, str]]): List of dictionary items to create the - dataset from. + data (Union[HFDataset, Dataset, str, List[Dict[str, Any]]]): + Input data, which can be: + - A Hugging Face Dataset (HFDataset) + - A PyTorch Dataset (torch.utils.data.Dataset) + - A string path to a JSON file + - A list of dictionaries with DataPoint-compatible fields cache_dir (Optional[str]): Directory to cache dataset files. (default: :obj:`None`) min_samples (int): Minimum number of samples required. @@ -349,19 +355,164 @@ def __init__( **kwargs: Additional dataset parameters. Raises: - ValueError: If dataset size is less than min_samples or if sample - validation fails. + TypeError: If the data type is not supported. + ValueError: If dataset size is less than min_samples or + if sample validation fails. + FileNotFoundError: If the JSON file path doesn't exist. + json.JSONDecodeError: If the JSON file is invalid. 
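+
+        Example:
+            A minimal sketch of the list-of-dicts path (illustrative,
+            not part of the original patch):
+
+                dataset = SeedDataset(
+                    data=[{
+                        "question": "What is 2+2?",
+                        "rationale": "Addition",
+                        "final_answer": "4",
+                    }],
+                    min_samples=1,
+                )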
""" - if len(data) < min_samples: + + # Store all parameters in metadata dict for compatibility + self._cache_dir = str(cache_dir) if cache_dir is not None else None + self._metadata = { + 'cache_dir': self._cache_dir, + **kwargs, + } + + # Type checking and conversion into list of dicts + + if isinstance(data, HFDataset): + self._raw_data = [dict(item) for item in data] + elif isinstance(data, Dataset): + try: + self._raw_data = [dict(data[i]) for i in range(len(data))] + except (TypeError, KeyError, AttributeError) as e: + raise TypeError(f"Unsupported PyTorch Dataset: {e}") + elif isinstance(data, str): + if not os.path.exists(data): + raise FileNotFoundError(f"JSON file not found: {data}") + with open(data, 'r') as f: + self._raw_data = json.load(f) + if not isinstance(self._raw_data, list): + raise ValueError( + "JSON file must contain a list of dictionaries" + ) + elif isinstance(data, list): + self._raw_data = data if data is not None else [] + else: + raise TypeError("Unsupported data type") + + self.data: List[DataPoint] = [] + self._setup(min_samples) + + def sample(self) -> DataPoint: + r"""Sample a random datapoint from the dataset. + + Returns: + DataPoint: A randomly sampled DataPoint. + + Raises: + RuntimeError: If the dataset is empty. + """ + if not self.data: + raise RuntimeError("Dataset is empty, cannot sample.") + idx = random.randint(0, len(self) - 1) + return self[idx] + + def _setup(self, min_samples: int) -> None: + r"""Set up the dataset by validating and processing raw data. + + This method: + 1. Checks if the dataset meets the minimum sample requirement. + 2. Creates the cache directory if specified. + 3. Processes raw data into DataPoint objects + for validation and consistency. + + Args: + min_samples (int): Minimum number of samples required. + + Raises: + ValueError: If the dataset size is less than + min_samples or if validation fails. + OSError: If cache directory creation fails. + """ + if len(self._raw_data) < min_samples: raise ValueError( - f"Seed dataset must contain at least {min_samples} samples." 
+ f"Dataset must have at least {min_samples} samples," + f"got {len(self._raw_data)}" ) - super().__init__( - data=data, - cache_dir=cache_dir, - **kwargs, - ) + if self._cache_dir: + try: + os.makedirs(self._cache_dir, exist_ok=True) + logger.debug(f"Created cache directory: {self._cache_dir}") + except OSError as e: + logger.error( + f"Failed to create cache directory {self._cache_dir}: {e}" + ) + raise + + # Process raw data into DataPoint objects for validation purposes + if not self._raw_data: + if min_samples > 0: + raise ValueError("No data provided, but min_samples > 0") + logger.debug("No raw data to process") + return + + if self._cache_dir: + try: + os.makedirs(self._cache_dir, exist_ok=True) + logger.debug(f"Created cache directory: {self._cache_dir}") + except OSError as e: + logger.error( + f"Failed to create cache directory {self._cache_dir}: {e}" + ) + raise + + if not self._raw_data: + if min_samples > 0: + raise ValueError("No data provided, but min_samples > 0") + logger.debug("No raw data to process") + return + + def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: + try: + return DataPoint( + question=item.get('question', ''), + rationale=item.get('rationale', ''), + final_answer=item.get('final_answer', ''), + metadata=item.get('metadata', {}) + if isinstance(item.get('metadata'), dict) + else {}, + difficulty=item.get('difficulty', ''), # Match BaseDataset + # raw_markdown='' if DataPoint supports it + ) + except ValidationError as e: + raise ValueError( + f"Sample at index {idx} validation error: {e}" + ) + + self.data = [ + create_datapoint(item, i) for i, item in enumerate(self._raw_data) + ] + logger.debug(f"Processed {len(self.data)} data points") + + def __len__(self) -> int: + r"""Return the size of the dataset.""" + return len(self.data) + + def __getitem__(self, idx: int) -> DataPoint: + r"""Get an item from the dataset. + + Args: + idx (int): Index of the item to get. + + Returns: + DataPoint: DataPoint from the dataset with the given index. + + Raises: + IndexError: If idx is out of bounds. 
+ """ + if idx < 0 or idx >= len(self): + raise IndexError( + f"Index {idx} out of bounds for dataset of size {len(self)}" + ) + return self.data[idx] + + @property + def metadata(self) -> Dict[str, Any]: + r"""Get dataset metadata.""" + return self._metadata.copy() class SyntheticDataset(BaseDataset): From 52f012a4d26b558b7fa45cae68f3e2f6913c2440 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Fri, 7 Mar 2025 14:09:00 +0100 Subject: [PATCH 02/16] fix: Update Seed Dataset tests according to the changes --- test/datasets/test_base_dataset.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/test/datasets/test_base_dataset.py b/test/datasets/test_base_dataset.py index 83db9ad60a..4b6aead2e4 100644 --- a/test/datasets/test_base_dataset.py +++ b/test/datasets/test_base_dataset.py @@ -192,12 +192,22 @@ def test_base_dataset_metadata(): def test_seed_dataset_init(sample_data): - r"""Test SeedDataset initialization.""" + r"""Test SeedDataset initialization with various input types.""" + # Test with list of dictionaries dataset = SeedDataset(data=sample_data, min_samples=1) - assert dataset._raw_data == sample_data + assert dataset._raw_data == sample_data, "Raw data should match input list" + assert len(dataset.data) == 2, "Processed data should have 2 items" + assert isinstance(dataset.data[0], DataPoint), "Items should be DataPoint instances" + assert dataset.data[0].question == 'What is 2+2?', "DataPoint content should match input" - with pytest.raises(ValueError): + # Test min_samples validation + with pytest.raises(ValueError) as exc_info: SeedDataset(data=sample_data, min_samples=3) + assert "must have at least 3 samples, got 2" in str(exc_info.value), "Should raise ValueError for insufficient samples" + + # Test with empty data and min_samples=0 + dataset_empty = SeedDataset(data=[], min_samples=0) + assert len(dataset_empty.data) == 0, "Empty dataset should have no items" def test_synthetic_dataset_init(): From 93a114cb2476ceb754c35eb519c05cde01cb2a85 Mon Sep 17 00:00:00 2001 From: hallerite Date: Fri, 7 Mar 2025 15:27:35 +0100 Subject: [PATCH 03/16] fix: fix precommit and missing space for assertion --- camel/datasets/base.py | 2 +- test/datasets/test_base_dataset.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index 1cb71d50a0..480ca94b03 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -428,7 +428,7 @@ def _setup(self, min_samples: int) -> None: """ if len(self._raw_data) < min_samples: raise ValueError( - f"Dataset must have at least {min_samples} samples," + f"Dataset must have at least {min_samples} samples, " f"got {len(self._raw_data)}" ) diff --git a/test/datasets/test_base_dataset.py b/test/datasets/test_base_dataset.py index 4b6aead2e4..318618c894 100644 --- a/test/datasets/test_base_dataset.py +++ b/test/datasets/test_base_dataset.py @@ -197,13 +197,19 @@ def test_seed_dataset_init(sample_data): dataset = SeedDataset(data=sample_data, min_samples=1) assert dataset._raw_data == sample_data, "Raw data should match input list" assert len(dataset.data) == 2, "Processed data should have 2 items" - assert isinstance(dataset.data[0], DataPoint), "Items should be DataPoint instances" - assert dataset.data[0].question == 'What is 2+2?', "DataPoint content should match input" + assert isinstance( + dataset.data[0], DataPoint + ), "Items should be DataPoint instances" + assert ( + dataset.data[0].question == 'What is 2+2?' 
+ ), "DataPoint content should match input" # Test min_samples validation with pytest.raises(ValueError) as exc_info: SeedDataset(data=sample_data, min_samples=3) - assert "must have at least 3 samples, got 2" in str(exc_info.value), "Should raise ValueError for insufficient samples" + assert "must have at least 3 samples, got 2" in str( + exc_info.value + ), "Should raise ValueError for insufficient samples" # Test with empty data and min_samples=0 dataset_empty = SeedDataset(data=[], min_samples=0) From 7b08d62679b429d461d763c5fe231ccc997c22c7 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Fri, 7 Mar 2025 19:10:25 +0100 Subject: [PATCH 04/16] feat: Extend test coverage to include all possible conversions --- test/datasets/test_base_dataset.py | 158 ++++++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 1 deletion(-) diff --git a/test/datasets/test_base_dataset.py b/test/datasets/test_base_dataset.py index 318618c894..021b472e67 100644 --- a/test/datasets/test_base_dataset.py +++ b/test/datasets/test_base_dataset.py @@ -12,12 +12,14 @@ # limitations under the License. # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= +import json import tempfile from unittest.mock import AsyncMock, MagicMock import pytest import torch from datasets import Dataset as HFDataset +from datasets import load_dataset from pydantic import ValidationError from camel.datasets.base import ( @@ -211,11 +213,165 @@ def test_seed_dataset_init(sample_data): exc_info.value ), "Should raise ValueError for insufficient samples" - # Test with empty data and min_samples=0 dataset_empty = SeedDataset(data=[], min_samples=0) assert len(dataset_empty.data) == 0, "Empty dataset should have no items" +# Test the conversion of different dataset formats + + +def test_seed_dataset_init_hf_dataset(): + r"""Test SeedDataset initialization with a Hugging Face Dataset.""" + + hf_dataset = load_dataset("emotion", split="train[:2]") + + mapped_dataset = hf_dataset.map( + lambda example: { + "question": example["text"], + # Dummy value since "emotion" lacks this + "rationale": "Sample rationale", + "final_answer": str(example["label"]), + } + ) + + dataset = SeedDataset(data=mapped_dataset, min_samples=1) + + assert len(dataset.data) == 2, "Should have 2 items from the HF dataset" + assert isinstance( + dataset.data[0], DataPoint + ), "Items should be DataPoint instances" + assert ( + dataset.data[0].question == mapped_dataset[0]["question"] + ), "Question should match the mapped dataset" + assert ( + dataset.data[0].rationale == "Sample rationale" + ), "Rationale should be set correctly" + assert dataset.data[0].final_answer == str( + mapped_dataset[0]["label"] + ), "Final answer should match the label" + + +def test_seed_dataset_init_pytorch_dataset(): + r"""Test SeedDataset initialization with a real PyTorch Dataset (MNIST).""" + import tempfile + + import torchvision + import torchvision.transforms as transforms + + with tempfile.TemporaryDirectory() as temp_dir: + transform = transforms.Compose([transforms.ToTensor()]) + + # Load the MNIST dataset + mnist_dataset = torchvision.datasets.MNIST( + root=temp_dir, + train=True, + download=True, + transform=transform, + ) + + def map_mnist_to_datapoint(dataset, idx): + image, label = dataset[idx] + return { + "question": f"What is the digit in this image?" + f"(MNIST sample {idx})", + "rationale": "Analyzing the image to identify the digit." 
+ f"(MNIST sample {idx})", + "final_answer": str(label), + } + + mapped_data = [ + map_mnist_to_datapoint(mnist_dataset, i) for i in range(2) + ] + + dataset = SeedDataset(data=mapped_data, min_samples=1) + + assert ( + len(dataset.data) == 2 + ), "Should have 2 items from the mapped MNIST dataset" + assert isinstance( + dataset.data[0], DataPoint + ), "Items should be DataPoint instances" + assert dataset.data[0].question.startswith( + "What is the digit in this image?" + ), "Question should be set correctly" + assert dataset.data[0].rationale.startswith( + "Analyzing the image to identify the digit." + ), "Rationale should be set correctly" + assert dataset.data[ + 0 + ].final_answer.isdigit(), "Final answer should be a digit string" + + +def test_seed_dataset_init_list_extended(sample_data): + r"""Test SeedDataset initialization with a list of dictionaries, + including optional fields and invalid data.""" + data_with_optional = [ + *sample_data, + { + "question": "What is 5-3?", + "rationale": "Subtraction", + "final_answer": "2", + "difficulty": "easy", + "metadata": {"topic": "math"}, + }, + ] + dataset = SeedDataset(data=data_with_optional, min_samples=1) + assert len(dataset.data) == 3, "Should have 3 items from the list" + assert ( + dataset.data[2].difficulty == "easy" + ), "Optional difficulty field should be preserved" + assert dataset.data[2].metadata == { + "topic": "math" + }, "Optional metadata field should be preserved" + + # Test with invalid data (missing required field) + invalid_data = [{"question": "What is 2+2?", "rationale": "Addition"}] + with pytest.raises(ValueError) as exc_info: + SeedDataset(data=invalid_data, min_samples=1) + assert "Sample at index 0 validation error" in str( + exc_info.value + ), "Should raise ValueError for invalid data" + + +def test_seed_dataset_init_json_file(): + r"""Test SeedDataset initialization with a JSON file path.""" + # Create temp JSON file + sample_data = [ + { + "question": "What is 2+2?", + "rationale": "Addition", + "final_answer": "4", + }, + { + "question": "What is 3×3?", + "rationale": "Multiplication", + "final_answer": "9", + }, + ] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump(sample_data, temp_file) + temp_file.flush() + + dataset = SeedDataset(data=temp_file.name, min_samples=1) + + assert len(dataset.data) == 2, "Should have 2 items from the JSON file" + assert isinstance( + dataset.data[0], DataPoint + ), "Items should be DataPoint instances" + assert ( + dataset.data[0].question == "What is 2+2?" 
+ ), "Question should match the JSON data" + assert ( + dataset.data[1].final_answer == "9" + ), "Final answer should match the JSON data" + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as invalid_file: + invalid_file.write("Invalid JSON") + invalid_file.flush() + with pytest.raises(json.JSONDecodeError): + SeedDataset(data=invalid_file.name, min_samples=1) + + def test_synthetic_dataset_init(): r"""Test SyntheticDataset initialization.""" dataset = SyntheticDataset() From 3f1861bb0daaf79d1fdf1915fa79e14b587d8c2f Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 10:08:55 +0100 Subject: [PATCH 05/16] fix: Choose more suitable mock data and enhance test coverage --- test/datasets/test_base_dataset.py | 396 +++++++++++++++++++++++------ 1 file changed, 316 insertions(+), 80 deletions(-) diff --git a/test/datasets/test_base_dataset.py b/test/datasets/test_base_dataset.py index 021b472e67..ac36d7dfdd 100644 --- a/test/datasets/test_base_dataset.py +++ b/test/datasets/test_base_dataset.py @@ -19,8 +19,8 @@ import pytest import torch from datasets import Dataset as HFDataset -from datasets import load_dataset from pydantic import ValidationError +from torch.utils.data import Dataset from camel.datasets.base import ( BaseDataset, @@ -217,125 +217,289 @@ def test_seed_dataset_init(sample_data): assert len(dataset_empty.data) == 0, "Empty dataset should have no items" -# Test the conversion of different dataset formats - - def test_seed_dataset_init_hf_dataset(): - r"""Test SeedDataset initialization with a Hugging Face Dataset.""" + r"""Test SeedDataset initialization with a fake IMDB-style + Hugging Face Dataset.""" + # Mock IMDB-style data + mock_imdb_data = [ + { + "text": "This movie was absolutely fantastic, " + "a real joy to watch!", + "label": 1, + "rationale": "The reviewer uses positive adjectives " + "like 'fantastic' and 'joy'.", + }, + { + "text": "Terrible acting and a boring plot ruined this film.", + "label": 0, + "rationale": "Negative terms like 'terrible' and " + "'boring' suggest dissatisfaction.", + }, + { + "text": "An incredible cast made this a thrilling experience.", + "label": 1, + "rationale": "Words like 'incredible' and 'thrilling' " + "reflect a positive reaction.", + }, + ] - hf_dataset = load_dataset("emotion", split="train[:2]") + hf_dataset = HFDataset.from_list(mock_imdb_data) mapped_dataset = hf_dataset.map( lambda example: { - "question": example["text"], - # Dummy value since "emotion" lacks this - "rationale": "Sample rationale", - "final_answer": str(example["label"]), + "question": "What is the sentiment of this review? 
" + f"{example['text'][:30]}...", + "rationale": example["rationale"], + "final_answer": "positive" + if example["label"] == 1 + else "negative", } ) dataset = SeedDataset(data=mapped_dataset, min_samples=1) + assert len(dataset.data) == 3 + assert isinstance(dataset.data[0], DataPoint) + assert dataset.data[0].question == mapped_dataset[0]["question"] + assert dataset.data[0].rationale == mapped_dataset[0]["rationale"] + assert dataset.data[0].final_answer == mapped_dataset[0]["final_answer"] - assert len(dataset.data) == 2, "Should have 2 items from the HF dataset" - assert isinstance( - dataset.data[0], DataPoint - ), "Items should be DataPoint instances" - assert ( - dataset.data[0].question == mapped_dataset[0]["question"] - ), "Question should match the mapped dataset" - assert ( - dataset.data[0].rationale == "Sample rationale" - ), "Rationale should be set correctly" - assert dataset.data[0].final_answer == str( - mapped_dataset[0]["label"] - ), "Final answer should match the label" + invalid_data_missing = [ + { + "question": "What is the sentiment of this review? " + "Missing rationale...", + "final_answer": "positive", + # Missing "rationale" + } + ] + hf_invalid_missing = HFDataset.from_list(invalid_data_missing) + with pytest.raises(ValueError, match="Sample at index 0 validation error"): + SeedDataset(data=hf_invalid_missing, min_samples=1) + + # Test with empty dataset and min_samples=1 + empty_data = [] + hf_empty = HFDataset.from_list(empty_data) + with pytest.raises( + ValueError, match="Dataset must have at least 1 samples, got 0" + ): + SeedDataset(data=hf_empty, min_samples=1) + + # Test with empty dataset and min_samples=0 + dataset_empty = SeedDataset(data=hf_empty, min_samples=0) + assert len(dataset_empty.data) == 0 + + non_dict_data = [ + "Not a dictionary", + { + "question": "Valid question", + "rationale": "Valid rationale", + "final_answer": "positive", + }, + ] + with pytest.raises(TypeError, match="Unsupported data type"): + SeedDataset(data=non_dict_data, min_samples=1) + + data_with_optional = [ + { + "question": "What is the sentiment of this review? " + "This movie was awesome!...", + "rationale": "Positive sentiment detected.", + "final_answer": "positive", + "difficulty": "medium", + "metadata": {"source": "imdb"}, + } + ] + hf_optional = HFDataset.from_list(data_with_optional) + dataset_optional = SeedDataset(data=hf_optional, min_samples=1) + assert dataset_optional.data[0].difficulty == "medium" + assert dataset_optional.data[0].metadata == {"source": "imdb"} def test_seed_dataset_init_pytorch_dataset(): - r"""Test SeedDataset initialization with a real PyTorch Dataset (MNIST).""" - import tempfile + r"""Test SeedDataset initialization with a + mock IMDB-style PyTorch Dataset.""" - import torchvision - import torchvision.transforms as transforms + # Define a reusable PyTorch Dataset class + class MockIMDBDataset(Dataset): + def __init__(self, data_list): + self.data = data_list - with tempfile.TemporaryDirectory() as temp_dir: - transform = transforms.Compose([transforms.ToTensor()]) - - # Load the MNIST dataset - mnist_dataset = torchvision.datasets.MNIST( - root=temp_dir, - train=True, - download=True, - transform=transform, - ) + def __len__(self): + return len(self.data) - def map_mnist_to_datapoint(dataset, idx): - image, label = dataset[idx] - return { - "question": f"What is the digit in this image?" - f"(MNIST sample {idx})", - "rationale": "Analyzing the image to identify the digit." 
- f"(MNIST sample {idx})", - "final_answer": str(label), - } + def __getitem__(self, idx): + return self.data[idx] - mapped_data = [ - map_mnist_to_datapoint(mnist_dataset, i) for i in range(2) - ] + valid_data = [ + { + "text": "This movie was absolutely fantastic, " + "a real joy to watch!", + "label": 1, + "rationale": "The reviewer uses positive adjectives like " + "'fantastic' and 'joy'.", + }, + { + "text": "Terrible acting and a boring plot ruined this film.", + "label": 0, + "rationale": "Negative terms like 'terrible' and 'boring' " + "suggest dissatisfaction.", + }, + { + "text": "An incredible cast made this a thrilling experience.", + "label": 1, + "rationale": "Words like 'incredible' and 'thrilling' " + "reflect a positive reaction.", + }, + ] - dataset = SeedDataset(data=mapped_data, min_samples=1) + mapped_data = [ + { + "question": "What is the sentiment of this review? " + f"{item['text'][:30]}...", + "rationale": item["rationale"], + "final_answer": "positive" if item["label"] == 1 else "negative", + } + for item in valid_data + ] - assert ( - len(dataset.data) == 2 - ), "Should have 2 items from the mapped MNIST dataset" - assert isinstance( - dataset.data[0], DataPoint - ), "Items should be DataPoint instances" - assert dataset.data[0].question.startswith( - "What is the digit in this image?" - ), "Question should be set correctly" - assert dataset.data[0].rationale.startswith( - "Analyzing the image to identify the digit." - ), "Rationale should be set correctly" - assert dataset.data[ - 0 - ].final_answer.isdigit(), "Final answer should be a digit string" + pytorch_dataset = MockIMDBDataset(mapped_data) + dataset = SeedDataset(data=pytorch_dataset, min_samples=1) + assert len(dataset.data) == 3 + assert isinstance(dataset.data[0], DataPoint) + assert dataset.data[0].question == mapped_data[0]["question"] + assert dataset.data[0].rationale == mapped_data[0]["rationale"] + assert dataset.data[0].final_answer == mapped_data[0]["final_answer"] + + invalid_data_missing = [ + { + "question": "What is the sentiment of this review? " + "Missing rationale...", + "final_answer": "positive", + # Missing "rationale" + } + ] + pytorch_invalid_missing = MockIMDBDataset(invalid_data_missing) + with pytest.raises(ValueError, match="Sample at index 0 validation error"): + SeedDataset(data=pytorch_invalid_missing, min_samples=1) + + empty_data = [] + pytorch_empty = MockIMDBDataset(empty_data) + with pytest.raises( + ValueError, match="Dataset must have at least 1 samples, got 0" + ): + SeedDataset(data=pytorch_empty, min_samples=1) + + dataset_empty = SeedDataset(data=pytorch_empty, min_samples=0) + assert len(dataset_empty.data) == 0 + + non_dict_data = [ + "Not a dictionary", + { + "question": "Valid question", + "rationale": "Valid rationale", + "final_answer": "positive", + }, + ] + pytorch_non_dict = MockIMDBDataset(non_dict_data) + with pytest.raises(TypeError, match="Unsupported data type"): + SeedDataset(data=pytorch_non_dict, min_samples=1) + + data_with_optional = [ + { + "question": "What is the sentiment of this review? 
" + "This movie was awesome!...", + "rationale": "Positive sentiment detected.", + "final_answer": "positive", + "difficulty": "medium", + "metadata": {"source": "imdb"}, + } + ] + pytorch_optional = MockIMDBDataset(data_with_optional) + dataset_optional = SeedDataset(data=pytorch_optional, min_samples=1) + assert dataset_optional.data[0].difficulty == "medium" + assert dataset_optional.data[0].metadata == {"source": "imdb"} def test_seed_dataset_init_list_extended(sample_data): - r"""Test SeedDataset initialization with a list of dictionaries, - including optional fields and invalid data.""" + r"""Test SeedDataset initialization with a list of dictionaries.""" + data_with_optional = [ *sample_data, { "question": "What is 5-3?", "rationale": "Subtraction", "final_answer": "2", - "difficulty": "easy", - "metadata": {"topic": "math"}, + "difficulty": "easy", # Optional field + "metadata": {"topic": "math"}, # Optional field }, ] dataset = SeedDataset(data=data_with_optional, min_samples=1) - assert len(dataset.data) == 3, "Should have 3 items from the list" + assert len(dataset.data) == 3, "Dataset should contain 3 items" assert ( dataset.data[2].difficulty == "easy" ), "Optional difficulty field should be preserved" assert dataset.data[2].metadata == { "topic": "math" }, "Optional metadata field should be preserved" + assert ( + dataset.data[0].question == sample_data[0]["question"] + ), "First item question should match" + assert ( + dataset.data[1].final_answer == sample_data[1]["final_answer"] + ), "Second item final_answer should match" - # Test with invalid data (missing required field) - invalid_data = [{"question": "What is 2+2?", "rationale": "Addition"}] - with pytest.raises(ValueError) as exc_info: - SeedDataset(data=invalid_data, min_samples=1) - assert "Sample at index 0 validation error" in str( - exc_info.value - ), "Should raise ValueError for invalid data" + invalid_data_missing = [ + {"question": "What is 2+2?", "rationale": "Addition"} + ] + with pytest.raises(ValueError, match="Sample at index 0 validation error"): + SeedDataset(data=invalid_data_missing, min_samples=1) + + invalid_data_type = [ + { + "question": "What is 3+3?", + "rationale": "Addition", + "final_answer": 6, + } + ] + with pytest.raises(ValueError, match="Sample at index 0 validation error"): + SeedDataset(data=invalid_data_type, min_samples=1) + + empty_data = [] + with pytest.raises( + ValueError, match="Dataset must have at least 1 samples, got 0" + ): + SeedDataset(data=empty_data, min_samples=1) + + dataset_empty = SeedDataset(data=empty_data, min_samples=0) + assert ( + len(dataset_empty.data) == 0 + ), "Empty dataset with min_samples=0 should have no items" + + non_dict_data = [ + "Not a dictionary", + { + "question": "What is 4+4?", + "rationale": "Addition", + "final_answer": "8", + }, + ] + with pytest.raises(TypeError, match="Unsupported data type"): + SeedDataset(data=non_dict_data, min_samples=1) + + mixed_data = [ + { + "question": "What is 1+1?", + "rationale": "Addition", + "final_answer": "2", + }, + {"question": "What is 2+2?"}, + ] + with pytest.raises(ValueError, match="Sample at index 1 validation error"): + SeedDataset(data=mixed_data, min_samples=1) def test_seed_dataset_init_json_file(): r"""Test SeedDataset initialization with a JSON file path.""" - # Create temp JSON file + sample_data = [ { "question": "What is 2+2?", @@ -351,9 +515,7 @@ def test_seed_dataset_init_json_file(): with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: json.dump(sample_data, 
temp_file) temp_file.flush() - dataset = SeedDataset(data=temp_file.name, min_samples=1) - assert len(dataset.data) == 2, "Should have 2 items from the JSON file" assert isinstance( dataset.data[0], DataPoint @@ -371,6 +533,80 @@ def test_seed_dataset_init_json_file(): with pytest.raises(json.JSONDecodeError): SeedDataset(data=invalid_file.name, min_samples=1) + invalid_data_missing = [ + { + "question": "What is 2+2?", + "rationale": "Addition", # Missing "final_answer" + } + ] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump(invalid_data_missing, temp_file) + temp_file.flush() + with pytest.raises( + ValueError, match="Sample at index 0 validation error" + ): + SeedDataset(data=temp_file.name, min_samples=1) + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump([], temp_file) + temp_file.flush() + with pytest.raises( + ValueError, match="Dataset must have at least 1 samples, got 0" + ): + SeedDataset(data=temp_file.name, min_samples=1) + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump([], temp_file) + temp_file.flush() + dataset_empty = SeedDataset(data=temp_file.name, min_samples=0) + assert ( + len(dataset_empty.data) == 0 + ), "Empty dataset with min_samples=0 should have no items" + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump({"not": "a list"}, temp_file) + temp_file.flush() + with pytest.raises( + ValueError, match="JSON file must contain a list of dictionaries" + ): + SeedDataset(data=temp_file.name, min_samples=1) + + data_with_optional = [ + { + "question": "What is 5-3?", + "rationale": "Subtraction", + "final_answer": "2", + "difficulty": "easy", + "metadata": {"topic": "math"}, + } + ] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump(data_with_optional, temp_file) + temp_file.flush() + dataset_optional = SeedDataset(data=temp_file.name, min_samples=1) + assert ( + dataset_optional.data[0].difficulty == "easy" + ), "Optional difficulty field should be preserved" + assert dataset_optional.data[0].metadata == { + "topic": "math" + }, "Optional metadata field should be preserved" + + data_with_extra = [ + { + "question": "What is 4+4?", + "rationale": "Addition", + "final_answer": "8", + "extra_field": "should be ignored", + } + ] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as temp_file: + json.dump(data_with_extra, temp_file) + temp_file.flush() + dataset_extra = SeedDataset(data=temp_file.name, min_samples=1) + assert ( + "extra_field" not in dataset_extra.data[0].__dict__ + ), "Extra fields should be ignored" + def test_synthetic_dataset_init(): r"""Test SyntheticDataset initialization.""" From ad1949bf0b5f430ffa975076972f2231cd86e392 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 10:32:51 +0100 Subject: [PATCH 06/16] fix: Change json path handling as Path objects instead of strings and add seed for reproducibility --- camel/datasets/base.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index 480ca94b03..d6561b5f57 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -15,6 +15,7 @@ import json import os import random +from pathlib import Path from typing import ( Any, Callable, @@ -334,8 +335,9 @@ class SeedDataset(Dataset): def __init__( self, - data: Union[HFDataset, Dataset, str, List[Dict[str, Any]]], + data: Union[HFDataset, Dataset, Path, 
List[Dict[str, Any]]], cache_dir: Optional[str] = None, + seed: Optional[int] = None, min_samples: int = 1, **kwargs, ): @@ -346,10 +348,12 @@ def __init__( Input data, which can be: - A Hugging Face Dataset (HFDataset) - A PyTorch Dataset (torch.utils.data.Dataset) - - A string path to a JSON file + - A Path object representing the path to a JSON file - A list of dictionaries with DataPoint-compatible fields cache_dir (Optional[str]): Directory to cache dataset files. (default: :obj:`None`) + seed (Optional[int]): Seed for reproducibility. + (default: :obj:`1`) min_samples (int): Minimum number of samples required. (default: :obj:`1`) **kwargs: Additional dataset parameters. @@ -368,7 +372,7 @@ def __init__( 'cache_dir': self._cache_dir, **kwargs, } - + self._rng = random.Random(seed) # Type checking and conversion into list of dicts if isinstance(data, HFDataset): @@ -378,15 +382,17 @@ def __init__( self._raw_data = [dict(data[i]) for i in range(len(data))] except (TypeError, KeyError, AttributeError) as e: raise TypeError(f"Unsupported PyTorch Dataset: {e}") - elif isinstance(data, str): - if not os.path.exists(data): + + elif isinstance(data, Path): + if not data.exists(): raise FileNotFoundError(f"JSON file not found: {data}") - with open(data, 'r') as f: + with data.open('r') as f: self._raw_data = json.load(f) if not isinstance(self._raw_data, list): raise ValueError( "JSON file must contain a list of dictionaries" ) + elif isinstance(data, list): self._raw_data = data if data is not None else [] else: From 50d079531bc3bc3cc1f13d870642401bb89103d9 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 10:55:05 +0100 Subject: [PATCH 07/16] fix: Move length to init and change sample method to use seed --- camel/datasets/base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index d6561b5f57..a2e1f48da0 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -400,6 +400,7 @@ def __init__( self.data: List[DataPoint] = [] self._setup(min_samples) + self._length = len(self.data) def sample(self) -> DataPoint: r"""Sample a random datapoint from the dataset. @@ -410,9 +411,9 @@ def sample(self) -> DataPoint: Raises: RuntimeError: If the dataset is empty. """ - if not self.data: + if self._length == 0: raise RuntimeError("Dataset is empty, cannot sample.") - idx = random.randint(0, len(self) - 1) + idx = self._rng.randint(0, self._length - 1) return self[idx] def _setup(self, min_samples: int) -> None: @@ -495,7 +496,7 @@ def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: def __len__(self) -> int: r"""Return the size of the dataset.""" - return len(self.data) + return self._length def __getitem__(self, idx: int) -> DataPoint: r"""Get an item from the dataset. @@ -509,7 +510,7 @@ def __getitem__(self, idx: int) -> DataPoint: Raises: IndexError: If idx is out of bounds. 
""" - if idx < 0 or idx >= len(self): + if idx < 0 or idx >= self._length: raise IndexError( f"Index {idx} out of bounds for dataset of size {len(self)}" ) From 79d142900f2f016bf4c3223b2c971880deb1323e Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 11:14:40 +0100 Subject: [PATCH 08/16] feat: Implement strict flag to let user chose between simply skipping invalid datapoints in a seed dataset and throwing an exception --- camel/datasets/base.py | 47 ++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index a2e1f48da0..8187f81f40 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -339,6 +339,7 @@ def __init__( cache_dir: Optional[str] = None, seed: Optional[int] = None, min_samples: int = 1, + strict: bool = False, **kwargs, ): r"""Initialize the seed dataset and validate integrity. @@ -356,6 +357,8 @@ def __init__( (default: :obj:`1`) min_samples (int): Minimum number of samples required. (default: :obj:`1`) + strict (bool): Whether to raise an error on invalid datapoints + (True) or skip/filter them (False). (default: False) **kwargs: Additional dataset parameters. Raises: @@ -373,6 +376,8 @@ def __init__( **kwargs, } self._rng = random.Random(seed) + self._strict = strict + # Type checking and conversion into list of dicts if isinstance(data, HFDataset): @@ -423,14 +428,20 @@ def _setup(self, min_samples: int) -> None: 1. Checks if the dataset meets the minimum sample requirement. 2. Creates the cache directory if specified. 3. Processes raw data into DataPoint objects - for validation and consistency. + for validation and consistency. + + In non-strict mode, invalid datapoints are filtered out + rather than raising an error. Args: min_samples (int): Minimum number of samples required. Raises: - ValueError: If the dataset size is less than - min_samples or if validation fails. + ValueError: If the dataset size is less than min_samples or + if sample validation fails (in strict mode), + or if the dataset size is smaller than + min_samples after filtering invalid datapoints + (in non-strict mode). OSError: If cache directory creation fails. 
""" if len(self._raw_data) < min_samples: @@ -456,22 +467,6 @@ def _setup(self, min_samples: int) -> None: logger.debug("No raw data to process") return - if self._cache_dir: - try: - os.makedirs(self._cache_dir, exist_ok=True) - logger.debug(f"Created cache directory: {self._cache_dir}") - except OSError as e: - logger.error( - f"Failed to create cache directory {self._cache_dir}: {e}" - ) - raise - - if not self._raw_data: - if min_samples > 0: - raise ValueError("No data provided, but min_samples > 0") - logger.debug("No raw data to process") - return - def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: try: return DataPoint( @@ -484,10 +479,18 @@ def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: difficulty=item.get('difficulty', ''), # Match BaseDataset # raw_markdown='' if DataPoint supports it ) + except ValidationError as e: - raise ValueError( - f"Sample at index {idx} validation error: {e}" - ) + if self._strict: + raise ValueError( + f"Sample at index {idx} validation error: {e}" + ) + else: + logger.warning( + f"Skipping invalid sample at index {idx} " + f"due to validation error: {e}" + ) + return None self.data = [ create_datapoint(item, i) for i, item in enumerate(self._raw_data) From 9bd18d4009a8d658bb08b8aa19b57359402d760f Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 11:22:44 +0100 Subject: [PATCH 09/16] fix: Put __len__ and __getitem__ to the top of seed dataset to ensure they are defined before the other functions are --- camel/datasets/base.py | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index 8187f81f40..c6a00f36f3 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -407,6 +407,28 @@ def __init__( self._setup(min_samples) self._length = len(self.data) + def __len__(self) -> int: + r"""Return the size of the dataset.""" + return self._length + + def __getitem__(self, idx: int) -> DataPoint: + r"""Get an item from the dataset. + + Args: + idx (int): Index of the item to get. + + Returns: + DataPoint: DataPoint from the dataset with the given index. + + Raises: + IndexError: If idx is out of bounds. + """ + if idx < 0 or idx >= self._length: + raise IndexError( + f"Index {idx} out of bounds for dataset of size {len(self)}" + ) + return self.data[idx] + def sample(self) -> DataPoint: r"""Sample a random datapoint from the dataset. @@ -497,28 +519,6 @@ def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: ] logger.debug(f"Processed {len(self.data)} data points") - def __len__(self) -> int: - r"""Return the size of the dataset.""" - return self._length - - def __getitem__(self, idx: int) -> DataPoint: - r"""Get an item from the dataset. - - Args: - idx (int): Index of the item to get. - - Returns: - DataPoint: DataPoint from the dataset with the given index. - - Raises: - IndexError: If idx is out of bounds. 
- """ - if idx < 0 or idx >= self._length: - raise IndexError( - f"Index {idx} out of bounds for dataset of size {len(self)}" - ) - return self.data[idx] - @property def metadata(self) -> Dict[str, Any]: r"""Get dataset metadata.""" From 447bf07a5bc6f69f4301071afafbae35efd1dd08 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 18:26:44 +0100 Subject: [PATCH 10/16] fix: Add explanation as to why use list of dicts to store data in Seed Dataset --- camel/datasets/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index c6a00f36f3..59e308934b 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -378,7 +378,9 @@ def __init__( self._rng = random.Random(seed) self._strict = strict - # Type checking and conversion into list of dicts + # Type checking and conversion into list of dicts to have a + # consistent internal format. Since Seed Dataset should be + # small, we can load it entirely into memmory if isinstance(data, HFDataset): self._raw_data = [dict(item) for item in data] From 521f5110abb4209a3abc30f502e98d3526c48981 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 18:30:32 +0100 Subject: [PATCH 11/16] style: Fix code style to adhere to checks --- camel/datasets/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index 59e308934b..b5d074da3c 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -379,7 +379,7 @@ def __init__( self._strict = strict # Type checking and conversion into list of dicts to have a - # consistent internal format. Since Seed Dataset should be + # consistent internal format. Since Seed Dataset should be # small, we can load it entirely into memmory if isinstance(data, HFDataset): From 4162a9aa15bb65a553e69867c9afe713b6393349 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 23:04:32 +0100 Subject: [PATCH 12/16] fix: Update seed init and hf dataset tests to changes in base.py --- test/datasets/test_base_dataset.py | 106 +++++++++++++++++------------ 1 file changed, 63 insertions(+), 43 deletions(-) diff --git a/test/datasets/test_base_dataset.py b/test/datasets/test_base_dataset.py index ac36d7dfdd..ddbc8e93cc 100644 --- a/test/datasets/test_base_dataset.py +++ b/test/datasets/test_base_dataset.py @@ -194,8 +194,7 @@ def test_base_dataset_metadata(): def test_seed_dataset_init(sample_data): - r"""Test SeedDataset initialization with various input types.""" - # Test with list of dictionaries + r"""Test SeedDataset initialization with valid input data.""" dataset = SeedDataset(data=sample_data, min_samples=1) assert dataset._raw_data == sample_data, "Raw data should match input list" assert len(dataset.data) == 2, "Processed data should have 2 items" @@ -205,41 +204,68 @@ def test_seed_dataset_init(sample_data): assert ( dataset.data[0].question == 'What is 2+2?' 
), "DataPoint content should match input" - - # Test min_samples validation with pytest.raises(ValueError) as exc_info: SeedDataset(data=sample_data, min_samples=3) - assert "must have at least 3 samples, got 2" in str( + assert "must have at least 3 samples" in str( exc_info.value ), "Should raise ValueError for insufficient samples" + # Test with an empty dataset when min_samples is 0 dataset_empty = SeedDataset(data=[], min_samples=0) assert len(dataset_empty.data) == 0, "Empty dataset should have no items" +def test_seed_dataset_strict_mode(): + r"""Test SeedDataset in strict mode where + invalid datapoints raise errors.""" + invalid_data = [ + { + "question": "Incomplete sample", + "rationale": "Some reasoning", + } # Missing 'final_answer' + ] + with pytest.raises(ValueError) as exc_info: + # strict=True should raise an error on the first invalid datapoint + SeedDataset(data=invalid_data, min_samples=1, strict=True) + assert "validation error" in str( + exc_info.value + ), "Strict mode should raise ValueError for invalid datapoint" + + +def test_seed_dataset_non_strict_mode(): + r"""Test SeedDataset in non-strict mode where + invalid datapoints are skipped.""" + + invalid_data = [ + {"question": "Incomplete sample", "rationale": "Some reasoning"} + ] + # strict=False should filter out invalid samples + dataset = SeedDataset(data=invalid_data, min_samples=0, strict=False) + # Expect that the invalid sample is skipped, so + # dataset.data should be empty + assert ( + len(dataset.data) == 0 + ), "Non-strict mode should filter out invalid samples" + + def test_seed_dataset_init_hf_dataset(): - r"""Test SeedDataset initialization with a fake IMDB-style - Hugging Face Dataset.""" + r"""Test SeedDataset initialization with a fake IMDB-style Hugging Face Dataset.""" # Mock IMDB-style data mock_imdb_data = [ { - "text": "This movie was absolutely fantastic, " - "a real joy to watch!", + "text": "This movie was absolutely fantastic, a real joy to watch!", "label": 1, - "rationale": "The reviewer uses positive adjectives " - "like 'fantastic' and 'joy'.", + "rationale": "The reviewer uses positive adjectives like 'fantastic' and 'joy'.", }, { "text": "Terrible acting and a boring plot ruined this film.", "label": 0, - "rationale": "Negative terms like 'terrible' and " - "'boring' suggest dissatisfaction.", + "rationale": "Negative terms like 'terrible' and 'boring' suggest dissatisfaction.", }, { "text": "An incredible cast made this a thrilling experience.", "label": 1, - "rationale": "Words like 'incredible' and 'thrilling' " - "reflect a positive reaction.", + "rationale": "Words like 'incredible' and 'thrilling' reflect a positive reaction.", }, ] @@ -247,45 +273,39 @@ def test_seed_dataset_init_hf_dataset(): mapped_dataset = hf_dataset.map( lambda example: { - "question": "What is the sentiment of this review? " - f"{example['text'][:30]}...", + "question": "What is the sentiment of this review? 
" f"{example['text'][:30]}...", "rationale": example["rationale"], - "final_answer": "positive" - if example["label"] == 1 - else "negative", + "final_answer": "positive" if example["label"] == 1 else "negative", } ) - dataset = SeedDataset(data=mapped_dataset, min_samples=1) - assert len(dataset.data) == 3 - assert isinstance(dataset.data[0], DataPoint) - assert dataset.data[0].question == mapped_dataset[0]["question"] - assert dataset.data[0].rationale == mapped_dataset[0]["rationale"] - assert dataset.data[0].final_answer == mapped_dataset[0]["final_answer"] + # Valid data + dataset = SeedDataset(data=mapped_dataset, min_samples=1, strict=True) + assert len(dataset.data) == 3, "There should be 3 valid data points." + assert isinstance(dataset.data[0], DataPoint), "Items should be DataPoint instances." + assert dataset.data[0].question == mapped_dataset[0]["question"], "Question should match input." + assert dataset.data[0].rationale == mapped_dataset[0]["rationale"], "Rationale should match input." + assert dataset.data[0].final_answer == mapped_dataset[0]["final_answer"], "Final answer should match input." + # Invalid data invalid_data_missing = [ { - "question": "What is the sentiment of this review? " - "Missing rationale...", + "question": "What is the sentiment of this review? Missing rationale...", "final_answer": "positive", # Missing "rationale" } ] hf_invalid_missing = HFDataset.from_list(invalid_data_missing) with pytest.raises(ValueError, match="Sample at index 0 validation error"): - SeedDataset(data=hf_invalid_missing, min_samples=1) + SeedDataset(data=hf_invalid_missing, min_samples=1, strict=True) - # Test with empty dataset and min_samples=1 empty_data = [] hf_empty = HFDataset.from_list(empty_data) - with pytest.raises( - ValueError, match="Dataset must have at least 1 samples, got 0" - ): - SeedDataset(data=hf_empty, min_samples=1) + with pytest.raises(ValueError, match="Dataset must have at least 1 samples, got 0"): + SeedDataset(data=hf_empty, min_samples=1, strict=True) - # Test with empty dataset and min_samples=0 - dataset_empty = SeedDataset(data=hf_empty, min_samples=0) - assert len(dataset_empty.data) == 0 + dataset_empty = SeedDataset(data=hf_empty, min_samples=0, strict=True) + assert len(dataset_empty.data) == 0, "Empty dataset should have no valid items." non_dict_data = [ "Not a dictionary", @@ -296,12 +316,11 @@ def test_seed_dataset_init_hf_dataset(): }, ] with pytest.raises(TypeError, match="Unsupported data type"): - SeedDataset(data=non_dict_data, min_samples=1) + SeedDataset(data=non_dict_data, min_samples=1, strict=True) data_with_optional = [ { - "question": "What is the sentiment of this review? " - "This movie was awesome!...", + "question": "What is the sentiment of this review? This movie was awesome!...", "rationale": "Positive sentiment detected.", "final_answer": "positive", "difficulty": "medium", @@ -309,9 +328,10 @@ def test_seed_dataset_init_hf_dataset(): } ] hf_optional = HFDataset.from_list(data_with_optional) - dataset_optional = SeedDataset(data=hf_optional, min_samples=1) - assert dataset_optional.data[0].difficulty == "medium" - assert dataset_optional.data[0].metadata == {"source": "imdb"} + dataset_optional = SeedDataset(data=hf_optional, min_samples=1, strict=True) + assert dataset_optional.data[0].difficulty == "medium", "Difficulty field should be 'medium'." + assert dataset_optional.data[0].metadata == {"source": "imdb"}, "Metadata should match input." 
+ def test_seed_dataset_init_pytorch_dataset(): From 094762ec7df76e2141f16d1d96404b4246839457 Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 23:23:05 +0100 Subject: [PATCH 13/16] style: Fix code style to adhere to code style requirements --- test/datasets/test_base_dataset.py | 71 +++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/test/datasets/test_base_dataset.py b/test/datasets/test_base_dataset.py index ddbc8e93cc..0b38635d84 100644 --- a/test/datasets/test_base_dataset.py +++ b/test/datasets/test_base_dataset.py @@ -216,7 +216,7 @@ def test_seed_dataset_init(sample_data): def test_seed_dataset_strict_mode(): - r"""Test SeedDataset in strict mode where + r"""Test SeedDataset in strict mode where invalid datapoints raise errors.""" invalid_data = [ { @@ -233,7 +233,7 @@ def test_seed_dataset_strict_mode(): def test_seed_dataset_non_strict_mode(): - r"""Test SeedDataset in non-strict mode where + r"""Test SeedDataset in non-strict mode where invalid datapoints are skipped.""" invalid_data = [ @@ -241,7 +241,7 @@ def test_seed_dataset_non_strict_mode(): ] # strict=False should filter out invalid samples dataset = SeedDataset(data=invalid_data, min_samples=0, strict=False) - # Expect that the invalid sample is skipped, so + # Expect that the invalid sample is skipped, so # dataset.data should be empty assert ( len(dataset.data) == 0 @@ -249,23 +249,28 @@ def test_seed_dataset_non_strict_mode(): def test_seed_dataset_init_hf_dataset(): - r"""Test SeedDataset initialization with a fake IMDB-style Hugging Face Dataset.""" + r"""Test SeedDataset initialization with a mock + IMDB-style Hugging Face Dataset.""" # Mock IMDB-style data mock_imdb_data = [ { - "text": "This movie was absolutely fantastic, a real joy to watch!", + "text": "This movie was absolutely fantastic, " + "a real joy to watch!", "label": 1, - "rationale": "The reviewer uses positive adjectives like 'fantastic' and 'joy'.", + "rationale": "The reviewer uses positive adjectives like " + "'fantastic' and 'joy'.", }, { "text": "Terrible acting and a boring plot ruined this film.", "label": 0, - "rationale": "Negative terms like 'terrible' and 'boring' suggest dissatisfaction.", + "rationale": "Negative terms like 'terrible' and 'boring' " + "suggest dissatisfaction.", }, { "text": "An incredible cast made this a thrilling experience.", "label": 1, - "rationale": "Words like 'incredible' and 'thrilling' reflect a positive reaction.", + "rationale": "Words like 'incredible' and 'thrilling' reflect " + "a positive reaction.", }, ] @@ -273,24 +278,36 @@ def test_seed_dataset_init_hf_dataset(): mapped_dataset = hf_dataset.map( lambda example: { - "question": "What is the sentiment of this review? " f"{example['text'][:30]}...", + "question": "What is the sentiment of this review? " + f"{example['text'][:30]}...", "rationale": example["rationale"], - "final_answer": "positive" if example["label"] == 1 else "negative", + "final_answer": "positive" + if example["label"] == 1 + else "negative", } ) # Valid data dataset = SeedDataset(data=mapped_dataset, min_samples=1, strict=True) assert len(dataset.data) == 3, "There should be 3 valid data points." - assert isinstance(dataset.data[0], DataPoint), "Items should be DataPoint instances." - assert dataset.data[0].question == mapped_dataset[0]["question"], "Question should match input." - assert dataset.data[0].rationale == mapped_dataset[0]["rationale"], "Rationale should match input." 
- assert dataset.data[0].final_answer == mapped_dataset[0]["final_answer"], "Final answer should match input." + assert isinstance( + dataset.data[0], DataPoint + ), "Items should be DataPoint instances." + assert ( + dataset.data[0].question == mapped_dataset[0]["question"] + ), "Question should match input." + assert ( + dataset.data[0].rationale == mapped_dataset[0]["rationale"] + ), "Rationale should match input." + assert ( + dataset.data[0].final_answer == mapped_dataset[0]["final_answer"] + ), "Final answer should match input." # Invalid data invalid_data_missing = [ { - "question": "What is the sentiment of this review? Missing rationale...", + "question": "What is the sentiment of this review? " + "Missing rationale...", "final_answer": "positive", # Missing "rationale" } @@ -301,11 +318,15 @@ def test_seed_dataset_init_hf_dataset(): empty_data = [] hf_empty = HFDataset.from_list(empty_data) - with pytest.raises(ValueError, match="Dataset must have at least 1 samples, got 0"): + with pytest.raises( + ValueError, match="Dataset must have at least 1 samples, got 0" + ): SeedDataset(data=hf_empty, min_samples=1, strict=True) dataset_empty = SeedDataset(data=hf_empty, min_samples=0, strict=True) - assert len(dataset_empty.data) == 0, "Empty dataset should have no valid items." + assert ( + len(dataset_empty.data) == 0 + ), "Empty dataset should have no valid items." non_dict_data = [ "Not a dictionary", @@ -320,7 +341,8 @@ def test_seed_dataset_init_hf_dataset(): data_with_optional = [ { - "question": "What is the sentiment of this review? This movie was awesome!...", + "question": "What is the sentiment of this review? " + "This movie was awesome!...", "rationale": "Positive sentiment detected.", "final_answer": "positive", "difficulty": "medium", @@ -328,10 +350,15 @@ def test_seed_dataset_init_hf_dataset(): } ] hf_optional = HFDataset.from_list(data_with_optional) - dataset_optional = SeedDataset(data=hf_optional, min_samples=1, strict=True) - assert dataset_optional.data[0].difficulty == "medium", "Difficulty field should be 'medium'." - assert dataset_optional.data[0].metadata == {"source": "imdb"}, "Metadata should match input." - + dataset_optional = SeedDataset( + data=hf_optional, min_samples=1, strict=True + ) + assert ( + dataset_optional.data[0].difficulty == "medium" + ), "Difficulty field should be 'medium'." + assert dataset_optional.data[0].metadata == { + "source": "imdb" + }, "Metadata should match input." 
def test_seed_dataset_init_pytorch_dataset(): From 5d222c9af764ce321aa57b4c183d54ef7d518ece Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Sun, 9 Mar 2025 23:42:39 +0100 Subject: [PATCH 14/16] fix: Adjust code to utilize self._length in getitem and cast len(data) to a Sized to pass mypy tests --- camel/datasets/base.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index b5d074da3c..df49bfdc5e 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -22,8 +22,10 @@ Dict, List, Optional, + Sized, TypeVar, Union, + cast, ) import torch @@ -386,7 +388,9 @@ def __init__( self._raw_data = [dict(item) for item in data] elif isinstance(data, Dataset): try: - self._raw_data = [dict(data[i]) for i in range(len(data))] + self._raw_data = [ + dict(data[i]) for i in range(len(cast(Sized, data))) + ] except (TypeError, KeyError, AttributeError) as e: raise TypeError(f"Unsupported PyTorch Dataset: {e}") @@ -427,7 +431,7 @@ def __getitem__(self, idx: int) -> DataPoint: """ if idx < 0 or idx >= self._length: raise IndexError( - f"Index {idx} out of bounds for dataset of size {len(self)}" + f"Index {idx} out of bounds for dataset of size {self._length}" ) return self.data[idx] From a7f2f02bcbef961466ae60aaafe8c808efbb58de Mon Sep 17 00:00:00 2001 From: Apokryphosx Date: Mon, 10 Mar 2025 00:51:32 +0100 Subject: [PATCH 15/16] fix: Adjust create datapoint in seed dataset to properly work with strict mode --- camel/datasets/base.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index df49bfdc5e..dc2bfbef52 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -495,7 +495,9 @@ def _setup(self, min_samples: int) -> None: logger.debug("No raw data to process") return - def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: + def create_datapoint( + item: Dict[str, Any], idx: int + ) -> Optional[DataPoint]: try: return DataPoint( question=item.get('question', ''), @@ -520,10 +522,14 @@ def create_datapoint(item: Dict[str, Any], idx: int) -> DataPoint: ) return None - self.data = [ + raw_data = [ create_datapoint(item, i) for i, item in enumerate(self._raw_data) ] - logger.debug(f"Processed {len(self.data)} data points") + self.data = [dp for dp in raw_data if dp is not None] + logger.debug( + f"Processed {len(raw_data)} data points, of which " + f"{len(self.data)} were valid." + ) @property def metadata(self) -> Dict[str, Any]: From 3f6aec08bcbefa68954c126596fa4cfbc4fd2369 Mon Sep 17 00:00:00 2001 From: hallerite Date: Mon, 10 Mar 2025 00:58:40 +0100 Subject: [PATCH 16/16] fix: remove casting --- camel/datasets/base.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/camel/datasets/base.py b/camel/datasets/base.py index dc2bfbef52..30093cc2dc 100644 --- a/camel/datasets/base.py +++ b/camel/datasets/base.py @@ -25,7 +25,6 @@ Sized, TypeVar, Union, - cast, ) import torch @@ -387,13 +386,17 @@ def __init__( if isinstance(data, HFDataset): self._raw_data = [dict(item) for item in data] elif isinstance(data, Dataset): - try: - self._raw_data = [ - dict(data[i]) for i in range(len(cast(Sized, data))) - ] - except (TypeError, KeyError, AttributeError) as e: - raise TypeError(f"Unsupported PyTorch Dataset: {e}") + if not isinstance(data, Sized): + raise TypeError( + f"{type(data).__name__} does not implement `__len__()`." 
+ ) + # Make MyPy happy by ensuring indexability + assert callable( + getattr(data, "__getitem__", None) + ), "Dataset does not support indexing." + + self._raw_data = [dict(data[i]) for i in range(len(data))] elif isinstance(data, Path): if not data.exists(): raise FileNotFoundError(f"JSON file not found: {data}") @@ -403,7 +406,6 @@ def __init__( raise ValueError( "JSON file must contain a list of dictionaries" ) - elif isinstance(data, list): self._raw_data = data if data is not None else [] else:
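
With the series applied, sampling is reproducible whenever a seed is supplied, since the constructor seeds its own random.Random. A closing sketch (illustrative, not part of any patch; it assumes DataPoint, as a pydantic model, compares by field values):

    from camel.datasets.base import SeedDataset

    rows = [
        {"question": f"Q{i}", "rationale": f"R{i}", "final_answer": str(i)}
        for i in range(5)
    ]

    # Identical seeds draw identical sequences of datapoints
    a = SeedDataset(data=rows, seed=42)
    b = SeedDataset(data=rows, seed=42)
    assert a.sample() == b.sample()
    assert a.sample() == b.sample()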