diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py
index 1075589cb..78507f625 100644
--- a/python/langsmith/evaluation/_arunner.py
+++ b/python/langsmith/evaluation/_arunner.py
@@ -9,6 +9,8 @@
 import pathlib
 import uuid
 from typing import (
+    TYPE_CHECKING,
+    Any,
     AsyncIterable,
     AsyncIterator,
     Awaitable,
@@ -45,6 +47,7 @@
     _resolve_data,
     _resolve_evaluators,
     _resolve_experiment,
+    _to_pandas,
     _wrap_summary_evaluators,
 )
 from langsmith.evaluation.evaluator import (
@@ -53,6 +56,13 @@
     RunEvaluator,
 )
 
+if TYPE_CHECKING:
+    import pandas as pd
+
+    DataFrame = pd.DataFrame
+else:
+    DataFrame = Any
+
 logger = logging.getLogger(__name__)
 
 ATARGET_T = Callable[[dict], Awaitable[dict]]
@@ -852,6 +862,20 @@ async def _process_data(self, manager: _AsyncExperimentManager) -> None:
         async with self._lock:
             self._summary_results = summary_scores
 
+    def to_pandas(
+        self, start: Optional[int] = 0, end: Optional[int] = None
+    ) -> DataFrame:
+        return _to_pandas(self._results, start=start, end=end)
+
+    def _repr_html_(self) -> str:
+        import importlib.util
+
+        if self._results and importlib.util.find_spec("pandas"):
+            df = self.to_pandas(0, 5)
+            return df._repr_html_()  # type: ignore[operator]
+        else:
+            return self.__repr__()
+
     def __len__(self) -> int:
         return len(self._results)
 
diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py
index d5f6a6fc1..111986b76 100644
--- a/python/langsmith/evaluation/_runner.py
+++ b/python/langsmith/evaluation/_runner.py
@@ -18,6 +18,8 @@
 import uuid
 from contextvars import copy_context
 from typing import (
+    TYPE_CHECKING,
+    Any,
     Awaitable,
     Callable,
     DefaultDict,
@@ -54,6 +56,12 @@
 )
 from langsmith.evaluation.integrations import LangChainStringEvaluator
 
+if TYPE_CHECKING:
+    import pandas as pd
+
+    DataFrame = pd.DataFrame
+else:
+    DataFrame = Any
 logger = logging.getLogger(__name__)
 
 TARGET_T = Callable[[dict], dict]
@@ -452,6 +460,20 @@ def _process_data(self) -> None:
     def __len__(self) -> int:
         return len(self._results)
 
+    def to_pandas(
+        self, start: Optional[int] = 0, end: Optional[int] = None
+    ) -> DataFrame:
+        return _to_pandas(self._results, start=start, end=end)
+
+    def _repr_html_(self) -> str:
+        import importlib.util
+
+        if self._results and importlib.util.find_spec("pandas"):
+            df = self.to_pandas()
+            return df._repr_html_()  # type: ignore[operator]
+        else:
+            return self.__repr__()
+
     def __repr__(self) -> str:
         return f"<ExperimentResults {self.experiment_name}>"
 
@@ -1853,3 +1875,51 @@ def extract_evaluation_results_keys(node, variables):
     except SyntaxError:
         return []
+
+
+def _to_pandas(
+    results: list[ExperimentResultRow],
+    start: Optional[int] = 0,
+    end: Optional[int] = None,
+):
+    try:
+        import pandas as pd
+    except ImportError as e:
+        raise ImportError(
+            "The 'pandas' library is required to use the 'to_pandas' function. "
+            "Please install it using 'pip install pandas' or "
+            "'conda install pandas' before calling this method."
+        ) from e
+
+    return pd.DataFrame(_flatten_experiment_results(results, start=start, end=end))
+
+
+def _flatten_experiment_results(
+    results: list[ExperimentResultRow],
+    start: Optional[int] = 0,
+    end: Optional[int] = None,
+):
+    return [
+        {
+            **{f"inputs.{k}": v for k, v in x["example"].inputs.items()},
+            **{f"outputs.{k}": v for k, v in (x["run"].outputs or {}).items()},
+            "error": x["run"].error,
+            **(
+                {f"reference.{k}": v for k, v in x["example"].outputs.items()}
+                if x["example"].outputs is not None
+                else {}
+            ),
+            **{
+                f"feedback.{r.key}": r.score if r.score is not None else r.value
+                for r in x["evaluation_results"]["results"]
+            },
+            "execution_time": (
+                (x["run"].end_time - x["run"].start_time).total_seconds()
+                if x["run"].end_time
+                else None
+            ),
+            "example_id": x["run"].reference_example_id,
+            "id": x["run"].id,
+        }
+        for x in results[start:end]
+    ]
diff --git a/python/tests/evaluation/test_evaluation.py b/python/tests/evaluation/test_evaluation.py
index 87ca42ac5..432a7df89 100644
--- a/python/tests/evaluation/test_evaluation.py
+++ b/python/tests/evaluation/test_evaluation.py
@@ -1,4 +1,5 @@
 import asyncio
+import functools
 import logging
 import time
 from contextlib import contextmanager
@@ -189,6 +190,17 @@ async def apredict(inputs: dict):
     check_results([res async for res in async_results])
 
 
+@functools.lru_cache(maxsize=1)
+def _has_pandas() -> bool:
+    try:
+        import pandas  # noqa
+
+        return True
+
+    except Exception:
+        return False
+
+
 def test_evaluate():
     client = Client()
     _ = client.clone_public_dataset(
@@ -213,7 +225,7 @@ def predict(inputs: dict) -> dict:
 
     results = evaluate(
         predict,
-        data=dataset_name,
+        data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
         evaluators=[accuracy],
         summary_evaluators=[precision],
         description="My sync experiment",
@@ -224,14 +236,28 @@ def predict(inputs: dict) -> dict:
         num_repetitions=3,
     )
     assert len(results) == 30
-    examples = client.list_examples(dataset_name=dataset_name)
+    if _has_pandas():
+        df = results.to_pandas()
+        assert len(df) == 30
+        assert set(df.columns) == {
+            "inputs.context",
+            "inputs.question",
+            "outputs.output",
+            "error",
+            "reference.answer",
+            "feedback.accuracy",
+            "execution_time",
+            "example_id",
+            "id",
+        }
+    examples = client.list_examples(dataset_name=dataset_name, as_of="test_version")
     for example in examples:
         assert len([r for r in results if r["example"].id == example.id]) == 3
 
     # Run it again with the existing project
     results2 = evaluate(
         predict,
-        data=dataset_name,
+        data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
         evaluators=[accuracy],
         summary_evaluators=[precision],
         experiment=results.experiment_name,
@@ -242,7 +268,7 @@ def predict(inputs: dict) -> dict:
     experiment = client.read_project(project_name=results.experiment_name)
     results3 = evaluate(
         predict,
-        data=dataset_name,
+        data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
         evaluators=[accuracy],
         summary_evaluators=[precision],
         experiment=experiment,
@@ -252,7 +278,7 @@ def predict(inputs: dict) -> dict:
     # ... and again with the ID
     results4 = evaluate(
         predict,
-        data=dataset_name,
+        data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
         evaluators=[accuracy],
         summary_evaluators=[precision],
         experiment=str(experiment.id),
@@ -260,7 +286,6 @@ def predict(inputs: dict) -> dict:
     assert len(results4) == 10
 
 
-@pytest.mark.skip(reason="Skipping this test for now. Should remove in the future.")
 async def test_aevaluate():
     client = Client()
     _ = client.clone_public_dataset(
@@ -292,7 +317,7 @@ async def apredict(inputs: dict) -> dict:
 
     results = await aevaluate(
         apredict,
-        data=dataset_name,
+        data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
         evaluators=[accuracy, slow_accuracy],
         summary_evaluators=[precision],
         experiment_prefix="My Experiment",
@@ -304,7 +329,9 @@ async def apredict(inputs: dict) -> dict:
         num_repetitions=2,
     )
     assert len(results) == 20
-    examples = client.list_examples(dataset_name=dataset_name)
+    df = results.to_pandas()
+    assert len(df) == 20
+    examples = client.list_examples(dataset_name=dataset_name, as_of="test_version")
     all_results = [r async for r in results]
     all_examples = []
     for example in examples:
@@ -334,7 +361,7 @@ def check_run_count():
     # Run it again with the existing project
     results2 = await aevaluate(
         apredict,
-        data=dataset_name,
+        data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
         evaluators=[accuracy],
         summary_evaluators=[precision],
         experiment=results.experiment_name,
@@ -345,7 +372,7 @@ def check_run_count():
     experiment = client.read_project(project_name=results.experiment_name)
     results3 = await aevaluate(
         apredict,
-        data=dataset_name,
+        data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
         evaluators=[accuracy],
         summary_evaluators=[precision],
         experiment=experiment,
@@ -355,7 +382,7 @@ def check_run_count():
     # ... and again with the ID
     results4 = await aevaluate(
         apredict,
-        data=dataset_name,
+        data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
         evaluators=[accuracy],
         summary_evaluators=[precision],
         experiment=str(experiment.id),
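
For reference, a minimal usage sketch of the API this diff adds. Only to_pandas(start, end), _repr_html_, and the flattened column layout come from the change itself; the dataset name, target function, and evaluator below are illustrative assumptions, and running it requires langsmith credentials, an existing dataset, and pandas installed.

# Illustrative sketch only: the dataset name, target, and evaluator are assumptions.
from langsmith import Client
from langsmith.evaluation import evaluate

client = Client()


def predict(inputs: dict) -> dict:
    # Placeholder target; a real target would call a model or chain.
    return {"output": "Yes"}


def accuracy(run, example):
    # Toy evaluator comparing the run output against the reference answer.
    return {
        "key": "accuracy",
        "score": int(run.outputs["output"] == example.outputs["answer"]),
    }


results = evaluate(
    predict,
    data=client.list_examples(dataset_name="Evaluate Examples"),
    evaluators=[accuracy],
)

# New in this diff: flatten the experiment into a DataFrame with
# inputs.*, outputs.*, reference.*, feedback.*, error, execution_time,
# example_id, and id columns.
df = results.to_pandas()
preview = results.to_pandas(0, 5)  # optional slice; the async _repr_html_ uses this

Note that pandas remains an optional dependency: _to_pandas imports it lazily and raises a descriptive ImportError if it is missing, while _repr_html_ falls back to __repr__ when pandas is not installed.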