[Python] to_pandas() method for ExperimentResults (#1170)
![Screenshot 2024-11-04 at 2 56 09 PM](https://github.com/user-attachments/assets/eb19018c-eb9e-447a-a454-6471fad7d122)
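For context, a minimal usage sketch of the new `to_pandas()` API (not part of this commit): it assumes a configured LangSmith client with `LANGSMITH_API_KEY` set, a hypothetical dataset named `"my-dataset"`, a toy target function, and a toy `accuracy` evaluator, and it requires `pandas` to be installed.

```python
# Minimal sketch of the new ExperimentResults.to_pandas() method.
# Assumptions (not from this commit): a dataset named "my-dataset" exists,
# LANGSMITH_API_KEY is configured, and pandas is installed.
from langsmith.evaluation import evaluate


def predict(inputs: dict) -> dict:
    # Hypothetical target function; substitute your chain or model call.
    return {"output": inputs["question"]}


def accuracy(run, example) -> dict:
    # Hypothetical row-level evaluator.
    score = float(run.outputs["output"] == example.outputs["answer"])
    return {"key": "accuracy", "score": score}


results = evaluate(
    predict,
    data="my-dataset",
    evaluators=[accuracy],
)

# New in this commit: flatten the result rows into a pandas DataFrame.
# Columns follow the pattern inputs.*, outputs.*, error, reference.*,
# feedback.*, execution_time, example_id, id.
df = results.to_pandas()  # optional start/end arguments slice the rows
print(df.head())
```

The `_repr_html_` hook added in the same change means a DataFrame preview of the first rows also renders automatically when `results` is the last expression in a notebook cell and pandas is available.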

---------

Co-authored-by: William FH <[email protected]>
baskaryan and hinthornw authored Nov 5, 2024
1 parent 0d42ba6 commit 0a4fb6d
Showing 3 changed files with 132 additions and 11 deletions.
24 changes: 24 additions & 0 deletions python/langsmith/evaluation/_arunner.py
@@ -9,6 +9,8 @@
import pathlib
import uuid
from typing import (
TYPE_CHECKING,
Any,
AsyncIterable,
AsyncIterator,
Awaitable,
@@ -45,6 +47,7 @@
_resolve_data,
_resolve_evaluators,
_resolve_experiment,
_to_pandas,
_wrap_summary_evaluators,
)
from langsmith.evaluation.evaluator import (
@@ -53,6 +56,13 @@
RunEvaluator,
)

if TYPE_CHECKING:
import pandas as pd

DataFrame = pd.DataFrame
else:
DataFrame = Any

logger = logging.getLogger(__name__)

ATARGET_T = Callable[[dict], Awaitable[dict]]
@@ -852,6 +862,20 @@ async def _process_data(self, manager: _AsyncExperimentManager) -> None:
async with self._lock:
self._summary_results = summary_scores

def to_pandas(
self, start: Optional[int] = 0, end: Optional[int] = None
) -> DataFrame:
return _to_pandas(self._results, start=start, end=end)

def _repr_html_(self) -> str:
import importlib.util

if self._results and importlib.util.find_spec("pandas"):
df = self.to_pandas(0, 5)
return df._repr_html_() # type: ignore[operator]
else:
return self.__repr__()

def __len__(self) -> int:
return len(self._results)

70 changes: 70 additions & 0 deletions python/langsmith/evaluation/_runner.py
@@ -18,6 +18,8 @@
import uuid
from contextvars import copy_context
from typing import (
TYPE_CHECKING,
Any,
Awaitable,
Callable,
DefaultDict,
@@ -54,6 +56,12 @@
)
from langsmith.evaluation.integrations import LangChainStringEvaluator

if TYPE_CHECKING:
import pandas as pd

DataFrame = pd.DataFrame
else:
DataFrame = Any
logger = logging.getLogger(__name__)

TARGET_T = Callable[[dict], dict]
@@ -452,6 +460,20 @@ def _process_data(self) -> None:
def __len__(self) -> int:
return len(self._results)

def to_pandas(
self, start: Optional[int] = 0, end: Optional[int] = None
) -> DataFrame:
return _to_pandas(self._results, start=start, end=end)

def _repr_html_(self) -> str:
import importlib.util

if self._results and importlib.util.find_spec("pandas"):
df = self.to_pandas()
return df._repr_html_() # type: ignore[operator]
else:
return self.__repr__()

def __repr__(self) -> str:
return f"<ExperimentResults {self.experiment_name}>"

@@ -1853,3 +1875,51 @@ def extract_evaluation_results_keys(node, variables):

except SyntaxError:
return []


def _to_pandas(
results: list[ExperimentResultRow],
start: Optional[int] = 0,
end: Optional[int] = None,
):
try:
import pandas as pd
except ImportError as e:
raise ImportError(
"The 'pandas' library is required to use the 'to_pandas' function. "
"Please install it using 'pip install pandas' or "
"'conda install pandas' before calling this method."
) from e

return pd.DataFrame(_flatten_experiment_results(results, start=start, end=end))


def _flatten_experiment_results(
results: list[ExperimentResultRow],
start: Optional[int] = 0,
end: Optional[int] = None,
):
return [
{
**{f"inputs.{k}": v for k, v in x["example"].inputs.items()},
**{f"outputs.{k}": v for k, v in (x["run"].outputs or {}).items()},
"error": x["run"].error,
**(
{f"reference.{k}": v for k, v in x["example"].outputs.items()}
if x["example"].outputs is not None
else {}
),
**{
f"feedback.{r.key}": r.score if r.score is not None else r.value
for r in x["evaluation_results"]["results"]
},
"execution_time": (
(x["run"].end_time - x["run"].start_time).total_seconds()
if x["run"].end_time
else None
),
"example_id": x["run"].reference_example_id,
"id": x["run"].id,
}
for x in results[start:end]
]
49 changes: 38 additions & 11 deletions python/tests/evaluation/test_evaluation.py
@@ -1,4 +1,5 @@
import asyncio
import functools
import logging
import time
from contextlib import contextmanager
@@ -189,6 +190,17 @@ async def apredict(inputs: dict):
check_results([res async for res in async_results])


@functools.lru_cache(maxsize=1)
def _has_pandas() -> bool:
try:
import pandas # noqa

return True

except Exception:
return False


def test_evaluate():
client = Client()
_ = client.clone_public_dataset(
@@ -213,7 +225,7 @@ def predict(inputs: dict) -> dict:

results = evaluate(
predict,
data=dataset_name,
data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
evaluators=[accuracy],
summary_evaluators=[precision],
description="My sync experiment",
@@ -224,14 +236,28 @@ def predict(inputs: dict) -> dict:
num_repetitions=3,
)
assert len(results) == 30
examples = client.list_examples(dataset_name=dataset_name)
if _has_pandas():
df = results.to_pandas()
assert len(df) == 30
assert set(df.columns) == {
"inputs.context",
"inputs.question",
"outputs.output",
"error",
"reference.answer",
"feedback.accuracy",
"execution_time",
"example_id",
"id",
}
examples = client.list_examples(dataset_name=dataset_name, as_of="test_version")
for example in examples:
assert len([r for r in results if r["example"].id == example.id]) == 3

# Run it again with the existing project
results2 = evaluate(
predict,
data=dataset_name,
data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=results.experiment_name,
@@ -242,7 +268,7 @@ def predict(inputs: dict) -> dict:
experiment = client.read_project(project_name=results.experiment_name)
results3 = evaluate(
predict,
data=dataset_name,
data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=experiment,
@@ -252,15 +278,14 @@ def predict(inputs: dict) -> dict:
# ... and again with the ID
results4 = evaluate(
predict,
data=dataset_name,
data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=str(experiment.id),
)
assert len(results4) == 10


@pytest.mark.skip(reason="Skipping this test for now. Should remove in the future.")
async def test_aevaluate():
client = Client()
_ = client.clone_public_dataset(
@@ -292,7 +317,7 @@ async def apredict(inputs: dict) -> dict:

results = await aevaluate(
apredict,
data=dataset_name,
data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
evaluators=[accuracy, slow_accuracy],
summary_evaluators=[precision],
experiment_prefix="My Experiment",
@@ -304,7 +329,9 @@ async def apredict(inputs: dict) -> dict:
num_repetitions=2,
)
assert len(results) == 20
examples = client.list_examples(dataset_name=dataset_name)
df = results.to_pandas()
assert len(df) == 20
examples = client.list_examples(dataset_name=dataset_name, as_of="test_version")
all_results = [r async for r in results]
all_examples = []
for example in examples:
@@ -334,7 +361,7 @@ def check_run_count():
# Run it again with the existing project
results2 = await aevaluate(
apredict,
data=dataset_name,
data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=results.experiment_name,
@@ -345,7 +372,7 @@ def check_run_count():
experiment = client.read_project(project_name=results.experiment_name)
results3 = await aevaluate(
apredict,
data=dataset_name,
data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=experiment,
@@ -355,7 +382,7 @@ def check_run_count():
# ... and again with the ID
results4 = await aevaluate(
apredict,
data=dataset_name,
data=client.list_examples(dataset_name=dataset_name, as_of="test_version"),
evaluators=[accuracy],
summary_evaluators=[precision],
experiment=str(experiment.id),
