Skip to content

Commit

Permalink
Make pydantic model serialization consistent regardless of surrogates. (#1405)
Browse files Browse the repository at this point in the history

Without this code, Pydantic models containing surrogates get serialized
differently than models that don't contain surrogates. This leads to a
less smooth user experience in LangSmith for users whose data contains
surrogates.

With this fix, Pydantic models and other tricky Python data types are
always serialized in the same way, regardless of whether they contain
surrogates or not.
  • Loading branch information
obi1kenobi authored Jan 10, 2025
1 parent c36cf67 commit 843d55d
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 1 deletion.
2 changes: 1 addition & 1 deletion python/langsmith/_internal/_serde.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def dumps_json(obj: Any) -> bytes:
logger.debug(f"Orjson serialization failed: {repr(e)}. Falling back to json.")
result = json.dumps(
obj,
default=_simple_default,
default=_serialize_json,
ensure_ascii=True,
).encode("utf-8")
try:
Expand Down
32 changes: 32 additions & 0 deletions python/tests/integration_tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from pydantic import BaseModel
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor

from langsmith._internal._serde import dumps_json
from langsmith.client import ID_TYPE, Client
from langsmith.evaluation import aevaluate, evaluate
from langsmith.schemas import (
Expand Down Expand Up @@ -1155,6 +1156,37 @@ def test_surrogates():
)


def test_fallback_json_serialization():
    """Check ``dumps_json`` output for surrogate-bearing strings and models.

    Each case pairs an input string with the text expected inside the JSON
    output. Every input is serialized twice: once as a bare string, where the
    output must be the expected text wrapped in double quotes, and once as the
    ``content`` field of a small pydantic model, where the output must be the
    compact object ``{"content":"<expected>"}``.
    """

    class Document(BaseModel):
        content: str

    # (input text, text expected between the quotes of the JSON output)
    cases = [
        ("Hello\ud83d\ude00", "Hello😀"),
        ("Python\ud83d\udc0d", "Python🐍"),
        ("Surrogate\ud834\udd1e", "Surrogate𝄞"),
        ("Example\ud83c\udf89", "Example🎉"),
        ("String\ud83c\udfa7", "String🎧"),
        ("With\ud83c\udf08", "With🌈"),
        ("Surrogates\ud83d\ude0e", "Surrogates😎"),
        ("Embedded\ud83d\udcbb", "Embedded💻"),
        ("In\ud83c\udf0e", "In🌎"),
        ("The\ud83d\udcd6", "The📖"),
        ("Text\ud83d\udcac", "Text💬"),
        ("收花🙄·到", "收花🙄·到"),
    ]

    # Build every model up front, before any assertion runs.
    documents = [Document(content=text) for text, _ in cases]

    # Bare strings: output is the expected text in double quotes.
    for text, want in cases:
        rendered = dumps_json(text).decode("utf8")
        assert f'"{want}"' == rendered

    # Pydantic models: output is a single-key JSON object.
    for document, (_, want) in zip(documents, cases):
        rendered = dumps_json(document).decode("utf8")
        assert f'{{"content":"{want}"}}' == rendered


def test_runs_stats():
langchain_client = Client()
# We always have stuff in the "default" project...
Expand Down

0 comments on commit 843d55d

Please sign in to comment.