Skip to content

Commit

Permalink
Feature/schema to json (#184)
Browse files Browse the repository at this point in the history
This PR makes the schema serializable to JSON via Object.json()

PR includes feature, tests and documentation

Link to Issue #177
  • Loading branch information
BorisWilhelms authored Jul 3, 2023
1 parent 713c2bf commit f910fc4
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 16 deletions.
2 changes: 1 addition & 1 deletion docs/source/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ untyped_objects
apis
validation
document_extraction
schema_from_json
schema_serialization
guidelines
```

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,9 @@
"id": "4b3a0584-b52c-4873-abb8-8382e13ff5c0",
"metadata": {},
"source": [
"# Schema from JSON\n",
"# Schema serialization\n",
"\n",
"Kor lets you define the schema in JSON. The structure of the JSON matches the struture of the `Object` type.\n",
"\n",
"The following attribute types must be annotated with a type descrimintator (`$type`):\n",
"\n",
"- Number\n",
"- Text\n",
"- Bool\n",
"- Selection"
"A Kor schema can be serialized to JSON and deserialized from JSON. This lets you store the schema outside of the code."
]
},
{
Expand Down Expand Up @@ -49,7 +42,96 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"id": "47a11a37",
"metadata": {},
"outputs": [],
"source": [
"from kor.nodes import Object, Text, Number"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "ac403159",
"metadata": {},
"source": [
"## Serialization\n",
"\n",
"To serialize a schema just call the `json()` method on the schema"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "67cb9713",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"id\": \"personal_info\", \"description\": \"Personal information about a given person.\", \"many\": true, \"attributes\": [{\"id\": \"first_name\", \"description\": \"The first name of the person\", \"many\": false, \"examples\": [[\"John Smith went to the store\", \"John\"]], \"$type\": \"Text\"}, {\"id\": \"last_name\", \"description\": \"The last name of the person\", \"many\": false, \"examples\": [[\"John Smith went to the store\", \"Smith\"]], \"$type\": \"Text\"}, {\"id\": \"age\", \"description\": \"The age of the person in years.\", \"many\": false, \"examples\": [[\"23 years old\", \"23\"], [\"I turned three on sunday\", \"3\"]], \"$type\": \"Number\"}], \"examples\": [[\"John Smith was 23 years old. He was very tall. He knew Jane Doe. She was 5 years old.\", [{\"first_name\": \"John\", \"last_name\": \"Smith\", \"age\": 23}, {\"first_name\": \"Jane\", \"last_name\": \"Doe\", \"age\": 5}]]]}\n"
]
}
],
"source": [
"schema = Object(\n",
" id=\"personal_info\",\n",
" description=\"Personal information about a given person.\",\n",
" attributes=[\n",
" Text(\n",
" id=\"first_name\",\n",
" description=\"The first name of the person\",\n",
" examples=[(\"John Smith went to the store\", \"John\")],\n",
" ),\n",
" Text(\n",
" id=\"last_name\",\n",
" description=\"The last name of the person\",\n",
" examples=[(\"John Smith went to the store\", \"Smith\")],\n",
" ),\n",
" Number(\n",
" id=\"age\",\n",
" description=\"The age of the person in years.\",\n",
" examples=[(\"23 years old\", \"23\"), (\"I turned three on sunday\", \"3\")],\n",
" ),\n",
" ],\n",
" examples=[\n",
" (\n",
" \"John Smith was 23 years old. He was very tall. He knew Jane Doe. She was 5 years old.\",\n",
" [\n",
" {\"first_name\": \"John\", \"last_name\": \"Smith\", \"age\": 23},\n",
" {\"first_name\": \"Jane\", \"last_name\": \"Doe\", \"age\": 5},\n",
" ],\n",
" )\n",
" ],\n",
" many=True,\n",
")\n",
"\n",
"print(schema.json())"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "11712477",
"metadata": {},
"source": [
"## Deserialization\n",
"\n",
"Kor lets you define the schema in JSON. The structure of the JSON matches the structure of the `Object` type.\n",
"\n",
"The following attribute types must be annotated with a type discriminator (`$type`):\n",
"\n",
"- Number\n",
"- Text\n",
"- Bool\n",
"- Selection"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3bd33817",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -92,14 +174,22 @@
"\"\"\""
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "3581b713",
"metadata": {},
"source": [
"To deserialize a schema from JSON simply call the `parse_raw()` method."
]
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 4,
"id": "6088c98a",
"metadata": {},
"outputs": [],
"source": [
"from kor import Object\n",
"schema = Object.parse_raw(json)"
]
},
Expand Down Expand Up @@ -213,7 +303,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
"version": "3.11.4"
}
},
"nbformat": 4,
Expand Down
11 changes: 9 additions & 2 deletions kor/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
# not worth the effort for a v0.
VALID_IDENTIFIER_PATTERN = re.compile(r"^[a-z_][0-9a-z_]*$")

# Name of field to store the type discriminator
TYPE_DISCRIMINATOR_FIELD = "$type"

T = TypeVar("T")


Expand Down Expand Up @@ -130,12 +133,16 @@ class ExtractionSchemaNode(AbstractSchemaNode, abc.ABC):

examples: Sequence[Tuple[str, Union[str, Sequence[str]]]] = tuple()

def __init__(self, **kwargs: Any) -> None:
    """Initialize the node and tag it with its concrete class name.

    Args:
        **kwargs: Field values, forwarded unchanged to the parent initializer.
    """
    super().__init__(**kwargs)
    # Store the concrete class name under TYPE_DISCRIMINATOR_FIELD by writing
    # directly into the instance ``__dict__``. The intent is for the
    # discriminator to show up in the output of ``dict()`` / ``json()`` so a
    # serialized node records which subclass it came from.
    # NOTE(review): presumably this has to run after super().__init__()
    # because the base initializer populates ``__dict__`` - confirm.
    self.__dict__[TYPE_DISCRIMINATOR_FIELD] = type(self).__name__

@classmethod
def parse_obj(cls: Type[ExtractionSchemaNode], data: dict) -> ExtractionSchemaNode:
"""Parse an object."""
type_ = data.pop("$type", None)
type_ = data.pop(TYPE_DISCRIMINATOR_FIELD, None)
if type_ is None:
raise ValueError("Need to specify type ($type)")
raise ValueError(f"Need to specify type ({TYPE_DISCRIMINATOR_FIELD})")
for sub in cls.__subclasses__():
if type_ == sub.__name__:
return sub(**data)
Expand Down
36 changes: 36 additions & 0 deletions tests/test_serialization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from typing import Any

import pytest

from kor import Bool, Number, Object, Text
from kor.nodes import ExtractionSchemaNode


@pytest.fixture(params=ExtractionSchemaNode.__subclasses__())
def extraction_subclass(request: Any) -> Any:
    """Provide one direct subclass of ExtractionSchemaNode per parametrized run."""
    subclass = request.param
    return subclass


def test_extractionschemanode_has_type_discriminator(
    extraction_subclass: Any,
) -> None:
    """Check that dict() exposes the node's class name under the "$type" key."""
    node = extraction_subclass(id="test")
    dumped = node.dict()
    assert dumped["$type"] == extraction_subclass.__name__


def test_serialize_deserialize_equals() -> None:
    """Check that a schema survives a json() -> parse_raw() round trip unchanged."""
    # One attribute of each node type exercised by the round trip.
    attributes = [
        Number(id="number", description="Number description", examples=[]),
        Text(id="text", description="text description", examples=[]),
        Bool(id="bool", description="bool description", examples=[]),
    ]
    expected = Object(
        id="root",
        description="root-object",
        attributes=attributes,
        examples=[],
    )

    # Named ``serialized`` rather than ``json`` so it does not read like the
    # standard-library module.
    serialized = expected.json()
    restored = Object.parse_raw(serialized)

    assert restored == expected

0 comments on commit f910fc4

Please sign in to comment.