From f910fc492415dbc0d2e011e9ff4780da121b322f Mon Sep 17 00:00:00 2001 From: Boris Wilhelms Date: Mon, 3 Jul 2023 18:05:05 +0200 Subject: [PATCH] Feature/schema to json (#184) This PR makes the schema serializable to JSON via Object.json() PR includes feature, tests and documentation Link to Issue #177 --- docs/source/index.md | 2 +- ..._json.ipynb => schema_serialization.ipynb} | 116 ++++++++++++++++-- kor/nodes.py | 11 +- tests/test_serialization.py | 36 ++++++ 4 files changed, 149 insertions(+), 16 deletions(-) rename docs/source/{schema_from_json.ipynb => schema_serialization.ipynb} (61%) create mode 100644 tests/test_serialization.py diff --git a/docs/source/index.md b/docs/source/index.md index de5f130..a282678 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -194,7 +194,7 @@ untyped_objects apis validation document_extraction -schema_from_json +schema_serialization guidelines ``` diff --git a/docs/source/schema_from_json.ipynb b/docs/source/schema_serialization.ipynb similarity index 61% rename from docs/source/schema_from_json.ipynb rename to docs/source/schema_serialization.ipynb index fb000e1..1872414 100644 --- a/docs/source/schema_from_json.ipynb +++ b/docs/source/schema_serialization.ipynb @@ -6,16 +6,9 @@ "id": "4b3a0584-b52c-4873-abb8-8382e13ff5c0", "metadata": {}, "source": [ - "# Schema from JSON\n", + "# Schema serialization\n", "\n", - "Kor lets you define the schema in JSON. The structure of the JSON matches the struture of the `Object` type.\n", - "\n", - "The following attribute types must be annotated with a type descrimintator (`$type`):\n", - "\n", - "- Number\n", - "- Text\n", - "- Bool\n", - "- Selection" + "A Kor schema can be serialized and deserialized to JSON. This lets you store the schema outside of the code." 
] }, { @@ -49,7 +42,96 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, + "id": "47a11a37", + "metadata": {}, + "outputs": [], + "source": [ + "from kor.nodes import Object, Text, Number" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ac403159", + "metadata": {}, + "source": [ + "## Serialization\n", + "\n", + "To serialize a schema just call the `json()` method on the schema" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "67cb9713", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"id\": \"personal_info\", \"description\": \"Personal information about a given person.\", \"many\": true, \"attributes\": [{\"id\": \"first_name\", \"description\": \"The first name of the person\", \"many\": false, \"examples\": [[\"John Smith went to the store\", \"John\"]], \"$type\": \"Text\"}, {\"id\": \"last_name\", \"description\": \"The last name of the person\", \"many\": false, \"examples\": [[\"John Smith went to the store\", \"Smith\"]], \"$type\": \"Text\"}, {\"id\": \"age\", \"description\": \"The age of the person in years.\", \"many\": false, \"examples\": [[\"23 years old\", \"23\"], [\"I turned three on sunday\", \"3\"]], \"$type\": \"Number\"}], \"examples\": [[\"John Smith was 23 years old. He was very tall. He knew Jane Doe. 
She was 5 years old.\", [{\"first_name\": \"John\", \"last_name\": \"Smith\", \"age\": 23}, {\"first_name\": \"Jane\", \"last_name\": \"Doe\", \"age\": 5}]]]}\n" + ] + } + ], + "source": [ + "schema = Object(\n", + " id=\"personal_info\",\n", + " description=\"Personal information about a given person.\",\n", + " attributes=[\n", + " Text(\n", + " id=\"first_name\",\n", + " description=\"The first name of the person\",\n", + " examples=[(\"John Smith went to the store\", \"John\")],\n", + " ),\n", + " Text(\n", + " id=\"last_name\",\n", + " description=\"The last name of the person\",\n", + " examples=[(\"John Smith went to the store\", \"Smith\")],\n", + " ),\n", + " Number(\n", + " id=\"age\",\n", + " description=\"The age of the person in years.\",\n", + " examples=[(\"23 years old\", \"23\"), (\"I turned three on sunday\", \"3\")],\n", + " ),\n", + " ],\n", + " examples=[\n", + " (\n", + " \"John Smith was 23 years old. He was very tall. He knew Jane Doe. She was 5 years old.\",\n", + " [\n", + " {\"first_name\": \"John\", \"last_name\": \"Smith\", \"age\": 23},\n", + " {\"first_name\": \"Jane\", \"last_name\": \"Doe\", \"age\": 5},\n", + " ],\n", + " )\n", + " ],\n", + " many=True,\n", + ")\n", + "\n", + "print(schema.json())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "11712477", + "metadata": {}, + "source": [ + "## Deserialization\n", + "\n", + "Kor lets you define the schema in JSON. 
The structure of the JSON matches the structure of the `Object` type.\n", + "\n", + "The following attribute types must be annotated with a type discriminator (`$type`):\n", + "\n", + "- Number\n", + "- Text\n", + "- Bool\n", + "- Selection" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "3bd33817", "metadata": {}, "outputs": [], @@ -92,14 +174,22 @@ "\"\"\"" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3581b713", + "metadata": {}, + "source": [ + "To deserialize a schema from JSON simply call the `parse_raw()` method." + ] + }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 4, "id": "6088c98a", "metadata": {}, "outputs": [], "source": [ - "from kor import Object\n", "schema = Object.parse_raw(json)" ] }, @@ -213,7 +303,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/kor/nodes.py b/kor/nodes.py index 613603a..4671317 100644 --- a/kor/nodes.py +++ b/kor/nodes.py @@ -27,6 +27,9 @@ # not worth the effort for a v0. 
VALID_IDENTIFIER_PATTERN = re.compile(r"^[a-z_][0-9a-z_]*$") +# Name of field to store the type discriminator +TYPE_DISCRIMINATOR_FIELD = "$type" + T = TypeVar("T") @@ -130,12 +133,16 @@ class ExtractionSchemaNode(AbstractSchemaNode, abc.ABC): examples: Sequence[Tuple[str, Union[str, Sequence[str]]]] = tuple() + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self.__dict__[TYPE_DISCRIMINATOR_FIELD] = type(self).__name__ + @classmethod def parse_obj(cls: Type[ExtractionSchemaNode], data: dict) -> ExtractionSchemaNode: """Parse an object.""" - type_ = data.pop("$type", None) + type_ = data.pop(TYPE_DISCRIMINATOR_FIELD, None) if type_ is None: - raise ValueError("Need to specify type ($type)") + raise ValueError(f"Need to specify type ({TYPE_DISCRIMINATOR_FIELD})") for sub in cls.__subclasses__(): if type_ == sub.__name__: return sub(**data) diff --git a/tests/test_serialization.py b/tests/test_serialization.py new file mode 100644 index 0000000..0d189d4 --- /dev/null +++ b/tests/test_serialization.py @@ -0,0 +1,36 @@ +from typing import Any + +import pytest + +from kor import Bool, Number, Object, Text +from kor.nodes import ExtractionSchemaNode + + +@pytest.fixture(params=ExtractionSchemaNode.__subclasses__()) +def extraction_subclass(request: Any) -> Any: + return request.param + + +def test_extractionschemanode_has_type_discriminator( + extraction_subclass: Any, +) -> None: + sut = extraction_subclass(id="test") + assert sut.dict()["$type"] == extraction_subclass.__name__ + + +def test_serialize_deserialize_equals() -> None: + expected = Object( + id="root", + description="root-object", + attributes=[ + Number(id="number", description="Number description", examples=[]), + Text(id="text", description="text description", examples=[]), + Bool(id="bool", description="bool description", examples=[]), + ], + examples=[], + ) + + json = expected.json() + sut = Object.parse_raw(json) + + assert sut == expected