Feature/schema to json (#184)

This PR makes the schema serializable to JSON via `Object.json()`. It includes the feature, tests, and documentation. Linked issue: #177.
eyurtsev · Jul 3, 2023 · f910fc4 · f910fc4
1 parent 713c2bf
commit f910fc4
Show file tree

Hide file tree

Showing 4 changed files with 149 additions and 16 deletions.
diff --git a/docs/source/index.md b/docs/source/index.md
@@ -194,7 +194,7 @@ untyped_objects
 apis
 validation
 document_extraction
-schema_from_json
+schema_serialization
 guidelines
 ```
 

diff --git a/docs/source/schema_from_json.ipynb → docs/source/schema_serialization.ipynb b/docs/source/schema_from_json.ipynb → docs/source/schema_serialization.ipynb
@@ -6,16 +6,9 @@
    "id": "4b3a0584-b52c-4873-abb8-8382e13ff5c0",
    "metadata": {},
    "source": [
-    "# Schema from JSON\n",
+    "# Schema serialization\n",
     "\n",
-    "Kor lets you define the schema in JSON. The structure of the JSON matches the struture of the `Object` type.\n",
-    "\n",
-    "The following attribute types must be annotated with a type descrimintator (`$type`):\n",
-    "\n",
-    "- Number\n",
-    "- Text\n",
-    "- Bool\n",
-    "- Selection"
+    "A Kor schema can be serialized to JSON and deserialized from it. This lets you store the schema outside of the code."
    ]
   },
   {
@@ -49,7 +42,96 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
+   "id": "47a11a37",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from kor.nodes import Object, Text, Number"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "ac403159",
+   "metadata": {},
+   "source": [
+    "## Serialization\n",
+    "\n",
+    "To serialize a schema just call the `json()` method on the schema"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "67cb9713",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\"id\": \"personal_info\", \"description\": \"Personal information about a given person.\", \"many\": true, \"attributes\": [{\"id\": \"first_name\", \"description\": \"The first name of the person\", \"many\": false, \"examples\": [[\"John Smith went to the store\", \"John\"]], \"$type\": \"Text\"}, {\"id\": \"last_name\", \"description\": \"The last name of the person\", \"many\": false, \"examples\": [[\"John Smith went to the store\", \"Smith\"]], \"$type\": \"Text\"}, {\"id\": \"age\", \"description\": \"The age of the person in years.\", \"many\": false, \"examples\": [[\"23 years old\", \"23\"], [\"I turned three on sunday\", \"3\"]], \"$type\": \"Number\"}], \"examples\": [[\"John Smith was 23 years old. He was very tall. He knew Jane Doe. She was 5 years old.\", [{\"first_name\": \"John\", \"last_name\": \"Smith\", \"age\": 23}, {\"first_name\": \"Jane\", \"last_name\": \"Doe\", \"age\": 5}]]]}\n"
+     ]
+    }
+   ],
+   "source": [
+    "schema = Object(\n",
+    "    id=\"personal_info\",\n",
+    "    description=\"Personal information about a given person.\",\n",
+    "    attributes=[\n",
+    "        Text(\n",
+    "            id=\"first_name\",\n",
+    "            description=\"The first name of the person\",\n",
+    "            examples=[(\"John Smith went to the store\", \"John\")],\n",
+    "        ),\n",
+    "        Text(\n",
+    "            id=\"last_name\",\n",
+    "            description=\"The last name of the person\",\n",
+    "            examples=[(\"John Smith went to the store\", \"Smith\")],\n",
+    "        ),\n",
+    "        Number(\n",
+    "            id=\"age\",\n",
+    "            description=\"The age of the person in years.\",\n",
+    "            examples=[(\"23 years old\", \"23\"), (\"I turned three on sunday\", \"3\")],\n",
+    "        ),\n",
+    "    ],\n",
+    "    examples=[\n",
+    "        (\n",
+    "            \"John Smith was 23 years old. He was very tall. He knew Jane Doe. She was 5 years old.\",\n",
+    "            [\n",
+    "                {\"first_name\": \"John\", \"last_name\": \"Smith\", \"age\": 23},\n",
+    "                {\"first_name\": \"Jane\", \"last_name\": \"Doe\", \"age\": 5},\n",
+    "            ],\n",
+    "        )\n",
+    "    ],\n",
+    "    many=True,\n",
+    ")\n",
+    "\n",
+    "print(schema.json())"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "11712477",
+   "metadata": {},
+   "source": [
+    "## Deserialization\n",
+    "\n",
+    "Kor lets you define the schema in JSON. The structure of the JSON matches the structure of the `Object` type.\n",
+    "\n",
+    "The following attribute types must be annotated with a type discriminator (`$type`):\n",
+    "\n",
+    "- Number\n",
+    "- Text\n",
+    "- Bool\n",
+    "- Selection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
    "id": "3bd33817",
    "metadata": {},
    "outputs": [],
@@ -92,14 +174,22 @@
     "\"\"\""
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "3581b713",
+   "metadata": {},
+   "source": [
+    "To deserialize a schema from JSON simply call the `parse_raw()` method."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 4,
    "id": "6088c98a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "from kor import Object\n",
     "schema = Object.parse_raw(json)"
    ]
   },
@@ -213,7 +303,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.3"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,

diff --git a/kor/nodes.py b/kor/nodes.py
@@ -27,6 +27,9 @@
 # not worth the effort for a v0.
 VALID_IDENTIFIER_PATTERN = re.compile(r"^[a-z_][0-9a-z_]*$")
 
+# Name of field to store the type discriminator
+TYPE_DISCRIMINATOR_FIELD = "$type"
+
 T = TypeVar("T")
 
 
@@ -130,12 +133,16 @@ class ExtractionSchemaNode(AbstractSchemaNode, abc.ABC):
 
     examples: Sequence[Tuple[str, Union[str, Sequence[str]]]] = tuple()
 
+    def __init__(self, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        self.__dict__[TYPE_DISCRIMINATOR_FIELD] = type(self).__name__
+
     @classmethod
     def parse_obj(cls: Type[ExtractionSchemaNode], data: dict) -> ExtractionSchemaNode:
         """Parse an object."""
-        type_ = data.pop("$type", None)
+        type_ = data.pop(TYPE_DISCRIMINATOR_FIELD, None)
         if type_ is None:
-            raise ValueError("Need to specify type ($type)")
+            raise ValueError(f"Need to specify type ({TYPE_DISCRIMINATOR_FIELD})")
         for sub in cls.__subclasses__():
             if type_ == sub.__name__:
                 return sub(**data)

diff --git a/tests/test_serialization.py b/tests/test_serialization.py
@@ -0,0 +1,36 @@
+from typing import Any
+
+import pytest
+
+from kor import Bool, Number, Object, Text
+from kor.nodes import ExtractionSchemaNode
+
+
+@pytest.fixture(params=ExtractionSchemaNode.__subclasses__())
+def extraction_subclass(request: Any) -> Any:
+    return request.param
+
+
+def test_extractionschemanode_has_type_discriminator(
+    extraction_subclass: Any,
+) -> None:
+    sut = extraction_subclass(id="test")
+    assert sut.dict()["$type"] == extraction_subclass.__name__
+
+
+def test_serialize_deserialize_equals() -> None:
+    expected = Object(
+        id="root",
+        description="root-object",
+        attributes=[
+            Number(id="number", description="Number description", examples=[]),
+            Text(id="text", description="text description", examples=[]),
+            Bool(id="bool", description="bool description", examples=[]),
+        ],
+        examples=[],
+    )
+
+    json = expected.json()
+    sut = Object.parse_raw(json)
+
+    assert sut == expected