Commit

Write metadata to file (#351)
* Initial commit

* Add private/ to .gitignore for local files

* Update poetry.lock

* Properly write files

* Make CI happy

* Add Datadoc metadata README

* Minor CI fixes
mallport authored Mar 7, 2024
1 parent da49ee8 commit 0a6a41d
Showing 9 changed files with 439 additions and 218 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -134,3 +134,5 @@ dmypy.json
/.python-version
/.pytype/
/docs/_build/

private/
50 changes: 49 additions & 1 deletion README.md
@@ -264,15 +264,63 @@ result_df = (
)
```


_Note that depseudonymization requires elevated access privileges._

### Repseudonymize

```python
## TODO
```

### Datadoc

Datadoc metadata is gathered while pseudonymizing and can be inspected like so:

```python
result = (
Pseudonymize.from_polars(df)
.on_fields("fornavn")
.with_default_encryption()
.run()
)

print(result.datadoc)
```
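
Since the metadata is serialized as JSON (it is what `to_file()` writes to the
`__DOC.json` file described below), it can also be parsed for programmatic
inspection. A minimal sketch:

```python
import json

# Parse the Datadoc JSON string to inspect individual fields (sketch;
# the exact schema is defined by the ssb-datadoc-model package).
metadata = json.loads(result.datadoc)
print(metadata.keys())
```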

Datadoc metadata is automatically written to the same folder or bucket as the
pseudonymized data when using the `to_file()` method on the result object.
The metadata file has the suffix `__DOC` and is always a `.json` file.
The data and metadata are written like so:

```python
result = (
Pseudonymize.from_polars(df)
.on_fields("fornavn")
.with_default_encryption()
.run()
)

# The line of code below also writes the file "gs://bucket/test__DOC.json"
result.to_file("gs://bucket/test.parquet")
```
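
`to_file()` works the same way with local file paths. A minimal sketch, assuming
a writable local directory `data/` (the path is hypothetical):

```python
# Writing locally (sketch): the metadata file lands next to the data file.
result.to_file("data/test.parquet")  # also writes "data/test__DOC.json"
```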

Note that if you use only the DataFrame from the result, **the metadata will be lost forever**!
An example of how this can happen:

```python
import dapla as dp
result = (
Pseudonymize.from_polars(df)
.on_fields("fornavn")
.with_default_encryption()
.run()
)
df = result.to_pandas()

dp.write_pandas(df, "gs://bucket/test.parquet", file_format="parquet") # The metadata is lost!!
```
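
If you do need to write the DataFrame yourself, a workaround is to persist the
metadata manually. A sketch, assuming `result.datadoc` holds the metadata as a
JSON string (it is what `to_file()` writes to the `__DOC` file) and a
hypothetical bucket path:

```python
import dapla as dp

# Write the data as before...
dp.write_pandas(df, "gs://bucket/test.parquet", file_format="parquet")

# ...and write the Datadoc metadata yourself so it is not lost (sketch).
with dp.FileClient().gcs_open("gs://bucket/test__DOC.json", mode="w") as f:
    f.write(result.datadoc)
```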


## Requirements
483 changes: 298 additions & 185 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dapla-toolbelt-pseudo"
version = "1.2.2"
version = "1.3.0"
description = "Pseudonymization extensions for Dapla"
authors = ["Dapla Developers <[email protected]>"]
license = "MIT"
@@ -29,7 +29,8 @@ polars = ">=0.18.2"
pygments = ">2.15.0"
click = ">=8.0.1"
ssb-datadoc-model = ">=5.0.0"

cloudpathlib = { extras = ["gs"], version = ">=0.17.0" }
pyarrow = ">=14.0.2, <15"

[tool.poetry.group.test.dependencies]
typeguard = ">=2.13.3"
2 changes: 1 addition & 1 deletion src/dapla_pseudo/v1/client.py
@@ -49,7 +49,7 @@ def __auth_token(self) -> str:
auth_req = google.auth.transport.requests.Request()
token = t.cast(
str,
google.oauth2.id_token.fetch_id_token(auth_req, audience), # type: ignore[no-untyped-call]
google.oauth2.id_token.fetch_id_token(auth_req, audience),
)
return token
else:
62 changes: 39 additions & 23 deletions src/dapla_pseudo/v1/result.py
@@ -1,22 +1,25 @@
"""Common API models for builder packages."""

import json
import typing as t
from io import BufferedWriter
from pathlib import Path
from typing import Any

import pandas as pd
import polars as pl
from dapla import FileClient
from cloudpathlib import GSClient
from cloudpathlib import GSPath
from dapla import AuthClient
from datadoc_model.model import MetadataContainer
from datadoc_model.model import PseudonymizationMetadata
from datadoc_model.model import PseudoVariable
from fsspec.spec import AbstractBufferedFile

from dapla_pseudo.utils import get_file_format_from_file_name
from dapla_pseudo.v1.pseudo_commons import PseudoFieldResponse
from dapla_pseudo.v1.pseudo_commons import PseudoFileResponse
from dapla_pseudo.v1.supported_file_format import SupportedOutputFileFormat
from dapla_pseudo.v1.supported_file_format import write_from_df
from dapla_pseudo.v1.supported_file_format import write_from_dicts


class Result:
@@ -120,11 +123,11 @@ def to_pandas(self, **kwargs: t.Any) -> pd.DataFrame:
case _ as invalid_pseudo_data:
raise ValueError(f"Invalid response type: {type(invalid_pseudo_data)}")

def to_file(self, file_path: str | Path, **kwargs: t.Any) -> None:
"""Write pseudonymized data to a file.
def to_file(self, file_path: str, **kwargs: t.Any) -> None:
"""Write pseudonymized data to a file, with the metadata being written to the same folder.
Args:
file_path (str | Path): The path to the file to be written.
file_path (str): The path to the file to be written. If writing to a bucket, use the "gs://" prefix.
**kwargs: Additional keyword arguments to be passed to the Polars writer function *if* the input data is a DataFrame.
The specific writer function depends on the format of the output file, e.g. `write_csv()` for CSV files.
@@ -135,29 +138,42 @@ def to_file(self, file_path: str | Path, **kwargs: t.Any) -> None:
"""
file_format = get_file_format_from_file_name(file_path)

datadoc_file_name = f"{Path(file_path).stem}__DOC.json"

datadoc_file_path: Path | GSPath
if file_path.startswith(GSPath.cloud_prefix):
client = GSClient(credentials=AuthClient.fetch_google_credentials())
gs_path = GSPath(file_path, client)

file_handle = gs_path.open(mode="wb")

datadoc_file_path = gs_path.parent.joinpath(Path(datadoc_file_name))
datadoc_file_handle = datadoc_file_path.open(mode="w")
else:
file_handle = Path(file_path).open(mode="wb")

datadoc_file_path = Path(file_path).parent.joinpath(Path(datadoc_file_name))
datadoc_file_handle = datadoc_file_path.open(mode="w")

file_handle = t.cast(
BufferedWriter, file_handle
) # file handle is always BufferedWriter when opening with "wb"

match self._pseudo_data:
case pl.DataFrame() as df:
if str(file_path).startswith("gs://"):
with FileClient().gcs_open(
str(file_path), mode="wb"
) as file_handle:
# If we ask for a file to be opened in binary mode, we know that the type is AbstractBufferedFile
file_handle = t.cast(AbstractBufferedFile, file_handle)
write_from_df(df, file_format, file_handle, **kwargs)
else:
write_from_df(df, file_format, str(file_path), **kwargs)
write_from_df(df, file_format, file_handle, **kwargs)
datadoc_file_handle.write(self.datadoc)
case list() as file_data:
if str(file_path).startswith("gs://"):
with FileClient().gcs_open(str(file_path), mode="w") as file_handle:
# If we ask for a file to be opened in binary mode, we know that the type is AbstractBufferedFile
file_handle.write(json.dumps(file_data))
else:
with open(file_path, mode="w") as file_handle:
file_handle.write(json.dumps(file_data))

write_from_dicts(
file_data, SupportedOutputFileFormat(file_format), file_handle
)
datadoc_file_handle.write(self.datadoc)
case _ as invalid_pseudo_data:
raise ValueError(f"Invalid response type: {type(invalid_pseudo_data)}")

file_handle.close()
datadoc_file_handle.close()

@property
def metadata(self) -> dict[str, Any]:
"""Returns the pseudonymization metadata as a dictionary.
32 changes: 29 additions & 3 deletions src/dapla_pseudo/v1/supported_file_format.py
@@ -1,13 +1,15 @@
"""Classes used to support reading of dataframes from file."""

import json
import typing as t
from enum import Enum
from io import BufferedWriter
from io import BytesIO
from pathlib import Path
from typing import Any

import pandas as pd
import polars as pl
from fsspec.spec import AbstractBufferedFile

from dapla_pseudo.exceptions import ExtensionNotValidError

@@ -77,10 +79,33 @@ def read_to_polars_df(
)


def write_from_dicts(
data: list[dict[str, t.Any]],
supported_format: SupportedOutputFileFormat,
file_like: BufferedWriter,
) -> None:
"""Writes data from a list of dicts to a file of the given format."""
match supported_format:
case SupportedOutputFileFormat.PARQUET:
df = pl.DataFrame(data)
# type hints lying
df.write_parquet(file_like) # type: ignore[arg-type]
case SupportedOutputFileFormat.CSV:
df = pl.DataFrame(data)
df.write_csv(file_like)
case SupportedOutputFileFormat.JSON:
file_like.write(bytes(json.dumps(data), encoding="utf-8"))
case SupportedOutputFileFormat.XML:
df_pandas = pd.DataFrame.from_records(data)
df_pandas.to_xml(file_like)
case _:
raise ValueError("Unsupported output file format")


def write_from_df(
df: pl.DataFrame,
supported_format: SupportedOutputFileFormat,
file_like: AbstractBufferedFile | str,
file_like: BufferedWriter,
**kwargs: Any,
) -> None:
"""Writes to a file with a supported file format from a Dataframe."""
@@ -92,4 +117,5 @@ def write_from_df(
case SupportedOutputFileFormat.XML:
df.to_pandas().to_xml(file_like, **kwargs)
case SupportedOutputFileFormat.PARQUET:
df.write_parquet(file_like, **kwargs)
# type hints lying
df.write_parquet(file_like, **kwargs) # type: ignore[arg-type]
4 changes: 2 additions & 2 deletions tests/v1/test_result.py
@@ -70,7 +70,7 @@ def test_result_from_polars_to_pandas(df_personer: pl.DataFrame) -> None:

def test_result_from_polars_to_file(tmp_path: Path, df_personer: pl.DataFrame) -> None:
result = Result(PseudoFieldResponse(data=df_personer, raw_metadata=[]))
result.to_file(tmp_path / "polars_to_file.json")
result.to_file(str(tmp_path / "polars_to_file.json"))


def test_result_from_file_to_polars(pseudo_file_response: PseudoFileResponse) -> None:
@@ -88,4 +88,4 @@ def test_result_from_file_to_file(
) -> None:
result = Result(pseudo_response=pseudo_file_response)
file_extension = pseudo_file_response.content_type.name.lower()
result.to_file(tmp_path / f"file_to_file.{file_extension}")
result.to_file(str(tmp_path / f"file_to_file.{file_extension}"))
17 changes: 16 additions & 1 deletion tests/v1/test_supported_file_format.py
@@ -1,3 +1,4 @@
import json
from pathlib import Path

import pandas as pd
@@ -9,6 +10,7 @@
from dapla_pseudo.v1.supported_file_format import read_to_pandas_df
from dapla_pseudo.v1.supported_file_format import read_to_polars_df
from dapla_pseudo.v1.supported_file_format import write_from_df
from dapla_pseudo.v1.supported_file_format import write_from_dicts

PKG = "dapla_pseudo.v1.supported_file_format"
TEST_FILE_PATH = "tests/v1/test_files"
@@ -51,5 +53,18 @@ def test_write_from_df(
tmp_path: Path, df_personer: pl.DataFrame, file_format: str
) -> None:
supported_format = SupportedOutputFileFormat(file_format)
file_handle = open(f"{tmp_path}/test.{file_format}", mode="wb")
write_from_df(df_personer, supported_format, file_handle)

write_from_df(df_personer, supported_format, f"{tmp_path}/test.{file_format}")

@pytest.mark.parametrize("file_format", ["json", "csv", "parquet", "xml"])
def test_write_from_dicts(
tmp_path: Path, personer_file_path: str, file_format: str
) -> None:
supported_format = SupportedOutputFileFormat(file_format)
print(open(personer_file_path).read())
file_data = json.loads(open(personer_file_path).read())
assert isinstance(file_data, list)

dest_path = tmp_path / f"test.{file_format}"
write_from_dicts(file_data, supported_format, open(dest_path, mode="wb"))
