Commit

Write metadata to file (#351)
* Initial commit

* Add private/ to .gitignore for local files

* Update poetry.lock

* Properly write files

* Make CI happy

* Add Datadoc metadata README

* Minor CI fixes
mallport authored Mar 7, 2024
1 parent da49ee8 commit 0a6a41d
Showing 9 changed files with 439 additions and 218 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -134,3 +134,5 @@ dmypy.json
/.python-version
/.pytype/
/docs/_build/

private/
50 changes: 49 additions & 1 deletion README.md
@@ -264,15 +264,63 @@ result_df = (
)
```


_Note that depseudonymization requires elevated access privileges._

### Repseudonymize

```python
## TODO
```

### Datadoc

Datadoc metadata is gathered while pseudonymizing and can be inspected like so:

```python
result = (
Pseudonymize.from_polars(df)
.on_fields("fornavn")
.with_default_encryption()
.run()
)

print(result.datadoc)
```
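
Since the metadata is serialized as JSON (it is what `to_file()` writes to the
`__DOC.json` file described below), it can also be parsed for programmatic
inspection. A minimal sketch:

```python
import json

# Parse the Datadoc JSON string to inspect individual fields (sketch;
# the exact schema is defined by the ssb-datadoc-model package).
metadata = json.loads(result.datadoc)
print(metadata.keys())
```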

Datadoc metadata is automatically written to the same folder or bucket as the
pseudonymized data when using the `to_file()` method on the result object.
The metadata file has the suffix `__DOC` and is always a `.json` file.
The data and metadata are written like so:

```python
result = (
Pseudonymize.from_polars(df)
.on_fields("fornavn")
.with_default_encryption()
.run()
)

# The line of code below also writes the file "gs://bucket/test__DOC.json"
result.to_file("gs://bucket/test.parquet")
```
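
`to_file()` works the same way with local file paths. A minimal sketch, assuming
a writable local directory `data/` (the path is hypothetical):

```python
# Writing locally (sketch): the metadata file lands next to the data file.
result.to_file("data/test.parquet")  # also writes "data/test__DOC.json"
```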

Note that if you use only the DataFrame from the result, **the metadata will be lost forever**!
An example of how this can happen:

```python
import dapla as dp
result = (
Pseudonymize.from_polars(df)
.on_fields("fornavn")
.with_default_encryption()
.run()
)
df = result.to_pandas()

dp.write_pandas(df, "gs://bucket/test.parquet", file_format="parquet") # The metadata is lost!!
```
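
If you do need to write the DataFrame yourself, a workaround is to persist the
metadata manually. A sketch, assuming `result.datadoc` holds the metadata as a
JSON string (it is what `to_file()` writes to the `__DOC` file) and a
hypothetical bucket path:

```python
import dapla as dp

# Write the data as before...
dp.write_pandas(df, "gs://bucket/test.parquet", file_format="parquet")

# ...and write the Datadoc metadata yourself so it is not lost (sketch).
with dp.FileClient().gcs_open("gs://bucket/test__DOC.json", mode="w") as f:
    f.write(result.datadoc)
```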


## Requirements
483 changes: 298 additions & 185 deletions poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dapla-toolbelt-pseudo"
version = "1.2.2"
version = "1.3.0"
description = "Pseudonymization extensions for Dapla"
authors = ["Dapla Developers <[email protected]>"]
license = "MIT"
@@ -29,7 +29,8 @@ polars = ">=0.18.2"
pygments = ">2.15.0"
click = ">=8.0.1"
ssb-datadoc-model = ">=5.0.0"

cloudpathlib = { extras = ["gs"], version = ">=0.17.0" }
pyarrow = ">=14.0.2, <15"

[tool.poetry.group.test.dependencies]
typeguard = ">=2.13.3"
2 changes: 1 addition & 1 deletion src/dapla_pseudo/v1/client.py
@@ -49,7 +49,7 @@ def __auth_token(self) -> str:
auth_req = google.auth.transport.requests.Request()
token = t.cast(
str,
google.oauth2.id_token.fetch_id_token(auth_req, audience), # type: ignore[no-untyped-call]
google.oauth2.id_token.fetch_id_token(auth_req, audience),
)
return token
else:
62 changes: 39 additions & 23 deletions src/dapla_pseudo/v1/result.py
@@ -1,22 +1,25 @@
"""Common API models for builder packages."""

import json
import typing as t
from io import BufferedWriter
from pathlib import Path
from typing import Any

import pandas as pd
import polars as pl
from dapla import FileClient
from cloudpathlib import GSClient
from cloudpathlib import GSPath
from dapla import AuthClient
from datadoc_model.model import MetadataContainer
from datadoc_model.model import PseudonymizationMetadata
from datadoc_model.model import PseudoVariable
from fsspec.spec import AbstractBufferedFile

from dapla_pseudo.utils import get_file_format_from_file_name
from dapla_pseudo.v1.pseudo_commons import PseudoFieldResponse
from dapla_pseudo.v1.pseudo_commons import PseudoFileResponse
from dapla_pseudo.v1.supported_file_format import SupportedOutputFileFormat
from dapla_pseudo.v1.supported_file_format import write_from_df
from dapla_pseudo.v1.supported_file_format import write_from_dicts


class Result:
@@ -120,11 +123,11 @@ def to_pandas(self, **kwargs: t.Any) -> pd.DataFrame:
case _ as invalid_pseudo_data:
raise ValueError(f"Invalid response type: {type(invalid_pseudo_data)}")

def to_file(self, file_path: str | Path, **kwargs: t.Any) -> None:
"""Write pseudonymized data to a file.
def to_file(self, file_path: str, **kwargs: t.Any) -> None:
"""Write pseudonymized data to a file, with the metadata being written to the same folder.
Args:
file_path (str | Path): The path to the file to be written.
file_path (str): The path to the file to be written. If writing to a bucket, use the "gs://" prefix.
**kwargs: Additional keyword arguments to be passed to the Polars writer function *if* the input data is a DataFrame.
The specific writer function depends on the format of the output file, e.g. `write_csv()` for CSV files.
@@ -135,29 +138,42 @@ def to_file(self, file_path: str | Path, **kwargs: t.Any) -> None:
"""
file_format = get_file_format_from_file_name(file_path)

datadoc_file_name = f"{Path(file_path).stem}__DOC.json"

datadoc_file_path: Path | GSPath
if file_path.startswith(GSPath.cloud_prefix):
client = GSClient(credentials=AuthClient.fetch_google_credentials())
gs_path = GSPath(file_path, client)

file_handle = gs_path.open(mode="wb")

datadoc_file_path = gs_path.parent.joinpath(Path(datadoc_file_name))
datadoc_file_handle = datadoc_file_path.open(mode="w")
else:
file_handle = Path(file_path).open(mode="wb")

datadoc_file_path = Path(file_path).parent.joinpath(Path(datadoc_file_name))
datadoc_file_handle = datadoc_file_path.open(mode="w")

file_handle = t.cast(
BufferedWriter, file_handle
) # file handle is always BufferedWriter when opening with "wb"

match self._pseudo_data:
case pl.DataFrame() as df:
if str(file_path).startswith("gs://"):
with FileClient().gcs_open(
str(file_path), mode="wb"
) as file_handle:
# If we ask for a file to be opened in binary mode, we know that the type is AbstractBufferedFile
file_handle = t.cast(AbstractBufferedFile, file_handle)
write_from_df(df, file_format, file_handle, **kwargs)
else:
write_from_df(df, file_format, str(file_path), **kwargs)
write_from_df(df, file_format, file_handle, **kwargs)
datadoc_file_handle.write(self.datadoc)
case list() as file_data:
if str(file_path).startswith("gs://"):
with FileClient().gcs_open(str(file_path), mode="w") as file_handle:
# If we ask for a file to be opened in binary mode, we know that the type is AbstractBufferedFile
file_handle.write(json.dumps(file_data))
else:
with open(file_path, mode="w") as file_handle:
file_handle.write(json.dumps(file_data))

write_from_dicts(
file_data, SupportedOutputFileFormat(file_format), file_handle
)
datadoc_file_handle.write(self.datadoc)
case _ as invalid_pseudo_data:
raise ValueError(f"Invalid response type: {type(invalid_pseudo_data)}")

file_handle.close()
datadoc_file_handle.close()

@property
def metadata(self) -> dict[str, Any]:
"""Returns the pseudonymization metadata as a dictionary.
32 changes: 29 additions & 3 deletions src/dapla_pseudo/v1/supported_file_format.py
@@ -1,13 +1,15 @@
"""Classes used to support reading of dataframes from file."""

import json
import typing as t
from enum import Enum
from io import BufferedWriter
from io import BytesIO
from pathlib import Path
from typing import Any

import pandas as pd
import polars as pl
from fsspec.spec import AbstractBufferedFile

from dapla_pseudo.exceptions import ExtensionNotValidError

@@ -77,10 +79,33 @@ def read_to_polars_df(
)


def write_from_dicts(
data: list[dict[str, t.Any]],
supported_format: SupportedOutputFileFormat,
file_like: BufferedWriter,
) -> None:
"""Writes data from a list of dicts to a file of the given format."""
match supported_format:
case SupportedOutputFileFormat.PARQUET:
df = pl.DataFrame(data)
# type hints lying
df.write_parquet(file_like) # type: ignore[arg-type]
case SupportedOutputFileFormat.CSV:
df = pl.DataFrame(data)
df.write_csv(file_like)
case SupportedOutputFileFormat.JSON:
file_like.write(bytes(json.dumps(data), encoding="utf-8"))
case SupportedOutputFileFormat.XML:
df_pandas = pd.DataFrame.from_records(data)
df_pandas.to_xml(file_like)
case _:
raise ValueError("Unsupported output file format")


def write_from_df(
df: pl.DataFrame,
supported_format: SupportedOutputFileFormat,
file_like: AbstractBufferedFile | str,
file_like: BufferedWriter,
**kwargs: Any,
) -> None:
"""Writes to a file with a supported file format from a Dataframe."""
@@ -92,4 +117,5 @@ def write_from_df(
case SupportedOutputFileFormat.XML:
df.to_pandas().to_xml(file_like, **kwargs)
case SupportedOutputFileFormat.PARQUET:
df.write_parquet(file_like, **kwargs)
# type hints lying
df.write_parquet(file_like, **kwargs) # type: ignore[arg-type]
4 changes: 2 additions & 2 deletions tests/v1/test_result.py
@@ -70,7 +70,7 @@ def test_result_from_polars_to_pandas(df_personer: pl.DataFrame) -> None:

def test_result_from_polars_to_file(tmp_path: Path, df_personer: pl.DataFrame) -> None:
result = Result(PseudoFieldResponse(data=df_personer, raw_metadata=[]))
result.to_file(tmp_path / "polars_to_file.json")
result.to_file(str(tmp_path / "polars_to_file.json"))


def test_result_from_file_to_polars(pseudo_file_response: PseudoFileResponse) -> None:
@@ -88,4 +88,4 @@ def test_result_from_file_to_file(
) -> None:
result = Result(pseudo_response=pseudo_file_response)
file_extension = pseudo_file_response.content_type.name.lower()
result.to_file(tmp_path / f"file_to_file.{file_extension}")
result.to_file(str(tmp_path / f"file_to_file.{file_extension}"))
17 changes: 16 additions & 1 deletion tests/v1/test_supported_file_format.py
@@ -1,3 +1,4 @@
import json
from pathlib import Path

import pandas as pd
@@ -9,6 +10,7 @@
from dapla_pseudo.v1.supported_file_format import read_to_pandas_df
from dapla_pseudo.v1.supported_file_format import read_to_polars_df
from dapla_pseudo.v1.supported_file_format import write_from_df
from dapla_pseudo.v1.supported_file_format import write_from_dicts

PKG = "dapla_pseudo.v1.supported_file_format"
TEST_FILE_PATH = "tests/v1/test_files"
@@ -51,5 +53,18 @@ def test_write_from_df(
tmp_path: Path, df_personer: pl.DataFrame, file_format: str
) -> None:
supported_format = SupportedOutputFileFormat(file_format)
file_handle = open(f"{tmp_path}/test.{file_format}", mode="wb")
write_from_df(df_personer, supported_format, file_handle)

write_from_df(df_personer, supported_format, f"{tmp_path}/test.{file_format}")

@pytest.mark.parametrize("file_format", ["json", "csv", "parquet", "xml"])
def test_write_from_dicts(
tmp_path: Path, personer_file_path: str, file_format: str
) -> None:
supported_format = SupportedOutputFileFormat(file_format)
print(open(personer_file_path).read())
file_data = json.loads(open(personer_file_path).read())
assert isinstance(file_data, list)

dest_path = tmp_path / f"test.{file_format}"
write_from_dicts(file_data, supported_format, open(dest_path, mode="wb"))
