Skip to content

Commit

Permalink
Construct a new builder from a previously generated result (#435)
Browse files Browse the repository at this point in the history
* Construct a new builder from a previously generated result

* Add more tests
  • Loading branch information
mallport authored Feb 14, 2025
1 parent b85cc6b commit 6d405fb
Show file tree
Hide file tree
Showing 7 changed files with 501 additions and 4 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dapla-toolbelt-pseudo"
version = "2.2.7"
version = "2.2.8"
description = "Pseudonymization extensions for Dapla"
authors = ["Dapla Developers <[email protected]>"]
license = "MIT"
Expand Down
79 changes: 78 additions & 1 deletion src/dapla_pseudo/v1/depseudo.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""Builder for submitting a pseudonymization request."""

from datetime import date
from typing import Any
from typing import ClassVar

import pandas as pd
import polars as pl
from datadoc_model.model import MetadataContainer

from dapla_pseudo.constants import TIMEOUT_DEFAULT
from dapla_pseudo.constants import MapFailureStrategy
Expand All @@ -28,13 +30,17 @@ class Depseudonymize:
"""

dataset: File | pl.DataFrame
prev_metadata: dict[str, dict[str, list[Any]]] | None # Used in "from_result()"
prev_datadoc: MetadataContainer | None # Used in "from_result()"

@staticmethod
def from_pandas(
dataframe: pd.DataFrame, run_as_file: bool = False
) -> "Depseudonymize._Depseudonymizer":
"""Initialize a depseudonymization request from a pandas DataFrame."""
dataset: pl.DataFrame = pl.from_pandas(dataframe)
Depseudonymize.prev_metadata = None
Depseudonymize.prev_datadoc = None
if run_as_file:
file_handle, content_type = get_file_data_from_dataset(dataset)
Depseudonymize.dataset = File(file_handle, content_type)
Expand All @@ -47,6 +53,8 @@ def from_polars(
dataframe: pl.DataFrame, run_as_file: bool = False
) -> "Depseudonymize._Depseudonymizer":
"""Initialize a depseudonymization request from a polars DataFrame."""
Depseudonymize.prev_metadata = None
Depseudonymize.prev_datadoc = None
if run_as_file:
file_handle, content_type = get_file_data_from_dataset(dataframe)
Depseudonymize.dataset = File(file_handle, content_type)
Expand Down Expand Up @@ -77,10 +85,66 @@ def from_file(dataset: FileLikeDatasetDecl) -> "Depseudonymize._Depseudonymizer"
local_path = "some_file.csv"
field_selector = Depseudonymize.from_file(local_path))
"""
Depseudonymize.prev_metadata = None
Depseudonymize.prev_datadoc = None
file_handle, content_type = get_file_data_from_dataset(dataset)
Depseudonymize.dataset = File(file_handle, content_type)
return Depseudonymize._Depseudonymizer()

@staticmethod
def from_result(
    result: Result, run_as_file: bool = False
) -> "Depseudonymize._Depseudonymizer":
    """Initialize a depseudonymization request from a previously computed Result.

    This allows the user to compose results from different pseudonymization
    operations (pseudo/depseudo/repseudo), while preserving the metadata as if
    it were a single run.

    This should not be used for operations of the same pseudo operation,
    in which case the builder pattern is preserved.

    The metadata and datadoc of ``result`` are stored on the class so that the
    subsequent ``run()`` can add them to the new Result.

    Args:
        result: A previously pseudonymized DataFrame
        run_as_file: Force the dataset to be pseudonymized as a single file.

    Raises:
        ValueError: If ``run_as_file`` is False and the data structure in the
            "Result" object is not a DataFrame.

    Returns:
        _Depseudonymizer: An instance of the _Depseudonymizer class.

    Examples:
        result = (
            Pseudonymize
            .from_polars(df)
            .on_fields("fornavn","etternavn")
            .with_default_encryption()
            .run()
        )
        result = (
            Depseudonymize
            .from_result(result)
            .on_fields("bolig")
            .with_default_encryption()
            .run()
        )
        result.to_file("gs://ssb-play-obr-data-delt-ledstill-prod/")
    """
    pseudo_data = result._pseudo_data

    # Resolve and validate the dataset BEFORE touching any class-level state,
    # so a failure cannot leave the previous dataset paired with new metadata.
    dataset: File | pl.DataFrame
    if run_as_file:
        file_handle, content_type = get_file_data_from_dataset(pseudo_data)
        dataset = File(file_handle, content_type)
    elif isinstance(pseudo_data, pl.DataFrame):
        dataset = pseudo_data
    else:
        raise ValueError("Chaining pseudo results can only be done with DataFrames")

    # Everything succeeded: commit dataset and previous metadata together.
    Depseudonymize.dataset = dataset
    Depseudonymize.prev_metadata = result._metadata
    Depseudonymize.prev_datadoc = result._datadoc

    return Depseudonymize._Depseudonymizer()

class _Depseudonymizer(_BasePseudonymizer):
"""Select one or multiple fields to be pseudonymized."""

Expand Down Expand Up @@ -120,7 +184,20 @@ def run(
dataset=Depseudonymize.dataset,
hierarchical=hierarchical,
)
return super()._execute_pseudo_operation(self.rules, timeout, custom_keyset)

result = super()._execute_pseudo_operation(
self.rules, timeout, custom_keyset
)
if (
Depseudonymize.prev_datadoc is not None
and Depseudonymize.prev_metadata is not None
): # Add metadata from previous Result
result.add_previous_metadata(
Depseudonymize.prev_metadata, Depseudonymize.prev_datadoc
)
return result
else:
return result

class _DepseudoFuncSelector(_BaseRuleConstructor):
def __init__(self, fields: list[str]) -> None:
Expand Down
80 changes: 79 additions & 1 deletion src/dapla_pseudo/v1/pseudo.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""Builder for submitting a pseudonymization request."""

from datetime import date
from typing import Any
from typing import ClassVar

import pandas as pd
import polars as pl
from datadoc_model.model import MetadataContainer

from dapla_pseudo.constants import TIMEOUT_DEFAULT
from dapla_pseudo.constants import MapFailureStrategy
Expand All @@ -28,6 +30,8 @@ class Pseudonymize:
"""

dataset: File | pl.DataFrame
prev_metadata: dict[str, dict[str, list[Any]]] | None # Used in "from_result()"
prev_datadoc: MetadataContainer | None # Used in "from_result()"

@staticmethod
def from_pandas(
Expand All @@ -43,6 +47,8 @@ def from_pandas(
_Pseudonymizer: An instance of the _Pseudonymizer class.
"""
dataset: pl.DataFrame = pl.from_pandas(dataframe)
Pseudonymize.prev_metadata = None
Pseudonymize.prev_datadoc = None
if run_as_file:
file_handle, content_type = get_file_data_from_dataset(dataset)
Pseudonymize.dataset = File(file_handle, content_type)
Expand All @@ -63,6 +69,8 @@ def from_polars(
Returns:
_Pseudonymizer: An instance of the _Pseudonymizer class.
"""
Pseudonymize.prev_metadata = None
Pseudonymize.prev_datadoc = None
if run_as_file:
file_handle, content_type = get_file_data_from_dataset(dataframe)
Pseudonymize.dataset = File(file_handle, content_type)
Expand Down Expand Up @@ -93,10 +101,67 @@ def from_file(dataset: FileLikeDatasetDecl) -> "Pseudonymize._Pseudonymizer":
local_path = "some_file.csv"
field_selector = Pseudonymize.from_file(local_path))
"""
Pseudonymize.prev_metadata = None
Pseudonymize.prev_datadoc = None

file_handle, content_type = get_file_data_from_dataset(dataset)
Pseudonymize.dataset = File(file_handle, content_type)
return Pseudonymize._Pseudonymizer()

@staticmethod
def from_result(
    result: Result, run_as_file: bool = False
) -> "Pseudonymize._Pseudonymizer":
    """Initialize a pseudonymization request from a previously computed Result.

    This allows the user to compose results from different pseudonymization
    operations (pseudo/depseudo/repseudo), while preserving the metadata as if
    it were a single run.

    This should not be used for operations of the same pseudo operation,
    in which case the builder pattern is preserved.

    The metadata and datadoc of ``result`` are stored on the class so that the
    subsequent ``run()`` can add them to the new Result.

    Args:
        result: A previously pseudonymized DataFrame
        run_as_file: Force the dataset to be pseudonymized as a single file.

    Raises:
        ValueError: If ``run_as_file`` is False and the data structure in the
            "Result" object is not a DataFrame.

    Returns:
        _Pseudonymizer: An instance of the _Pseudonymizer class.

    Examples:
        result = (
            Pseudonymize
            .from_polars(df)
            .on_fields("fornavn","etternavn")
            .with_default_encryption()
            .run()
        )
        result = (
            Depseudonymize
            .from_result(result)
            .on_fields("bolig")
            .with_default_encryption()
            .run()
        )
        result.to_file("gs://ssb-play-obr-data-delt-ledstill-prod/")
    """
    pseudo_data = result._pseudo_data

    # Resolve and validate the dataset BEFORE touching any class-level state,
    # so a failure cannot leave the previous dataset paired with new metadata.
    dataset: File | pl.DataFrame
    if run_as_file:
        file_handle, content_type = get_file_data_from_dataset(pseudo_data)
        dataset = File(file_handle, content_type)
    elif isinstance(pseudo_data, pl.DataFrame):
        dataset = pseudo_data
    else:
        raise ValueError("Chaining pseudo results can only be done with DataFrames")

    # Everything succeeded: commit dataset and previous metadata together.
    Pseudonymize.dataset = dataset
    Pseudonymize.prev_metadata = result._metadata
    Pseudonymize.prev_datadoc = result._datadoc

    return Pseudonymize._Pseudonymizer()

class _Pseudonymizer(_BasePseudonymizer):
"""Select one or multiple fields to be pseudonymized."""

Expand Down Expand Up @@ -143,7 +208,20 @@ def run(
dataset=Pseudonymize.dataset,
hierarchical=hierarchical,
)
return super()._execute_pseudo_operation(self.rules, timeout, custom_keyset)

result = super()._execute_pseudo_operation(
self.rules, timeout, custom_keyset
)
if (
Pseudonymize.prev_datadoc is not None
and Pseudonymize.prev_metadata is not None
): # Add metadata from previous Result
result.add_previous_metadata(
Pseudonymize.prev_metadata, Pseudonymize.prev_datadoc
)
return result
else:
return result

class _PseudoFuncSelector(_BaseRuleConstructor):
def __init__(self, fields: list[str]) -> None:
Expand Down
75 changes: 74 additions & 1 deletion src/dapla_pseudo/v1/repseudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import pandas as pd
import polars as pl
from datadoc_model.model import MetadataContainer

from dapla_pseudo.constants import TIMEOUT_DEFAULT
from dapla_pseudo.constants import PredefinedKeys
Expand All @@ -27,13 +28,17 @@ class Repseudonymize:
"""

dataset: File | pl.DataFrame
prev_metadata: dict[str, dict[str, list[t.Any]]] | None # Used in "from_result()"
prev_datadoc: MetadataContainer | None # Used in "from_result()"

@staticmethod
def from_pandas(
dataframe: pd.DataFrame, run_as_file: bool = False
) -> "Repseudonymize._Repseudonymizer":
"""Initialize a pseudonymization request from a pandas DataFrame."""
dataset: pl.DataFrame = pl.from_pandas(dataframe)
Repseudonymize.prev_metadata = None
Repseudonymize.prev_datadoc = None
if run_as_file:
file_handle, content_type = get_file_data_from_dataset(dataset)
Repseudonymize.dataset = File(file_handle, content_type)
Expand All @@ -46,6 +51,8 @@ def from_polars(
dataframe: pl.DataFrame, run_as_file: bool = False
) -> "Repseudonymize._Repseudonymizer":
"""Initialize a pseudonymization request from a polars DataFrame."""
Repseudonymize.prev_metadata = None
Repseudonymize.prev_datadoc = None
if run_as_file:
file_handle, content_type = get_file_data_from_dataset(dataframe)
Repseudonymize.dataset = File(file_handle, content_type)
Expand Down Expand Up @@ -76,10 +83,66 @@ def from_file(dataset: FileLikeDatasetDecl) -> "Repseudonymize._Repseudonymizer"
local_path = "some_file.csv"
field_selector = Pseudonymize.from_file(local_path))
"""
Repseudonymize.prev_metadata = None
Repseudonymize.prev_datadoc = None
file_handle, content_type = get_file_data_from_dataset(dataset)
Repseudonymize.dataset = File(file_handle, content_type)
return Repseudonymize._Repseudonymizer()

@staticmethod
def from_result(
    result: Result, run_as_file: bool = False
) -> "Repseudonymize._Repseudonymizer":
    """Initialize a repseudonymization request from a previously computed Result.

    This allows the user to compose results from different pseudonymization
    operations (pseudo/depseudo/repseudo), while preserving the metadata as if
    it were a single run.

    This should not be used for operations of the same pseudo operation,
    in which case the builder pattern is preserved.

    The metadata and datadoc of ``result`` are stored on the class so that the
    subsequent ``run()`` can add them to the new Result.

    Args:
        result: A previously pseudonymized DataFrame
        run_as_file: Force the dataset to be pseudonymized as a single file.

    Raises:
        ValueError: If ``run_as_file`` is False and the data structure in the
            "Result" object is not a DataFrame.

    Returns:
        _Repseudonymizer: An instance of the _Repseudonymizer class.

    Examples:
        result = (
            Pseudonymize
            .from_polars(df)
            .on_fields("fornavn","etternavn")
            .with_default_encryption()
            .run()
        )
        result = (
            Repseudonymize
            .from_result(result)
            .on_fields("bolig")
            .with_default_encryption()
            .run()
        )
        result.to_file("gs://ssb-play-obr-data-delt-ledstill-prod/")
    """
    pseudo_data = result._pseudo_data

    # Resolve and validate the dataset BEFORE touching any class-level state,
    # so a failure cannot leave the previous dataset paired with new metadata.
    dataset: File | pl.DataFrame
    if run_as_file:
        file_handle, content_type = get_file_data_from_dataset(pseudo_data)
        dataset = File(file_handle, content_type)
    elif isinstance(pseudo_data, pl.DataFrame):
        dataset = pseudo_data
    else:
        raise ValueError("Chaining pseudo results can only be done with DataFrames")

    # Everything succeeded: commit dataset and previous metadata together.
    Repseudonymize.dataset = dataset
    Repseudonymize.prev_metadata = result._metadata
    Repseudonymize.prev_datadoc = result._datadoc

    return Repseudonymize._Repseudonymizer()

class _Repseudonymizer(_BasePseudonymizer):
"""Select one or multiple fields to be pseudonymized."""

Expand Down Expand Up @@ -135,13 +198,23 @@ def run(
hierarchical=hierarchical,
)

return super()._execute_pseudo_operation(
result = super()._execute_pseudo_operation(
rules=self.source_rules,
target_rules=self.target_rules,
custom_keyset=source_custom_keyset,
target_custom_keyset=target_custom_keyset,
timeout=timeout,
)
if (
Repseudonymize.prev_datadoc is not None
and Repseudonymize.prev_metadata is not None
): # Add metadata from previous Result
result.add_previous_metadata(
Repseudonymize.prev_metadata, Repseudonymize.prev_datadoc
)
return result
else:
return result

class _RepseudoFuncSelectorSource(_BaseRuleConstructor):
def __init__(self, fields: list[str]) -> None:
Expand Down
Loading

0 comments on commit 6d405fb

Please sign in to comment.