diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..b5126420 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,93 @@ +from unittest.mock import MagicMock + +import pandas as pd +import polars as pl +import pytest + + +@pytest.fixture +def df_personer() -> pl.DataFrame: + JSON_FILE = "tests/data/personer.json" + return pl.read_json( + JSON_FILE, + schema={ + "fnr": pl.String, + "fornavn": pl.String, + "etternavn": pl.String, + "kjonn": pl.String, + "fodselsdato": pl.String, + }, + ) + + +@pytest.fixture +def df_personer_pandas() -> pd.DataFrame: + JSON_FILE = "tests/data/personer.json" + return pd.read_json( + JSON_FILE, + dtype={ + "fnr": str, + "fornavn": str, + "etternavn": str, + "kjonn": str, + "fodselsdato": str, + }, + ) + + +@pytest.fixture() +def personer_hierarch_file_path() -> str: + return "tests/data/personer_hierarchical.json" + + +@pytest.fixture() +def personer_pseudonymized_hierarch_file_path() -> str: + return "tests/data/personer_hierarchical_pseudonymized.json" + + +@pytest.fixture +def personer_file_path() -> str: + return "tests/data/personer.json" + + +@pytest.fixture() +def personer_pseudonymized_file_path() -> str: + return "tests/data/personer_pseudonymized_default_encryption.json" + + +@pytest.fixture +def df_personer_fnr_daead_encrypted() -> pl.DataFrame: + JSON_FILE = "tests/data/personer_pseudonymized_default_encryption.json" + return pl.read_json( + JSON_FILE, + schema={ + "fnr": pl.String, + "fornavn": pl.String, + "etternavn": pl.String, + "kjonn": pl.String, + "fodselsdato": pl.String, + }, + ) + + +@pytest.fixture +def df_pandas_personer_fnr_daead_encrypted() -> pd.DataFrame: + JSON_FILE = "tests/data/personer_pseudonymized_default_encryption.json" + return pd.read_json( + JSON_FILE, + dtype={ + "fnr": str, + "fornavn": str, + "etternavn": str, + "kjonn": str, + "fodselsdato": str, + }, + ) + + +@pytest.fixture() +def single_field_response() -> MagicMock: + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.content = b'{"data": ["Donald","Mikke","Anton"], "datadoc_metadata": {"pseudo_variables": []}, "metrics": [], "logs": []}' + return mock_response diff --git a/tests/data/personer_pseudonymized_default_encryption.csv b/tests/data/personer_pseudonymized_default_encryption.csv index 37baa2e8..e8e57355 100644 --- a/tests/data/personer_pseudonymized_default_encryption.csv +++ b/tests/data/personer_pseudonymized_default_encryption.csv @@ -1,4 +1,4 @@ fnr,fornavn,etternavn,kjonn,fodselsdato -AQ24fCDk0CgMxRkmKt4ok4S/Ora4E06co27zSpHVog==,Donald,Duck,M,020995 -AQ24fCDk02LuH0Bob3QmvjG0LG9UXv54mcGjjzrf+g==,Mikke,Mus,M,060970 -AQ24fCDZGzjjbKI2B9s0FOsEXSP33OrssHQXonYVcr8=,Anton,Duck,M,180999 +AWIRfKLSNfR0ID+wBzogEcUT7JQPayk7Gosij6SXr8s=,Donald,Duck,M,020995 +AWIRfKKLagk0LqYCKpiC4xfPkHqIWGVfc3wg5gUwRNE=,Mikke,Mus,M,060970 +AWIRfKIzL1T9iZqt+pLjNbHMsLa0aKSszsRrLiLSAAg=,Anton,Duck,M,180999 diff --git a/tests/integration/test_integration_deseudonymize.py b/tests/integration/test_integration_deseudonymize.py index e2a878d8..17a313b9 100644 --- a/tests/integration/test_integration_deseudonymize.py +++ b/tests/integration/test_integration_deseudonymize.py @@ -3,8 +3,6 @@ import polars as pl from dapla_pseudo import Depseudonymize -from tests.integration.utils import df_personer -from tests.integration.utils import df_personer_fnr_daead_encrypted from tests.integration.utils import integration_test from tests.integration.utils import setup diff --git a/tests/integration/test_integration_pseudonymize.py 
b/tests/integration/test_integration_pseudonymize.py index b9f2793b..5790bdd3 100644 --- a/tests/integration/test_integration_pseudonymize.py +++ b/tests/integration/test_integration_pseudonymize.py @@ -11,8 +11,6 @@ import pytest from dapla_pseudo import Pseudonymize -from tests.integration.utils import df_personer -from tests.integration.utils import df_personer_fnr_daead_encrypted from tests.integration.utils import get_calling_function_name from tests.integration.utils import get_expected_datadoc_metadata_container from tests.integration.utils import integration_test diff --git a/tests/integration/test_integration_result.py b/tests/integration/test_integration_result.py index 7c985208..b321ee2a 100644 --- a/tests/integration/test_integration_result.py +++ b/tests/integration/test_integration_result.py @@ -8,12 +8,7 @@ import pytest from dapla_pseudo import Pseudonymize -from tests.integration.utils import df_pandas_personer_fnr_daead_encrypted -from tests.integration.utils import df_personer -from tests.integration.utils import df_personer_fnr_daead_encrypted -from tests.integration.utils import df_personer_pandas from tests.integration.utils import integration_test -from tests.integration.utils import personer_file_path from tests.integration.utils import setup diff --git a/tests/integration/test_integration_validate.py b/tests/integration/test_integration_validate.py index 08551540..02d9b770 100644 --- a/tests/integration/test_integration_validate.py +++ b/tests/integration/test_integration_validate.py @@ -3,7 +3,6 @@ import polars as pl from dapla_pseudo import Validator -from tests.integration.utils import df_personer from tests.integration.utils import integration_test from tests.integration.utils import setup diff --git a/tests/integration/utils.py b/tests/integration/utils.py index d3f737de..d76da3ca 100644 --- a/tests/integration/utils.py +++ b/tests/integration/utils.py @@ -11,71 +11,6 @@ from datadoc_model.model import MetadataContainer -@pytest.fixture -def df_personer() -> pl.DataFrame: - JSON_FILE = "tests/data/personer.json" - return pl.read_json( - JSON_FILE, - schema={ - "fnr": pl.String, - "fornavn": pl.String, - "etternavn": pl.String, - "kjonn": pl.String, - "fodselsdato": pl.String, - }, - ) - - -@pytest.fixture -def df_personer_pandas() -> pd.DataFrame: - JSON_FILE = "tests/data/personer.json" - return pd.read_json( - JSON_FILE, - dtype={ - "fnr": str, - "fornavn": str, - "etternavn": str, - "kjonn": str, - "fodselsdato": str, - }, - ) - - -@pytest.fixture -def personer_file_path() -> str: - return "tests/data/personer.json" - - -@pytest.fixture -def df_personer_fnr_daead_encrypted() -> pl.DataFrame: - JSON_FILE = "tests/data/personer_pseudonymized_default_encryption.json" - return pl.read_json( - JSON_FILE, - schema={ - "fnr": pl.String, - "fornavn": pl.String, - "etternavn": pl.String, - "kjonn": pl.String, - "fodselsdato": pl.String, - }, - ) - - -@pytest.fixture -def df_pandas_personer_fnr_daead_encrypted() -> pd.DataFrame: - JSON_FILE = "tests/data/personer_pseudonymized_default_encryption.json" - return pd.read_json( - JSON_FILE, - dtype={ - "fnr": str, - "fornavn": str, - "etternavn": str, - "kjonn": str, - "fodselsdato": str, - }, - ) - - def integration_test() -> pytest.MarkDecorator: # Tests annotated with integration_test will run if `INTEGRATION_TESTS` env variable is unset or `TRUE` # This is used to disable integration tests in the `test.yaml` workflow, since these tests need additional configuration. 
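Note: the fixtures shown above now live in `tests/conftest.py`, so pytest injects them into any test under `tests/` by matching the parameter name; the explicit `from tests.integration.utils import ...` lines removed in these hunks are therefore no longer needed. A minimal sketch of how a test module picks up one of these shared fixtures (the module and test names here are hypothetical and for illustration only; only the fixture name `df_personer` comes from the change above):

```python
# tests/v1/test_example.py -- illustrative only, not part of this change.
# No import is needed: pytest resolves `df_personer` from tests/conftest.py
# by matching the parameter name against the fixture name.
import polars as pl


def test_personer_has_expected_columns(df_personer: pl.DataFrame) -> None:
    # The conftest fixture reads tests/data/personer.json with a string schema,
    # so these columns should be present.
    assert {"fnr", "fornavn", "etternavn"} <= set(df_personer.columns)
```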
diff --git a/tests/v1/test_depseudo.py b/tests/v1/test_depseudo.py index a48ca4af..12a869f0 100644 --- a/tests/v1/test_depseudo.py +++ b/tests/v1/test_depseudo.py @@ -35,44 +35,6 @@ TEST_FILE_PATH = "tests/v1/test_files" -@pytest.fixture() -def pseudonymized_df() -> pd.DataFrame: - with open("tests/data/personer_pseudonymized_default_encryption.csv") as test_data: - return pd.read_csv(test_data) - - -@pytest.fixture() -def pseudonymized_df_polars() -> pl.DataFrame: - dtypes = { - "fnr": pl.Utf8, - "fornavn": pl.Utf8, - "etternavn": pl.Utf8, - "kjonn": pl.Categorical, - "fodselsdato": pl.Utf8, - } - return pl.read_csv( - "tests/data/personer_pseudonymized_default_encryption.csv", dtypes=dtypes - ) - - -@pytest.fixture() -def json_pseudonymized_file_path() -> str: - return "tests/data/personer_pseudonymized_default_encryption.json" - - -@pytest.fixture() -def json_pseudonymized_hierarch_file_path() -> str: - return "tests/data/personer_hierarchical_pseudonymized.json" - - -@pytest.fixture() -def single_field_response() -> MagicMock: - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.content = b'{"data": ["f1","f2","f3"], "datadoc_metadata": {"pseudo_variables": []}, "metrics": [], "logs": []}' - return mock_response - - def mock_return_pseudonymize_operation_field( patch_pseudonymize_operation_field: Mock, ) -> None: @@ -85,7 +47,7 @@ def mock_return_pseudonymize_operation_field( @patch("dapla_pseudo.v1.PseudoClient._post_to_field_endpoint") def test_builder_pandas_depseudonymize_minimal_call( patched_post_to_field_endpoint: Mock, - pseudonymized_df: pd.DataFrame, + df_personer_fnr_daead_encrypted: pl.DataFrame, single_field_response: MagicMock, ) -> None: field_name = "fornavn" @@ -93,7 +55,7 @@ def test_builder_pandas_depseudonymize_minimal_call( patched_post_to_field_endpoint.return_value = single_field_response pseudo_result = ( - Depseudonymize.from_pandas(pseudonymized_df) + Depseudonymize.from_polars(df_personer_fnr_daead_encrypted) .on_fields(field_name) .with_default_encryption() .run() @@ -102,9 +64,8 @@ def test_builder_pandas_depseudonymize_minimal_call( pseudo_dataframe = pseudo_result.to_pandas() # TODO: Test for metadata values - # Check that the pseudonymized df has new values - assert pseudo_dataframe[field_name].tolist() == ["f1", "f2", "f3"] + assert pseudo_dataframe[field_name].tolist() == ["Donald", "Mikke", "Anton"] @patch("dapla_pseudo.v1.PseudoClient._post_to_field_endpoint") @@ -125,27 +86,31 @@ def test_single_field_pseudonymize_operation_field( TIMEOUT_DEFAULT, PseudoClient(pseudo_service_url="mock_url", auth_token="mock_token"), ) - assert data.to_list() == ["f1", "f2", "f3"] + assert data.to_list() == ["Donald", "Mikke", "Anton"] -def test_depseudo_fields_selector_single_field(pseudonymized_df: pd.DataFrame) -> None: - Depseudonymize.from_pandas(pseudonymized_df).on_fields("fornavn")._fields = [ +def test_depseudo_fields_selector_single_field( + df_personer_fnr_daead_encrypted: pl.DataFrame, +) -> None: + assert Depseudonymize.from_polars(df_personer_fnr_daead_encrypted).on_fields( "fornavn" - ] + )._fields == ["fornavn"] -def test_builder_fields_selector_single_field_polars( - pseudonymized_df_polars: pl.DataFrame, +def test_builder_fields_selector_single_field_pandas( + df_pandas_personer_fnr_daead_encrypted: pd.DataFrame, ) -> None: - Depseudonymize.from_polars(pseudonymized_df_polars).on_fields("fornavn")._fields = [ + assert Depseudonymize.from_pandas(df_pandas_personer_fnr_daead_encrypted).on_fields( "fornavn" - ] + )._fields == 
["fornavn"] def test_builder_fields_selector_multiple_fields( - pseudonymized_df: pd.DataFrame, + df_personer_fnr_daead_encrypted: pl.DataFrame, ) -> None: - Depseudonymize.from_pandas(pseudonymized_df).on_fields("fornavn", "fnr")._fields = [ + assert Depseudonymize.from_polars(df_personer_fnr_daead_encrypted).on_fields( + "fornavn", "fnr" + )._fields == [ "fornavn", "fnr", ] @@ -153,13 +118,14 @@ def test_builder_fields_selector_multiple_fields( @patch(f"{PKG}.pseudo_operation_file") def test_builder_file_default( - patched_pseudo_operation_file: MagicMock, json_pseudonymized_file_path: str + patched_pseudo_operation_file: MagicMock, personer_pseudonymized_file_path: str ) -> None: mock_pseudo_file_response = Mock() mock_pseudo_file_response.data = File(file_handle=Mock(), content_type=Mock()) print(type(mock_pseudo_file_response.data)) patched_pseudo_operation_file.return_value = mock_pseudo_file_response - Depseudonymize.from_file(json_pseudonymized_file_path).on_fields( + + Depseudonymize.from_file(personer_pseudonymized_file_path).on_fields( "fornavn" ).with_default_encryption().run() @@ -191,10 +157,11 @@ def test_builder_file_default( @patch(f"{PKG}.pseudo_operation_file") def test_builder_file_hierarchical( - patched_pseudo_operation_file: MagicMock, json_pseudonymized_hierarch_file_path: str + patched_pseudo_operation_file: MagicMock, + personer_pseudonymized_hierarch_file_path: str, ) -> None: patched_pseudo_operation_file.return_value = Mock() - Depseudonymize.from_file(json_pseudonymized_hierarch_file_path).on_fields( + Depseudonymize.from_file(personer_pseudonymized_hierarch_file_path).on_fields( "person_info/fnr" ).with_default_encryption().run() @@ -226,16 +193,16 @@ def test_builder_file_hierarchical( @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_pseudo_function_selector_default( - patch_pseudonymize_operation_field: MagicMock, pseudonymized_df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) - Depseudonymize.from_pandas(pseudonymized_df).on_fields( + Depseudonymize.from_polars(df_personer).on_fields( "fornavn" ).with_default_encryption().run() patch_pseudonymize_operation_field.assert_called_once_with( path="depseudonymize/field", field_name="fornavn", - values=pseudonymized_df["fornavn"].tolist(), + values=df_personer["fornavn"].to_list(), pseudo_func=PseudoFunction( function_type=PseudoFunctionTypes.DAEAD, kwargs=DaeadKeywordArgs() ), @@ -247,15 +214,15 @@ def test_builder_pseudo_function_selector_default( @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_pseudo_function_selector_fpe( - patch_pseudonymize_operation_field: MagicMock, pseudonymized_df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) - Depseudonymize.from_pandas(pseudonymized_df).on_fields( + Depseudonymize.from_polars(df_personer).on_fields( "fnr" ).with_papis_compatible_encryption().run() patch_pseudonymize_operation_field.assert_called_once_with( path="depseudonymize/field", - values=pseudonymized_df["fnr"].tolist(), + values=df_personer["fnr"].to_list(), field_name="fnr", pseudo_func=PseudoFunction( function_type=PseudoFunctionTypes.FF31, kwargs=FF31KeywordArgs() @@ -268,19 +235,19 @@ def test_builder_pseudo_function_selector_fpe( @patch(f"{PKG}.pseudonymize_operation_field") def 
test_builder_pseudo_function_selector_custom( - patch_pseudonymize_operation_field: MagicMock, pseudonymized_df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) pseudo_func = PseudoFunction( function_type=PseudoFunctionTypes.FF31, kwargs=FF31KeywordArgs() ) - Depseudonymize.from_pandas(pseudonymized_df).on_fields("fnr").with_custom_function( + Depseudonymize.from_polars(df_personer).on_fields("fnr").with_custom_function( pseudo_func ).run() patch_pseudonymize_operation_field.assert_called_once_with( path="depseudonymize/field", - values=pseudonymized_df["fnr"].tolist(), + values=df_personer["fnr"].to_list(), field_name="fnr", pseudo_func=pseudo_func, timeout=TIMEOUT_DEFAULT, @@ -291,20 +258,20 @@ def test_builder_pseudo_function_selector_custom( @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_pseudo_function_selector_redact( - patch_pseudonymize_operation_field: MagicMock, pseudonymized_df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) pseudo_func = PseudoFunction( function_type=PseudoFunctionTypes.REDACT, kwargs=RedactArgs(replacement_string="test"), ) - Depseudonymize.from_pandas(pseudonymized_df).on_fields("fnr").with_custom_function( + Depseudonymize.from_polars(df_personer).on_fields("fnr").with_custom_function( pseudo_func ).run() patch_pseudonymize_operation_field.assert_called_once_with( path="depseudonymize/field", - values=pseudonymized_df["fnr"].tolist(), + values=df_personer["fnr"].to_list(), field_name="fnr", pseudo_func=pseudo_func, timeout=TIMEOUT_DEFAULT, @@ -315,7 +282,7 @@ def test_builder_pseudo_function_selector_redact( @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_pseudo_keyset_selector_custom( - patch_pseudonymize_operation_field: MagicMock, pseudonymized_df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) @@ -340,13 +307,13 @@ def test_builder_pseudo_keyset_selector_custom( kek_uri=kek_uri, encrypted_keyset=encrypted_keyset, keyset_info=keyset_info ) - Depseudonymize.from_pandas(pseudonymized_df).on_fields("fnr").with_custom_function( + Depseudonymize.from_polars(df_personer).on_fields("fnr").with_custom_function( pseudo_func ).run(custom_keyset=keyset) patch_pseudonymize_operation_field.assert_called_once_with( path="depseudonymize/field", - values=pseudonymized_df["fnr"].tolist(), + values=df_personer["fnr"].to_list(), field_name="fnr", pseudo_func=pseudo_func, timeout=TIMEOUT_DEFAULT, @@ -357,7 +324,7 @@ def test_builder_pseudo_keyset_selector_custom( @patch(f"{PKG}.pseudonymize_operation_field") def test_pseudonymize_field_dataframe_setup( - patch_pseudonymize_operation_field: MagicMock, pseudonymized_df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: def side_effect(**kwargs: t.Any) -> tuple[pl.Series, RawPseudoMetadata]: name = kwargs["field_name"] @@ -370,7 +337,7 @@ def side_effect(**kwargs: t.Any) -> tuple[pl.Series, RawPseudoMetadata]: fields_to_pseudonymize = "fnr", "fornavn", "etternavn" result = ( - Depseudonymize.from_pandas(pseudonymized_df) + Depseudonymize.from_polars(df_personer) .on_fields(*fields_to_pseudonymize) .with_default_encryption() .run() @@ -382,9 +349,9 @@ def 
side_effect(**kwargs: t.Any) -> tuple[pl.Series, RawPseudoMetadata]: assert dataframe[field].to_list() == side_effect(field_name=field)[0].to_list() -def test_builder_field_selector_multiple_fields(pseudonymized_df: pd.DataFrame) -> None: +def test_builder_field_selector_multiple_fields(df_personer: pl.DataFrame) -> None: fields = ["snr", "snr_mor", "snr_far"] - assert Depseudonymize.from_pandas(pseudonymized_df).on_fields(*fields)._fields == [ + assert Depseudonymize.from_polars(df_personer).on_fields(*fields)._fields == [ f"{f}" for f in fields ] @@ -423,7 +390,7 @@ def test_builder_from_invalid_gcs_file() -> None: @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_to_polars_from_polars_chaining( - patch_pseudonymize_operation_field: MagicMock, pseudonymized_df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: def side_effect(**kwargs: t.Any) -> tuple[pl.Series, RawPseudoMetadata]: name = kwargs["field_name"] @@ -435,7 +402,7 @@ def side_effect(**kwargs: t.Any) -> tuple[pl.Series, RawPseudoMetadata]: patch_pseudonymize_operation_field.side_effect = side_effect fields_to_depseudonymize = "fnr", "fornavn", "etternavn" result: pl.DataFrame = ( - Depseudonymize.from_pandas(pseudonymized_df) + Depseudonymize.from_polars(df_personer) .on_fields(*fields_to_depseudonymize) .with_default_encryption() .run() diff --git a/tests/v1/test_pseudo.py b/tests/v1/test_pseudo.py index e29fee68..ab3b9d3d 100644 --- a/tests/v1/test_pseudo.py +++ b/tests/v1/test_pseudo.py @@ -1,4 +1,3 @@ -import json import typing as t from datetime import date from unittest.mock import ANY @@ -6,7 +5,6 @@ from unittest.mock import Mock from unittest.mock import patch -import pandas as pd import polars as pl import pytest from google.auth.exceptions import DefaultCredentialsError @@ -39,36 +37,6 @@ TEST_FILE_PATH = "tests/v1/test_files" -@pytest.fixture() -def df() -> pd.DataFrame: - with open("tests/data/personer.json") as test_data: - return pd.json_normalize(json.load(test_data)) - - -@pytest.fixture() -def df_polars() -> pl.DataFrame: - with open("tests/data/personer.json") as test_data: - return pl.from_pandas(pd.json_normalize(json.load(test_data))) - - -@pytest.fixture() -def json_file_path() -> str: - return "tests/data/personer.json" - - -@pytest.fixture() -def json_hierarch_file_path() -> str: - return "tests/data/personer_hierarchical.json" - - -@pytest.fixture() -def single_field_response() -> MagicMock: - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.content = b'{"data": ["f1","f2","f3"], "datadoc_metadata": {"pseudo_variables": []}, "metrics": [], "logs": []}' - return mock_response - - def mock_return_pseudonymize_operation_field( patch_pseudonymize_operation_field: Mock, ) -> None: @@ -81,7 +49,7 @@ def mock_return_pseudonymize_operation_field( @patch("dapla_pseudo.v1.PseudoClient._post_to_field_endpoint") def test_builder_pandas_pseudonymize_minimal_call( patched_post_to_field_endpoint: Mock, - df: pd.DataFrame, + df_personer: pl.DataFrame, single_field_response: MagicMock, ) -> None: field_name = "fornavn" @@ -89,7 +57,7 @@ def test_builder_pandas_pseudonymize_minimal_call( patched_post_to_field_endpoint.return_value = single_field_response pseudo_result = ( - Pseudonymize.from_pandas(df) + Pseudonymize.from_polars(df_personer) .on_fields(field_name) .with_default_encryption() .run() @@ -98,7 +66,7 @@ def test_builder_pandas_pseudonymize_minimal_call( pseudo_dataframe = pseudo_result.to_pandas() # Check that the 
pseudonymized df has new values - assert pseudo_dataframe[field_name].tolist() == ["f1", "f2", "f3"] + assert pseudo_dataframe[field_name].tolist() == ["Donald", "Mikke", "Anton"] @patch("dapla_pseudo.v1.PseudoClient._post_to_field_endpoint") @@ -121,19 +89,23 @@ def test_single_field_do_pseudonymize_field( TIMEOUT_DEFAULT, test_client, ) - assert series.to_list() == ["f1", "f2", "f3"] + assert series.to_list() == ["Donald", "Mikke", "Anton"] -def test_builder_fields_selector_single_field(df: pd.DataFrame) -> None: - Pseudonymize.from_pandas(df).on_fields("fornavn")._fields = ["fornavn"] +def test_builder_fields_selector_single_field(df_personer: pl.DataFrame) -> None: + assert Pseudonymize.from_polars(df_personer).on_fields("fornavn")._fields == [ + "fornavn" + ] -def test_builder_fields_selector_single_field_polars(df_polars: pl.DataFrame) -> None: - Pseudonymize.from_polars(df_polars).on_fields("fornavn")._fields = ["fornavn"] +def test_builder_fields_selector_single_field_polars(df_personer: pl.DataFrame) -> None: + Pseudonymize.from_polars(df_personer).on_fields("fornavn")._fields = ["fornavn"] -def test_builder_fields_selector_multiple_fields(df: pd.DataFrame) -> None: - Pseudonymize.from_pandas(df).on_fields("fornavn", "fnr")._fields = [ +def test_builder_fields_selector_multiple_fields(df_personer: pl.DataFrame) -> None: + assert Pseudonymize.from_polars(df_personer).on_fields( + "fornavn", "fnr" + )._fields == [ "fornavn", "fnr", ] @@ -141,10 +113,10 @@ def test_builder_fields_selector_multiple_fields(df: pd.DataFrame) -> None: @patch(f"{PKG}.pseudo_operation_file") def test_builder_file_default( - patched_pseudo_operation_file: MagicMock, json_file_path: str + patched_pseudo_operation_file: MagicMock, personer_file_path: str ) -> None: patched_pseudo_operation_file.return_value = Mock() - Pseudonymize.from_file(json_file_path).on_fields( + Pseudonymize.from_file(personer_file_path).on_fields( "fornavn" ).with_default_encryption().run() @@ -176,10 +148,10 @@ def test_builder_file_default( @patch(f"{PKG}.pseudo_operation_file") def test_builder_file_hierarchical( - patched_pseudonymize_file: MagicMock, json_hierarch_file_path: str + patched_pseudonymize_file: MagicMock, personer_hierarch_file_path: str ) -> None: patched_pseudonymize_file.return_value = Mock() - Pseudonymize.from_file(json_hierarch_file_path).on_fields( + Pseudonymize.from_file(personer_hierarch_file_path).on_fields( "person_info/fnr" ).with_default_encryption().run() @@ -212,14 +184,16 @@ def test_builder_file_hierarchical( @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_pseudo_function_selector_default( patch_pseudonymize_operation_field: MagicMock, - df: pd.DataFrame, + df_personer: pl.DataFrame, ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) - Pseudonymize.from_pandas(df).on_fields("fornavn").with_default_encryption().run() + Pseudonymize.from_polars(df_personer).on_fields( + "fornavn" + ).with_default_encryption().run() patch_pseudonymize_operation_field.assert_called_once_with( path="pseudonymize/field", field_name="fornavn", - values=df["fornavn"].tolist(), + values=df_personer["fornavn"].to_list(), pseudo_func=PseudoFunction( function_type=PseudoFunctionTypes.DAEAD, kwargs=DaeadKeywordArgs() ), @@ -231,13 +205,13 @@ def test_builder_pseudo_function_selector_default( @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_pseudo_function_selector_with_sid( - patch_pseudonymize_operation_field: MagicMock, df: pd.DataFrame + 
patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) - Pseudonymize.from_pandas(df).on_fields("fnr").with_stable_id().run() + Pseudonymize.from_polars(df_personer).on_fields("fnr").with_stable_id().run() patch_pseudonymize_operation_field.assert_called_once_with( path="pseudonymize/field", - values=df["fnr"].tolist(), + values=df_personer["fnr"].to_list(), field_name="fnr", pseudo_func=PseudoFunction( function_type=PseudoFunctionTypes.MAP_SID, kwargs=MapSidKeywordArgs() @@ -250,15 +224,15 @@ def test_builder_pseudo_function_selector_with_sid( @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_pseudo_function_with_sid_snapshot_date_string( - patch_pseudonymize_operation_field: MagicMock, df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) - Pseudonymize.from_pandas(df).on_fields("fnr").with_stable_id( + Pseudonymize.from_polars(df_personer).on_fields("fnr").with_stable_id( sid_snapshot_date=convert_to_date("2023-05-21") ).run() patch_pseudonymize_operation_field.assert_called_once_with( path="pseudonymize/field", - values=df["fnr"].tolist(), + values=df_personer["fnr"].to_list(), field_name="fnr", pseudo_func=PseudoFunction( function_type=PseudoFunctionTypes.MAP_SID, @@ -272,15 +246,15 @@ def test_builder_pseudo_function_with_sid_snapshot_date_string( @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_pseudo_function_with_sid_snapshot_date_date( - patch_pseudonymize_operation_field: MagicMock, df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) - Pseudonymize.from_pandas(df).on_fields("fnr").with_stable_id( + Pseudonymize.from_polars(df_personer).on_fields("fnr").with_stable_id( sid_snapshot_date=date.fromisoformat("2023-05-21") ).run() patch_pseudonymize_operation_field.assert_called_once_with( path="pseudonymize/field", - values=df["fnr"].tolist(), + values=df_personer["fnr"].to_list(), field_name="fnr", pseudo_func=PseudoFunction( function_type=PseudoFunctionTypes.MAP_SID, @@ -294,15 +268,15 @@ def test_builder_pseudo_function_with_sid_snapshot_date_date( @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_pseudo_function_selector_fpe( - patch_pseudonymize_operation_field: MagicMock, df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) - Pseudonymize.from_pandas(df).on_fields( + Pseudonymize.from_polars(df_personer).on_fields( "fnr" ).with_papis_compatible_encryption().run() patch_pseudonymize_operation_field.assert_called_once_with( path="pseudonymize/field", - values=df["fnr"].tolist(), + values=df_personer["fnr"].to_list(), field_name="fnr", pseudo_func=PseudoFunction( function_type=PseudoFunctionTypes.FF31, kwargs=FF31KeywordArgs() @@ -315,19 +289,19 @@ def test_builder_pseudo_function_selector_fpe( @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_pseudo_function_selector_custom( - patch_pseudonymize_operation_field: MagicMock, df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) pseudo_func = PseudoFunction( 
function_type=PseudoFunctionTypes.FF31, kwargs=FF31KeywordArgs() ) - Pseudonymize.from_pandas(df).on_fields("fnr").with_custom_function( + Pseudonymize.from_polars(df_personer).on_fields("fnr").with_custom_function( pseudo_func ).run() patch_pseudonymize_operation_field.assert_called_once_with( path="pseudonymize/field", - values=df["fnr"].tolist(), + values=df_personer["fnr"].to_list(), field_name="fnr", pseudo_func=pseudo_func, timeout=TIMEOUT_DEFAULT, @@ -338,20 +312,20 @@ def test_builder_pseudo_function_selector_custom( @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_pseudo_function_selector_redact( - patch_pseudonymize_operation_field: MagicMock, df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) pseudo_func = PseudoFunction( function_type=PseudoFunctionTypes.REDACT, kwargs=RedactArgs(replacement_string="test"), ) - Pseudonymize.from_pandas(df).on_fields("fnr").with_custom_function( + Pseudonymize.from_polars(df_personer).on_fields("fnr").with_custom_function( pseudo_func ).run() patch_pseudonymize_operation_field.assert_called_once_with( path="pseudonymize/field", - values=df["fnr"].tolist(), + values=df_personer["fnr"].to_list(), field_name="fnr", pseudo_func=pseudo_func, timeout=TIMEOUT_DEFAULT, @@ -362,7 +336,7 @@ def test_builder_pseudo_function_selector_redact( @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_pseudo_keyset_selector_custom( - patch_pseudonymize_operation_field: MagicMock, df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: mock_return_pseudonymize_operation_field(patch_pseudonymize_operation_field) @@ -387,13 +361,13 @@ def test_builder_pseudo_keyset_selector_custom( kek_uri=kek_uri, encrypted_keyset=encrypted_keyset, keyset_info=keyset_info ) - Pseudonymize.from_pandas(df).on_fields("fnr").with_custom_function(pseudo_func).run( - custom_keyset=keyset - ) + Pseudonymize.from_polars(df_personer).on_fields("fnr").with_custom_function( + pseudo_func + ).run(custom_keyset=keyset) patch_pseudonymize_operation_field.assert_called_once_with( path="pseudonymize/field", - values=df["fnr"].tolist(), + values=df_personer["fnr"].to_list(), field_name="fnr", pseudo_func=pseudo_func, timeout=TIMEOUT_DEFAULT, @@ -404,7 +378,7 @@ def test_builder_pseudo_keyset_selector_custom( @patch(f"{PKG}.pseudonymize_operation_field") def test_pseudonymize_field_dataframe_setup( - patch_pseudonymize_operation_field: MagicMock, df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: def side_effect(**kwargs: t.Any) -> tuple[pl.Series, RawPseudoMetadata]: name = kwargs["field_name"] @@ -416,7 +390,7 @@ def side_effect(**kwargs: t.Any) -> tuple[pl.Series, RawPseudoMetadata]: fields_to_pseudonymize = "fnr", "fornavn", "etternavn" result = ( - Pseudonymize.from_pandas(df) + Pseudonymize.from_polars(df_personer) .on_fields(*fields_to_pseudonymize) .with_default_encryption() .run() @@ -428,9 +402,9 @@ def side_effect(**kwargs: t.Any) -> tuple[pl.Series, RawPseudoMetadata]: assert dataframe[field].to_list() == side_effect(field_name=field)[0].to_list() -def test_builder_field_selector_multiple_fields(df: pd.DataFrame) -> None: +def test_builder_field_selector_multiple_fields(df_personer: pl.DataFrame) -> None: fields = ["snr", "snr_mor", "snr_far"] - assert Pseudonymize.from_pandas(df).on_fields(*fields)._fields == [ + assert 
Pseudonymize.from_polars(df_personer).on_fields(*fields)._fields == [ f"{f}" for f in fields ] @@ -469,7 +443,7 @@ def test_builder_from_invalid_gcs_file() -> None: @patch(f"{PKG}.pseudonymize_operation_field") def test_builder_to_polars_from_polars_chaining( - patch_pseudonymize_operation_field: MagicMock, df: pd.DataFrame + patch_pseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame ) -> None: def side_effect(**kwargs: t.Any) -> tuple[pl.Series, RawPseudoMetadata]: name = kwargs["field_name"] @@ -480,7 +454,7 @@ def side_effect(**kwargs: t.Any) -> tuple[pl.Series, RawPseudoMetadata]: patch_pseudonymize_operation_field.side_effect = side_effect fields_to_pseudonymize = "fnr", "fornavn", "etternavn" result: pl.DataFrame = ( - Pseudonymize.from_pandas(df) + Pseudonymize.from_polars(df_personer) .on_fields(*fields_to_pseudonymize) .with_default_encryption() .on_fields("fnr") diff --git a/tests/v1/test_result.py b/tests/v1/test_result.py index 02a04e33..0965388b 100644 --- a/tests/v1/test_result.py +++ b/tests/v1/test_result.py @@ -13,12 +13,6 @@ from dapla_pseudo.v1.result import Result -@pytest.fixture() -def polars_df() -> pl.DataFrame: - with open("tests/data/personer.json") as test_data: - return pl.from_pandas(pd.json_normalize(json.load(test_data))) - - @pytest.fixture() def pseudo_file_response() -> PseudoFileResponse: fd = open("tests/data/personer.json") @@ -64,18 +58,18 @@ def test_result_index_level(tmp_path: Path) -> None: assert "__index_level_0__" not in df_result.columns -def test_result_from_polars_to_polars(polars_df: pl.DataFrame) -> None: - result = Result(PseudoFieldResponse(data=polars_df, raw_metadata=[])) +def test_result_from_polars_to_polars(df_personer: pl.DataFrame) -> None: + result = Result(PseudoFieldResponse(data=df_personer, raw_metadata=[])) assert isinstance(result.to_polars(), pl.DataFrame) -def test_result_from_polars_to_pandas(polars_df: pl.DataFrame) -> None: - result = Result(PseudoFieldResponse(data=polars_df, raw_metadata=[])) +def test_result_from_polars_to_pandas(df_personer: pl.DataFrame) -> None: + result = Result(PseudoFieldResponse(data=df_personer, raw_metadata=[])) assert isinstance(result.to_pandas(), pd.DataFrame) -def test_result_from_polars_to_file(tmp_path: Path, polars_df: pl.DataFrame) -> None: - result = Result(PseudoFieldResponse(data=polars_df, raw_metadata=[])) +def test_result_from_polars_to_file(tmp_path: Path, df_personer: pl.DataFrame) -> None: + result = Result(PseudoFieldResponse(data=df_personer, raw_metadata=[])) result.to_file(tmp_path / "polars_to_file.json") diff --git a/tests/v1/test_stable_id.py b/tests/v1/test_stable_id.py index abebd548..e016db1a 100644 --- a/tests/v1/test_stable_id.py +++ b/tests/v1/test_stable_id.py @@ -1,4 +1,3 @@ -import json from datetime import date from unittest.mock import MagicMock from unittest.mock import Mock @@ -16,12 +15,6 @@ TEST_FILE_PATH = "tests/v1/test_files" -@pytest.fixture() -def df() -> pd.DataFrame: - with open("tests/data/personer.json") as test_data: - return pd.json_normalize(json.load(test_data)) - - @pytest.fixture() def sid_lookup_missing_response() -> MagicMock: mock_response = MagicMock() @@ -41,7 +34,7 @@ def sid_lookup_empty_response() -> MagicMock: @patch("dapla_pseudo.v1.PseudoClient._post_to_sid_endpoint") def test_validate_with_full_response( patched_post_to_sid_endpoint: Mock, - df: pd.DataFrame, + df_personer: pl.DataFrame, sid_lookup_missing_response: MagicMock, ) -> None: field_name = "fnr" @@ -49,7 +42,9 @@ def test_validate_with_full_response( 
patched_post_to_sid_endpoint.return_value = sid_lookup_missing_response validation_result = ( - Validator.from_pandas(df).on_field(field_name).validate_map_to_stable_id() + Validator.from_polars(df_personer) + .on_field(field_name) + .validate_map_to_stable_id() ) validation_df = validation_result.to_pandas() validation_metadata = validation_result.metadata @@ -67,7 +62,7 @@ def test_validate_with_full_response( @patch("dapla_pseudo.v1.PseudoClient._post_to_sid_endpoint") def test_validate_with_empty_response( patched_post_to_sid_endpoint: Mock, - df: pd.DataFrame, + df_personer: pl.DataFrame, sid_lookup_empty_response: MagicMock, ) -> None: field_name = "fnr" @@ -75,7 +70,7 @@ def test_validate_with_empty_response( patched_post_to_sid_endpoint.return_value = sid_lookup_empty_response validation_result = ( - Validator.from_pandas(df) + Validator.from_polars(df_personer) .on_field(field_name) .validate_map_to_stable_id(sid_snapshot_date=convert_to_date("2023-08-31")) ) @@ -118,5 +113,5 @@ def test_builder_from_file_with_storage_options(_mock_read_to_pandas_df: Mock) - ) -def test_builder_from_polars(df: pd.DataFrame) -> None: - Validator.from_polars(pl.from_pandas(df)) +def test_builder_from_polars(df_personer_pandas: pd.DataFrame) -> None: + Validator.from_polars(pl.from_pandas(df_personer_pandas)) diff --git a/tests/v1/test_supported_file_format.py b/tests/v1/test_supported_file_format.py index eb2b6f1a..219dd4c1 100644 --- a/tests/v1/test_supported_file_format.py +++ b/tests/v1/test_supported_file_format.py @@ -1,4 +1,3 @@ -import json from pathlib import Path import pandas as pd @@ -15,12 +14,6 @@ TEST_FILE_PATH = "tests/v1/test_files" -@pytest.fixture() -def df_polars() -> pl.DataFrame: - with open("tests/data/personer.json") as test_data: - return pl.from_pandas(pd.json_normalize(json.load(test_data))) - - def test_get_pandas_function_name_unsupported_format() -> None: # Checks that a unsupported file extension raise a value error. unsupported_format = "notsupported" @@ -55,8 +48,8 @@ def test_read_with_polars_unsupported_xml() -> None: @pytest.mark.parametrize("file_format", ["json", "csv", "parquet", "xml"]) def test_write_from_df( - tmp_path: Path, df_polars: pl.DataFrame, file_format: str + tmp_path: Path, df_personer: pl.DataFrame, file_format: str ) -> None: supported_format = SupportedOutputFileFormat(file_format) - write_from_df(df_polars, supported_format, f"{tmp_path}/test.{file_format}") + write_from_df(df_personer, supported_format, f"{tmp_path}/test.{file_format}")
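The unit tests now exercise the Polars code path: `from_pandas(df)` calls become `from_polars(df_personer)`, pandas `Series.tolist()` becomes Polars `Series.to_list()`, and the mocked field responses return the real first names ("Donald", "Mikke", "Anton") instead of the "f1"/"f2"/"f3" placeholders. A minimal, standalone sketch of the two list accessors used throughout the diff (values are illustrative):

```python
import pandas as pd
import polars as pl

names = ["Donald", "Mikke", "Anton"]

# pandas Series: .tolist() returns a plain Python list
assert pd.Series(names).tolist() == names

# Polars Series: the equivalent accessor is .to_list()
assert pl.Series("fornavn", names).to_list() == names
```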