From b0c953b9cff99880af4d9219277b5d93d450a2cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yolan=20Honor=C3=A9-Roug=C3=A9?= Date: Mon, 23 Oct 2023 23:25:49 +0200 Subject: [PATCH] :boom: :recycle: Rename MlflowMetricsDataset to MlflowMetricsHistoryDataset (#440) --- CHANGELOG.md | 19 +++++++----- README.md | 2 +- docs/source/01_introduction/02_motivation.md | 2 +- .../02_installation/03_migration_guide.md | 2 +- .../05_version_metrics.md | 14 ++++----- kedro_mlflow/framework/hooks/mlflow_hook.py | 11 ++++--- kedro_mlflow/io/metrics/__init__.py | 2 +- .../metrics/mlflow_abstract_metric_dataset.py | 2 +- .../io/metrics/mlflow_metrics_dataset.py | 4 +-- .../hooks/test_hook_deactivate_tracking.py | 2 +- .../framework/hooks/test_hook_log_metrics.py | 10 +++---- .../io/metrics/test_mlflow_metrics_dataset.py | 30 ++++++++++--------- 12 files changed, 55 insertions(+), 45 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d8fe694..022a0b72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,19 +2,24 @@ ## [Unreleased] +### Added + - :sparkles: Add support for python 3.11 ([#450, rxm7706](https://github.com/Galileo-Galilei/kedro-mlflow/pull/450)) + +### Changed + +- :boom: :sparkles: Change default ``copy_mode`` to ``"assign"`` in ``KedroPipelineModel`` because this is the most efficient setup (and usually the desired one) when serving a Kedro ``Pipeline`` as a Mlflow model. This is different from Kedro's default which is to deepcopy the dataset ([#463, ShubhamZoro](https://github.com/Galileo-Galilei/kedro-mlflow/pull/463)). +- :boom: :recycle: ``MlflowArtifactDataset.__init__`` method ``data_set`` argument is renamed ``dataset`` to match new Kedro conventions ([#391](https://github.com/Galileo-Galilei/kedro-mlflow/pull/391)). 
- :boom: :recycle: Rename the following ``DataSets`` with the ``Dataset`` suffix (without capitalized ``S``) to match new kedro conventions from ``kedro>=0.19`` and onwards ([#439, ShubhamZoro](https://github.com/Galileo-Galilei/kedro-mlflow/pull/439)): - ``MlflowArtifactDataSet``->``MlflowArtifactDataset`` - ``MlflowAbstractModelDataSet``->``MlflowAbstractModelDataset`` - ``MlflowModelRegistryDataSet``->``MlflowModelRegistryDataset`` - ``MlflowMetricDataSet``->``MlflowMetricDataset`` - ``MlflowMetricHistoryDataSet``->``MlflowMetricHistoryDataset`` - - ``MlflowMetricsDataSet``->``MlflowMetricsDataset`` -- :boom: :recycle: Rename the following ``DataSets`` to make their use more explicit, and use the ``Dataset`` suffix ([#465, ShubhamZoro](https://github.com/Galileo-Galilei/kedro-mlflow/pull/465)): - - ``MlflowModelLoggerDataSet``->``MlflowModelTrackingDataset`` - - ``MlflowModelSaverDataSet``->``MlflowModelLocalFileSystemDataset`` -- :boom: :sparkles: Change default ``copy_mode`` to ``"assign"`` in ``KedroPipelineModel`` because this is the most efficient setup (and usually the desired one) when serving a Kedro ``Pipeline`` as a Mlflow model. This is different from Kedro's default which is to deepcopy the dataset ([#463, ShubhamZoro](https://github.com/Galileo-Galilei/kedro-mlflow/pull/463)). -- :boom: :recycle: ``MlflowArtifactDataset.__init__`` method ``data_set`` argument is renamed ``dataset`` to match new Kedro conventions (). 
+- :boom: :recycle: Rename the following ``DataSets`` to make their use more explicit, and use the ``Dataset`` suffix: + - ``MlflowModelLoggerDataSet``->``MlflowModelTrackingDataset`` ([#391](https://github.com/Galileo-Galilei/kedro-mlflow/pull/391)) + - ``MlflowModelSaverDataSet``->``MlflowModelLocalFileSystemDataset`` ([#391](https://github.com/Galileo-Galilei/kedro-mlflow/pull/391)) + - ``MlflowMetricsDataSet``->``MlflowMetricsHistoryDataset`` ([#440](https://github.com/Galileo-Galilei/kedro-mlflow/pull/440)) ## [0.11.10] - 2023-10-03 @@ -334,7 +339,7 @@ - `get_mlflow_config` now works in interactive mode if `load_context` is called with a path different from the working directory ([#30](https://github.com/Galileo-Galilei/kedro-mlflow/issues/30)) - ``kedro_mlflow`` now works fine with ``kedro jupyter notebook`` independently of the working directory ([#64](https://github.com/Galileo-Galilei/kedro-mlflow/issues/64)) - You can use global variables in `mlflow.yml` which is now properly parsed if you use a `TemplatedConfigLoader` ([#72](https://github.com/Galileo-Galilei/kedro-mlflow/issues/72)) -- :bug: `MlflowMetricsDataset` now saves in the specified `run_id` instead of the current one when the prefix is not specified ([#62](https://github.com/Galileo-Galilei/kedro-mlflow/issues/62)) +- :bug: `MlflowMetricsHistoryDataset` now saves in the specified `run_id` instead of the current one when the prefix is not specified ([#62](https://github.com/Galileo-Galilei/kedro-mlflow/issues/62)) - :memo: Other bug fixes and documentation improvements ([#6](https://github.com/Galileo-Galilei/kedro-mlflow/issues/6), [#99](https://github.com/Galileo-Galilei/kedro-mlflow/issues/99)) ### Changed diff --git a/README.md b/README.md index 26042513..98440739 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ The documentation contains: Some frequently asked questions on more advanced features: -- You want to log additional metrics to the run? 
-> [Try ``MlflowMetricsDataset``](https://kedro-mlflow.readthedocs.io/en/stable/source/04_experimentation_tracking/05_version_metrics.html) ! +- You want to log additional metrics to the run? -> [Try ``MlflowMetricsHistoryDataset``](https://kedro-mlflow.readthedocs.io/en/stable/source/04_experimentation_tracking/05_version_metrics.html) ! - You want to log nice dataviz of your pipeline that you register with ``MatplotlibWriter``? -> [Try ``MlflowArtifactDataset`` to log any local files (.png, .pkl, .csv...) *automagically*](https://kedro-mlflow.readthedocs.io/en/stable/source/04_experimentation_tracking/03_version_datasets.html)! - You want to create easily an API to share your awesome model to anyone? -> [See if ``pipeline_ml_factory`` can fit your needs](https://github.com/Galileo-Galilei/kedro-mlflow/issues/16) - You want to do something that is not straigthforward with current implementation? Open an issue, and let's see what happens! diff --git a/docs/source/01_introduction/02_motivation.md b/docs/source/01_introduction/02_motivation.md index 389a912b..4bf199a6 100644 --- a/docs/source/01_introduction/02_motivation.md +++ b/docs/source/01_introduction/02_motivation.md @@ -41,7 +41,7 @@ Above implementations have the advantage of being very straightforward and *mlfl | Logging parameters | ``mlflow.yml`` | ``MlflowHook`` | | Logging artifacts | ``catalog.yml`` | ``MlflowArtifactDataset`` | | Logging models | ``catalog.yml`` | `MlflowModelTrackingDataset` and `MlflowModelLocalFileSystemDataset` | -| Logging metrics | ``catalog.yml`` | ``MlflowMetricsDataset`` | +| Logging metrics | ``catalog.yml`` | ``MlflowMetricsHistoryDataset`` | | Logging Pipeline as model | ``hooks.py`` | ``KedroPipelineModel`` and ``pipeline_ml_factory`` | `kedro-mlflow` does not currently provide interface to set tags outside a Kedro ``Pipeline``. 
Some of above decisions are subject to debate and design decisions (for instance, metrics are often updated in a loop during each epoch / training iteration and it does not always make sense to register the metric between computation steps, e.g. as a an I/O operation after a node run). diff --git a/docs/source/02_installation/03_migration_guide.md b/docs/source/02_installation/03_migration_guide.md index 2c99d1a6..128b758e 100644 --- a/docs/source/02_installation/03_migration_guide.md +++ b/docs/source/02_installation/03_migration_guide.md @@ -101,7 +101,7 @@ Replace the following entries: | old | new | | :-------------------------------------- | :------------------------------------------------ | | `kedro_mlflow.io.MlflowArtifactDataset` | `kedro_mlflow.io.artifacts.MlflowArtifactDataset` | -| `kedro_mlflow.io.MlflowMetricsDataset` | `kedro_mlflow.io.metrics.MlflowMetricsDataset` | +| `kedro_mlflow.io.MlflowMetricsDataset` | `kedro_mlflow.io.metrics.MlflowMetricsHistoryDataset` | ### Hooks diff --git a/docs/source/04_experimentation_tracking/05_version_metrics.md b/docs/source/04_experimentation_tracking/05_version_metrics.md index 98274c82..447c41d2 100644 --- a/docs/source/04_experimentation_tracking/05_version_metrics.md +++ b/docs/source/04_experimentation_tracking/05_version_metrics.md @@ -9,7 +9,7 @@ MLflow defines a metric as "a (key, value) pair, where the value is numeric". Ea `kedro-mlflow` introduces 3 ``AbstractDataset`` to manage metrics: - ``MlflowMetricDataset`` which can log a float as a metric - ``MlflowMetricHistoryDataset`` which can log the evolution over time of a given metric, e.g. a list or a dict of float. -- ``MlflowMetricsDataset``. It is a wrapper around a dictionary with metrics which is returned by node and log metrics in MLflow. +- ``MlflowMetricsHistoryDataset``. It is a wrapper around a dictionary with metrics which is returned by node and log metrics in MLflow. 
### Saving a single float as a metric with ``MlflowMetricDataset`` @@ -148,13 +148,13 @@ my_model_metric: mode: ... # OPTIONAL: "list" by default, one of {"list", "dict", "history"} ``` -### Saving several metrics with their entire history with ``MlflowMetricsDataset`` +### Saving several metrics with their entire history with ``MlflowMetricsHistoryDataset`` Since it is an ``AbstractDataset``, it can be used with the YAML API. You can define it in your ``catalog.yml`` as: ```yaml my_model_metrics: - type: kedro_mlflow.io.metrics.MlflowMetricsDataset + type: kedro_mlflow.io.metrics.MlflowMetricsHistoryDataset ``` You can provide a prefix key, which is useful in situations like when you have multiple nodes producing metrics with the same names which you want to distinguish. If you are using the ``v``, it will handle that automatically for you by giving as prefix metrics data set name. In the example above the prefix would be ``my_model_metrics``. @@ -163,7 +163,7 @@ Let's look at an example with custom prefix: ```yaml my_model_metrics: - type: kedro_mlflow.io.metrics.MlflowMetricsDataset + type: kedro_mlflow.io.metrics.MlflowMetricsHistoryDataset prefix: foo ``` @@ -179,7 +179,7 @@ def metrics_node() -> Dict[str, Union[float, List[float]]]: } ``` -As you can see above, ``kedro_mlflow.io.metrics.MlflowMetricsDataset`` can take metrics as: +As you can see above, ``kedro_mlflow.io.metrics.MlflowMetricsHistoryDataset`` can take metrics as: - ``Dict[str, key]`` - ``List[Dict[str, key]]`` @@ -188,7 +188,7 @@ To store metrics we need to define metrics dataset in Kedro Catalog: ```yaml my_model_metrics: - type: kedro_mlflow.io.metrics.MlflowMetricsDataset + type: kedro_mlflow.io.metrics.MlflowMetricsHistoryDataset ``` Within a kedro run, the ``MlflowHook`` will automatically prefix the metrics datasets with their name in the catalog. In our example, the metrics will be stored in Mlflow with the following keys: ``my_model_metrics.metric1``, ``my_model_metrics.metric2``. 
@@ -197,7 +197,7 @@ It is also prossible to provide a prefix manually: ```yaml my_model_metrics: - type: kedro_mlflow.io.metrics.MlflowMetricsDataset + type: kedro_mlflow.io.metrics.MlflowMetricsHistoryDataset prefix: foo ``` diff --git a/kedro_mlflow/framework/hooks/mlflow_hook.py b/kedro_mlflow/framework/hooks/mlflow_hook.py index be1c26bb..e5420bb7 100644 --- a/kedro_mlflow/framework/hooks/mlflow_hook.py +++ b/kedro_mlflow/framework/hooks/mlflow_hook.py @@ -27,7 +27,7 @@ from kedro_mlflow.io.metrics import ( MlflowMetricDataset, MlflowMetricHistoryDataset, - MlflowMetricsDataset, + MlflowMetricsHistoryDataset, ) from kedro_mlflow.mlflow import KedroPipelineModel from kedro_mlflow.pipeline.pipeline_ml import PipelineML @@ -141,13 +141,16 @@ def after_catalog_created( # we use this hooks to modif "MlflowmetricsDataset" to ensure consistency # of the metric name with the catalog name for name, dataset in catalog._datasets.items(): - if isinstance(dataset, MlflowMetricsDataset) and dataset._prefix is None: + if ( + isinstance(dataset, MlflowMetricsHistoryDataset) + and dataset._prefix is None + ): if dataset._run_id is not None: - catalog._datasets[name] = MlflowMetricsDataset( + catalog._datasets[name] = MlflowMetricsHistoryDataset( run_id=dataset._run_id, prefix=name ) else: - catalog._datasets[name] = MlflowMetricsDataset(prefix=name) + catalog._datasets[name] = MlflowMetricsHistoryDataset(prefix=name) if isinstance(dataset, MlflowMetricDataset) and dataset.key is None: if dataset._run_id is not None: diff --git a/kedro_mlflow/io/metrics/__init__.py b/kedro_mlflow/io/metrics/__init__.py index 1fde8b04..fe54fadf 100644 --- a/kedro_mlflow/io/metrics/__init__.py +++ b/kedro_mlflow/io/metrics/__init__.py @@ -1,3 +1,3 @@ from .mlflow_metric_dataset import MlflowMetricDataset from .mlflow_metric_history_dataset import MlflowMetricHistoryDataset -from .mlflow_metrics_dataset import MlflowMetricsDataset +from .mlflow_metrics_dataset import MlflowMetricsHistoryDataset 
diff --git a/kedro_mlflow/io/metrics/mlflow_abstract_metric_dataset.py b/kedro_mlflow/io/metrics/mlflow_abstract_metric_dataset.py index c85b7d7c..6ac4b63c 100644 --- a/kedro_mlflow/io/metrics/mlflow_abstract_metric_dataset.py +++ b/kedro_mlflow/io/metrics/mlflow_abstract_metric_dataset.py @@ -13,7 +13,7 @@ def __init__( load_args: Dict[str, Any] = None, save_args: Dict[str, Any] = None, ): -        """Initialise MlflowMetricsDataset. +        """Initialise MlflowAbstractMetricDataset. Args: run_id (str): The ID of the mlflow run where the metric should be logged diff --git a/kedro_mlflow/io/metrics/mlflow_metrics_dataset.py b/kedro_mlflow/io/metrics/mlflow_metrics_dataset.py index 24f6a4ce..fb4d157f 100644 --- a/kedro_mlflow/io/metrics/mlflow_metrics_dataset.py +++ b/kedro_mlflow/io/metrics/mlflow_metrics_dataset.py @@ -11,7 +11,7 @@ MetricsDict = Dict[str, MetricItem] -class MlflowMetricsDataset(AbstractDataset): +class MlflowMetricsHistoryDataset(AbstractDataset): """This class represent MLflow metrics dataset.""" def __init__( @@ -19,7 +19,7 @@ def __init__( run_id: str = None, prefix: Optional[str] = None, ): -        """Initialise MlflowMetricsDataset. +        """Initialise MlflowMetricsHistoryDataset. Args: prefix (Optional[str]): Prefix for metrics logged in MLflow. 
diff --git a/tests/framework/hooks/test_hook_deactivate_tracking.py b/tests/framework/hooks/test_hook_deactivate_tracking.py index ac861541..4e4e25fd 100644 --- a/tests/framework/hooks/test_hook_deactivate_tracking.py +++ b/tests/framework/hooks/test_hook_deactivate_tracking.py @@ -63,7 +63,7 @@ def catalog_config(kedro_project_path): }, }, "metrics_data": { - "type": "kedro_mlflow.io.metrics.MlflowMetricsDataset", + "type": "kedro_mlflow.io.metrics.MlflowMetricsHistoryDataset", }, "metric_data": { "type": "kedro_mlflow.io.metrics.MlflowMetricDataset", diff --git a/tests/framework/hooks/test_hook_log_metrics.py b/tests/framework/hooks/test_hook_log_metrics.py index 5ff2024a..862fd612 100644 --- a/tests/framework/hooks/test_hook_log_metrics.py +++ b/tests/framework/hooks/test_hook_log_metrics.py @@ -12,7 +12,7 @@ from kedro_mlflow.io.metrics import ( MlflowMetricDataset, MlflowMetricHistoryDataset, - MlflowMetricsDataset, + MlflowMetricsHistoryDataset, ) @@ -105,8 +105,8 @@ def dummy_catalog(tmp_path): "params:unused_param": MemoryDataset("blah"), "data": MemoryDataset(), "model": PickleDataset((tmp_path / "model.csv").as_posix()), - "my_metrics": MlflowMetricsDataset(), - "another_metrics": MlflowMetricsDataset(prefix="foo"), + "my_metrics": MlflowMetricsHistoryDataset(), + "another_metrics": MlflowMetricsHistoryDataset(prefix="foo"), "my_metric": MlflowMetricDataset(), "another_metric": MlflowMetricDataset(key="foo"), "my_metric_history": MlflowMetricHistoryDataset(), @@ -181,8 +181,8 @@ def test_mlflow_hook_metrics_dataset_with_run_id( "model": PickleDataset( (kedro_project_with_mlflow_conf / "data" / "model.csv").as_posix() ), - "my_metrics": MlflowMetricsDataset(run_id=existing_run_id), - "another_metrics": MlflowMetricsDataset( + "my_metrics": MlflowMetricsHistoryDataset(run_id=existing_run_id), + "another_metrics": MlflowMetricsHistoryDataset( run_id=existing_run_id, prefix="foo" ), "my_metric": MlflowMetricDataset(run_id=existing_run_id), diff --git 
a/tests/io/metrics/test_mlflow_metrics_dataset.py b/tests/io/metrics/test_mlflow_metrics_dataset.py index b862a2b4..2ebece9f 100644 --- a/tests/io/metrics/test_mlflow_metrics_dataset.py +++ b/tests/io/metrics/test_mlflow_metrics_dataset.py @@ -6,7 +6,7 @@ from mlflow.tracking import MlflowClient from pytest_lazyfixture import lazy_fixture -from kedro_mlflow.io.metrics import MlflowMetricsDataset +from kedro_mlflow.io.metrics import MlflowMetricsHistoryDataset def assert_are_metrics_logged( @@ -81,12 +81,12 @@ def metrics3(): ], ) def test_mlflow_metrics_dataset_saved_and_logged(tmp_path, tracking_uri, data, prefix): - """Check if MlflowMetricsDataset can be saved in catalog when filepath is given, + """Check if MlflowMetricsHistoryDataset can be saved in catalog when filepath is given, and if logged in mlflow. """ mlflow.set_tracking_uri(tracking_uri.as_uri()) mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri()) - mlflow_metrics_dataset = MlflowMetricsDataset(prefix=prefix) + mlflow_metrics_dataset = MlflowMetricsHistoryDataset(prefix=prefix) with mlflow.start_run(): run_id = mlflow.active_run().info.run_id @@ -96,7 +96,7 @@ def test_mlflow_metrics_dataset_saved_and_logged(tmp_path, tracking_uri, data, p assert_are_metrics_logged(data, mlflow_client, run_id, prefix) # Check if metrics are stored in catalog. - catalog_metrics = MlflowMetricsDataset( + catalog_metrics = MlflowMetricsHistoryDataset( prefix=prefix, # Run id needs to be provided as there is no active run. run_id=run_id, @@ -109,14 +109,14 @@ def test_mlflow_metrics_dataset_saved_and_logged(tmp_path, tracking_uri, data, p def test_mlflow_metrics_dataset_saved_without_run_id(tmp_path, tracking_uri, metrics3): - """Check if MlflowMetricsDataset can be saved in catalog when filepath is given, + """Check if MlflowMetricsHistoryDataset can be saved in catalog when filepath is given, and if logged in mlflow. 
""" prefix = "test_metric" mlflow.set_tracking_uri(tracking_uri.as_uri()) mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri()) - mlflow_metrics_dataset = MlflowMetricsDataset(prefix=prefix) + mlflow_metrics_dataset = MlflowMetricsHistoryDataset(prefix=prefix) # a mlflow run_id is automatically created mlflow_metrics_dataset.save(metrics3) @@ -126,13 +126,13 @@ def test_mlflow_metrics_dataset_saved_without_run_id(tmp_path, tracking_uri, met def test_mlflow_metrics_dataset_exists(tmp_path, tracking_uri, metrics3): - """Check if MlflowMetricsDataset is well identified as + """Check if MlflowMetricsHistoryDataset is well identified as existing if it has already been saved. """ prefix = "test_metric" mlflow.set_tracking_uri(tracking_uri.as_uri()) - mlflow_metrics_dataset = MlflowMetricsDataset(prefix=prefix) + mlflow_metrics_dataset = MlflowMetricsHistoryDataset(prefix=prefix) # a mlflow run_id is automatically created mlflow_metrics_dataset.save(metrics3) @@ -140,7 +140,7 @@ def test_mlflow_metrics_dataset_exists(tmp_path, tracking_uri, metrics3): def test_mlflow_metrics_dataset_does_not_exist(tmp_path, tracking_uri, metrics3): - """Check if MlflowMetricsDataset is well identified as + """Check if MlflowMetricsHistoryDataset is well identified as not existingif it has never been saved. 
""" @@ -148,7 +148,9 @@ def test_mlflow_metrics_dataset_does_not_exist(tmp_path, tracking_uri, metrics3) mlflow.start_run() # starts a run toenable mlflow_metrics_dataset to know where to seacrh run_id = mlflow.active_run().info.run_id mlflow.end_run() - mlflow_metrics_dataset = MlflowMetricsDataset(prefix="test_metric", run_id=run_id) + mlflow_metrics_dataset = MlflowMetricsHistoryDataset( + prefix="test_metric", run_id=run_id + ) # a mlflow run_id is automatically created assert not mlflow_metrics_dataset.exists() @@ -156,12 +158,12 @@ def test_mlflow_metrics_dataset_does_not_exist(tmp_path, tracking_uri, metrics3) def test_mlflow_metrics_dataset_fails_with_invalid_metric( tmp_path, tracking_uri, metrics3 ): - """Check if MlflowMetricsDataset is well identified as + """Check if MlflowMetricsHistoryDataset is well identified as not existingif it has never been saved. """ mlflow.set_tracking_uri(tracking_uri.as_uri()) - mlflow_metrics_dataset = MlflowMetricsDataset(prefix="test_metric") + mlflow_metrics_dataset = MlflowMetricsHistoryDataset(prefix="test_metric") with pytest.raises( DatasetError, match="Unexpected metric value. Should be of type" @@ -172,7 +174,7 @@ def test_mlflow_metrics_dataset_fails_with_invalid_metric( def test_mlflow_metrics_logging_deactivation(tracking_uri, metrics): - mlflow_metrics_dataset = MlflowMetricsDataset(prefix="hello") + mlflow_metrics_dataset = MlflowMetricsHistoryDataset(prefix="hello") mlflow.set_tracking_uri(tracking_uri.as_uri()) mlflow_client = MlflowClient(tracking_uri=tracking_uri.as_uri()) @@ -197,7 +199,7 @@ def test_mlflow_metrics_logging_deactivation(tracking_uri, metrics): def test_mlflow_metrics_logging_deactivation_is_bool(): - mlflow_metrics_dataset = MlflowMetricsDataset(prefix="hello") + mlflow_metrics_dataset = MlflowMetricsHistoryDataset(prefix="hello") with pytest.raises(ValueError, match="_logging_activated must be a boolean"): mlflow_metrics_dataset._logging_activated = "hello"