diff --git a/.devcontainer/.dockerignore b/.devcontainer/.dockerignore deleted file mode 100644 index ba0430d..0000000 --- a/.devcontainer/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -__pycache__/ \ No newline at end of file diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile deleted file mode 100644 index b2b8052..0000000 --- a/.devcontainer/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -ARG VARIANT="3.10-bullseye" -FROM mcr.microsoft.com/vscode/devcontainers/python:0-${VARIANT} - - -# [Optional] If your pip requirements rarely change, uncomment this section to add them to the image. -COPY requirements.txt /tmp/pip-tmp/ -RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \ - && rm -rf /tmp/pip-tmp - -# [Optional] Uncomment this section to install additional OS packages. -# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ -# && apt-get -y install --no-install-recommends - -ENTRYPOINT ["streamlit", "run", "tsod/active_learning/app.py"] \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json deleted file mode 100644 index 3dc08e2..0000000 --- a/.devcontainer/devcontainer.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "name": "Python 3", - "build": { - "dockerfile": "Dockerfile", - "context": "..", - "args": { - "VARIANT": "3.10" - } - }, - "customizations": { - "vscode": { - "settings": { - "python.defaultInterpreterPath": "/usr/local/bin/python" - } - } - }, - "containerEnv": { - "PYTHONPATH": "/workspaces/tsod", - "TSOD_DEV_MODE": "true" - }, - "forwardPorts": [ - 8501 - ], - "postStartCommand": "pip3 install --user --no-cache-dir -r requirements.txt", - "remoteUser": "vscode" -} \ No newline at end of file diff --git a/.streamlit/config.toml b/.streamlit/config.toml deleted file mode 100644 index e4a68bc..0000000 --- a/.streamlit/config.toml +++ /dev/null @@ -1,8 +0,0 @@ -[server] -runOnSave = true - -[theme] -base = "dark" - -[runner] -fastReruns = true \ No newline at end of file diff --git a/Makefile b/Makefile deleted file mode 100644 index 3b3c46e..0000000 --- a/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -run_app: - streamlit run tsod/active_learning/app/1_Outlier_Annotation.py \ No newline at end of file diff --git a/README.md b/README.md index a744c7b..6429c71 100644 --- a/README.md +++ b/README.md @@ -12,14 +12,6 @@ Sensors often provide faulty or missing observations. These anomalies must be de This package aims to provide examples and algorithms for detecting anomalies in time series data specifically tailored to DHI users and the water domain. It is simple to install and deploy operationally and is accessible to everyone (open-source). -## [Active learning web application](https://github.com/DHI/tsod/blob/main/tsod/active_learning/) - -The web application is developed using [Streamlit](https://streamlit.io/). All requirements needed to run this application can be found [here](https://github.com/DHI/tsod/blob/main/.devcontainer/Dockerfile). - -![](images/active_learning_app.png) - - - ## Getting Started * [Documentation](https://dhi.github.io/tsod/getting_started.html) @@ -39,6 +31,11 @@ Or development version: `pip install https://github.com/DHI/tsod/archive/main.zip` +## [Active learning web application](https://github.com/DHI/tsod/blob/main/tsod/active_learning/) + +There is a web application that is developed using [Streamlit](https://streamlit.io/). It can be found [here]() + + ## Vision * A simple and consistent API for anomaly detection of timeseries * The computational speed will be good for typical timeseries data found in the water domain, to support realtime detection @@ -59,4 +56,3 @@ The training data is considered "normal" and is not polluted by outliers. New te - Follow PEP8 code style. This is automatically checked during Pull Requests. - If citing or re-using other code please make sure their license is also consistent with our policy. - diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index b1aff7f..0000000 --- a/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -pandas>=1.0.0 -#numba -joblib -streamlit -pytest>=6.2.1 -plotly -streamlit-echarts -streamlit_profiler -streamlit-option-menu -ipykernel -scikit-learn -mikeio -openpyxl \ No newline at end of file diff --git a/ruff.toml b/ruff.toml index 791a459..8e4ed33 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,3 +1,2 @@ # ignore long license for ruff ignore = ["E501", "E741"] -exclude = ["tsod/active_learning/"] diff --git a/setup.py b/setup.py index 59255f6..c8cb43b 100644 --- a/setup.py +++ b/setup.py @@ -6,15 +6,19 @@ setuptools.setup( name="tsod", version="0.2.0", - install_requires=["pandas>=1.0.0", "numba", "joblib"], + install_requires=[ + "pandas>=1.0.0", + "joblib", + "numba", + ], extras_require={ "dev": [ - "pytest>=6.2.1", + "pytest>=6", + "pytest-cov>=4", "sphinx==4.5.0", # pin version to work with sphinx-book-theme, "sphinx-book-theme", ], - "ml": ["pyod", "tensorflow"], - "test": ["pytest>=6.2.1"], + "ml": ["pyod", "tensorflow>=2"], }, author="Henrik Andersson", author_email="jan@dhigroup.com", diff --git a/tsod/active_learning/__init__.py b/tsod/active_learning/__init__.py deleted file mode 100644 index 800b8a5..0000000 --- a/tsod/active_learning/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from pathlib import Path - -BASE_PATH = Path(__file__).parent.resolve() -MEDIA_PATH = BASE_PATH / "media" diff --git a/tsod/active_learning/app.py b/tsod/active_learning/app.py deleted file mode 100644 index d18a2a9..0000000 --- a/tsod/active_learning/app.py +++ /dev/null @@ -1,55 +0,0 @@ -import sys - -# quick hack to fix python path on streamlit cloud, I'm sure there's a better way -sys.path.append("/app/tsod") - -import streamlit as st -from streamlit_option_menu import option_menu - -from tsod.active_learning.components import FUNC_MAPPING, dev_options -from tsod.active_learning.utils import init_session_state - - -def main(): - st.set_page_config( - layout="wide", - page_icon="https://static.thenounproject.com/png/2196104-200.png", - page_title="Outlier Annotation Tool", - ) - init_session_state() - - icons = [ - "graph-up", - "file-bar-graph", - "lightbulb", - "question-square", - "download", - "info-circle", - ] - - with st.sidebar: - choice = option_menu( - "Time Series Outlier Detection", - list(FUNC_MAPPING.keys()), - default_index=st.session_state["page_index"], - icons=icons, - styles={ - "container": {"padding": "0!important"}, - "icon": {"color": "orange", "font-size": "15px"}, - "menu-title": {"font-size": "18px"}, - "nav-link": { - "font-size": "15px", - "text-align": "left", - "margin": "0px", - "--hover-color": "#AFADB4", - }, - "nav-link-selected": {"background-color": "green"}, - }, - menu_icon="", - ) - with dev_options(st.sidebar): - FUNC_MAPPING[choice]() - - -if __name__ == "__main__": - main() diff --git a/tsod/active_learning/components.py b/tsod/active_learning/components.py deleted file mode 100644 index 8a8410a..0000000 --- a/tsod/active_learning/components.py +++ /dev/null @@ -1,1822 +0,0 @@ -import datetime -import logging -import os -import pickle -from collections import defaultdict -from contextlib import nullcontext -from io import BytesIO -from typing import List - -import numpy as np -import pandas as pd -import streamlit as st -from streamlit_echarts import st_pyecharts -from streamlit_profiler import Profiler - -from tsod.active_learning import MEDIA_PATH -from tsod.active_learning.data_structures import AnnotationState, plot_return_value_as_datetime -from tsod.active_learning.instructions import INSTRUCTION_DICT -from tsod.active_learning.modelling import get_model_predictions, train_model -from tsod.active_learning.plotting import ( - feature_importance_plot, - get_echarts_plot_time_range, - make_annotation_suggestion_plot, - make_outlier_distribution_plot, - make_removed_outliers_example_plots, - make_time_range_outlier_plot, -) -from tsod.active_learning.upload_data import add_new_data, data_uploader -from tsod.active_learning.utils import ( - MODEL_OPTIONS, - custom_text, - fix_random_seeds, - get_as, - recursive_ss_search, - set_session_state_items, - show_memory_usage, - ss_recursive_df_memory_usage, -) - - -def outlier_annotation(): - exp = st.sidebar.expander("Data Upload", expanded=not len(st.session_state["data_store"])) - data_uploader(exp) - st.sidebar.title("Annotation Controls") - data_selection(st.sidebar) - state = get_as() - if not state: - if st.session_state["page_index"] != FUNC_IDX_MAPPING["1. Outlier Annotation"]: - st.session_state["page_index"] = FUNC_IDX_MAPPING["1. Outlier Annotation"] - st.experimental_rerun() - st.info( - """Upload your data to get started (using 'Data Upload' in the sidebar)! - If you just want to try out the application, you can also use some - randomly generated time series data by clicking the button below.""" - ) - st.button("Add generated data", on_click=generate_example_data) - st.markdown("***") - - st.warning( - """For first time users: It is recommended to check out the - instructions before starting. - If you are unsure what a widget is for, hover your mouse above the little question mark next to it to get some more info.""" - ) - st.button( - "View instructions", - on_click=set_session_state_items, - args=("page_index", FUNC_IDX_MAPPING["Instructions"]), - ) - st.image( - str(MEDIA_PATH / "workflow.png"), - use_column_width=True, - caption="Basic workflow suggestion", - ) - return - with st.sidebar.expander("Actions", expanded=True): - create_annotation_plot_buttons() - with st.sidebar.expander("Time Range Selection", expanded=True): - time_range_selection() - with st.sidebar.expander("Save / load previous", expanded=True): - create_save_load_buttons() - - plot = get_echarts_plot_time_range( - state.start, - state.end, - state.column, - True, - f"Outlier Annotation - {state.dataset} - {state.column}", - ) - - clicked_point = st_pyecharts( - plot, - height=800, - theme="dark", - events={ - "click": "function(params) { return [params.data[0], 'click'] }", - "brushselected": "function(params) { return [params.batch[0].selected, 'brush'] }", - # "brushselected": """function(params) { return [params.batch[0].selected.filter( obj => {return obj.seriesName === 'Datapoints'})[0].dataIndex, 'brush'] }""", - }, - ) - process_data_from_echarts_plot(clicked_point) - - -def model_training(): - st.sidebar.title("Training Controls") - if not st.session_state["data_store"]: - st.info( - """To train models or view annotation summaries, please upload some data - in the 'Outlier Annotation' - page.""" - ) - return - fix_random_seeds() - data_selection(st.sidebar) - show_annotation_summary() - train_options() - test_metrics() - show_feature_importances() - - -def model_prediction(): - st.sidebar.title("Prediction Controls") - - if not st.session_state["data_store"]: - st.info( - """Please upload a dataset in order to generate and - interact with model predictions.""" - ) - return - - prediction_options(st.sidebar) - - if not st.session_state["inference_results"]: - st.info( - "To see and interact with model predictions, please choose one or multiple models and datasets \ - in the sidebar, then click 'Generate Predictions.'" - ) - for dataset_name, series_dict in st.session_state["inference_results"].items(): - for series in series_dict.keys(): - - with st.expander(f"{dataset_name} - {series} - Visualization Options", expanded=True): - st.subheader(f"{dataset_name} - {series}") - model_choice_options(dataset_name, series) - prediction_summary_table(dataset_name, series) - outlier_visualization_options(dataset_name, series) - with st.expander(f"{dataset_name} - {series} - Retraining options", expanded=True): - retrain_options(dataset_name, series) - if not st.session_state["models_to_visualize"][dataset_name][series]: - continue - with st.expander(f"{dataset_name} - {series} - Graphs", expanded=True): - start_time, end_time = make_outlier_distribution_plot(dataset_name, series) - if start_time is None: - if f"last_clicked_range_{dataset_name}_{series}" in st.session_state: - start_time, end_time = st.session_state[ - f"last_clicked_range_{dataset_name}_{series}" - ] - else: - continue - st.checkbox( - "Area select: Only select predicted outliers", - True, - key=f"only_select_outliers_{dataset_name}_{series}", - help="""To select multiple points at once you might want to use - one of the area select options. - This checkbox controls whether these selection methods only select - datapoints where outliers were predicted (markers) or all datapoints - in range.""", - ) - clicked_point = make_time_range_outlier_plot( - dataset_name, series, start_time, end_time - ) - # pass in dataset_name to activate checking for "select only outliers" - process_data_from_echarts_plot(clicked_point, dataset_name, series) - correction_options(dataset_name, series) - - -def instructions(): - tabs = st.tabs(list(INSTRUCTION_DICT.keys())) - - for i, (k, instruction_func) in enumerate(INSTRUCTION_DICT.items()): - with tabs[i]: - st.header(k) - instruction_func() - - -def annotation_suggestion(): - # set_session_state_items("page_index", FUNC_IDX_MAPPING["Annotation Suggestion"]) - - # for now only allow annotation suggestion for models trained in the current session - if not (("most_recent_model" in st.session_state) and (st.session_state["inference_results"])): - st.info( - """Annotation suggestion requires at least one model trained this session. - If a model was trained and it was used to generate predictions for a dataset, annotation suggestion will become available here.""" - ) - return - - st.sidebar.title("Suggestion Controls") - session_models = sorted(st.session_state["models_trained_this_session"], reverse=True) - model = st.sidebar.selectbox("Choose model", options=session_models) - - num_points = st.sidebar.slider( - "Number of points to show on each side of the candidate", - min_value=10, - max_value=1000, - value=50, - step=10, - key="suggestion_number_of_points", - ) - - dataset = st.session_state["model_library"][model]["trained_on_dataset"] - series = st.session_state["model_library"][model]["trained_on_series"] - - state = get_as(dataset, series) - - base_df = st.session_state["inference_results"][dataset][series] - # candidates are all datapoints that are not already annotated - df_not_annotated: pd.DataFrame = base_df[ - ~base_df.index.isin(state.all_indices - state.selection) - ] - # also filter out skipped values - df_not_annotated = df_not_annotated[ - ~df_not_annotated.index.isin( - [p[0] for p in st.session_state["suggested_points_with_annotation"][dataset][series]] - ) - ] - # sort by "model uncertainty" => lower = more uncertain - df_not_annotated.sort_values(f"certainty_{model}", inplace=True) - int_idx = state.df.index.get_loc(df_not_annotated.index[0]) - start_time = state.df.index[max(int_idx - num_points, 0)] - end_time = state.df.index[min(int_idx + num_points, len(state.df) - 1)] - x_value = state.df.index[int_idx] - y_value = state.df[series][int_idx] - - cert_value = int((df_not_annotated[f"certainty_{model}"][0] / 0.5) * 100) - - c1, c2 = st.columns([6, 1]) - with c1: - make_annotation_suggestion_plot(start_time, end_time, dataset, series, (x_value, y_value)) - c2.markdown("
", unsafe_allow_html=True) - c2.button( - "Yes", on_click=annotation_suggestion_callback, args=(dataset, series, "outlier", x_value) - ) - c2.button( - "No", on_click=annotation_suggestion_callback, args=(dataset, series, "normal", x_value) - ) - c2.metric("Model certainty", f"{cert_value} %") - c2.button( - "Skip", on_click=annotation_suggestion_callback, args=(dataset, series, "skipped", x_value) - ) - c2.button( - "Back to previous", - on_click=back_to_previous_suggestion_callback, - args=(dataset, series), - disabled=len(st.session_state["suggested_points_with_annotation"][dataset][series]) < 1, - help="Go back to the previous prompt (thereby removing its label).", - ) - - st.info( - """The presented datapoints are chosen based on the 'model certainty' - of the predictions. Uncertainty simply describes the degree of disagreement between - the individual tree classifiers. The points with the lowest certainty will be prompted first.""" - ) - - was_retrained = retrain_options(dataset, series) - # if model was retrained, we need to rerun to switch to the predictions page - if was_retrained: - st.experimental_rerun() - - -def annotation_suggestion_callback(dataset, series, annotation_type, value): - state = get_as(dataset, series) - if annotation_type in state.data: - state.update_data(annotation_type, [value]) - st.session_state["suggested_points_with_annotation"][dataset][series].append( - (value, annotation_type) - ) - - -def back_to_previous_suggestion_callback(dataset, series): - value_to_remove = st.session_state["suggested_points_with_annotation"][dataset][series].pop() - state = get_as(dataset, series) - key = value_to_remove[1] - if key == "skipped": - return - - state.data[key].discard(value_to_remove[0]) - state._update_df(key) - state._update_plot_df(key) - - -def data_download(): - removal_possible = True - if not (st.session_state["model_library"] and st.session_state["data_store"]): - st.info( - """Once a model was trained or uploaded and a dataset was created, - you will be able to use your models to remove outliers and download - the resulting data here. """ - ) - removal_possible = False - - st.sidebar.subheader("Download Controls") - - if removal_possible: - dataset = st.sidebar.selectbox( - "Select source dataset", - options=list(st.session_state["data_store"].keys()), - index=list(st.session_state["data_store"].keys()).index( - st.session_state["current_dataset"] - ), - disabled=len(st.session_state["data_store"]) < 2, - key="download_dataset", - ) - - series = st.sidebar.multiselect( - "Select series to remove outliers from", - options=list(st.session_state["data_store"][dataset].keys()), - default=st.session_state["current_series"][dataset], - disabled=len(st.session_state["data_store"][dataset]) < 2, - key="download_series", - help="""The final dataset will keep all columns it had when it was uploaded. - Here you can choose which of those columns should be cleaned of outliers. - You might want to use different models to clean different series.""", - ) - if not series: - st.sidebar.warning("Please select at least one series.") - return - - st.sidebar.selectbox( - "Select model to use for outlier identification", - options=sorted(st.session_state["model_library"].keys()), - index=sorted(st.session_state["model_library"].keys()).index( - st.session_state["most_recent_model"] - ) - if "most_recent_model" in st.session_state - else len(st.session_state["model_library"]) - 1, - disabled=len(st.session_state["model_library"]) < 2, - key="download_model", - ) - - method = st.sidebar.radio( - "Select how to handle predicted outliers", - options=list(REMOVAL_METHODS.keys()), - key="download_method", - ) - - st.sidebar.button( - "Update Preview" if "df_before" in st.session_state else "Preview", - on_click=remove_outliers, - help="""Creates a preview by sampling three predicted outliers per - series and overlaying a series with the outliers removed according - to the chosen method.""", - ) - - if f"df_after_{dataset}" in st.session_state: - with st.sidebar.expander("Save cleaned data as dataset", expanded=True): - if "_cleaned_" in dataset: - stem = dataset.split("_cleaned")[0] - default_ds_name = ( - f"{stem}_cleaned_{st.session_state['cleaned_dataset_counter'][stem]}" - ) - else: - default_ds_name = ( - f"{dataset}_cleaned_{st.session_state['cleaned_dataset_counter'][dataset]}" - ) - new_ds_name = st.text_input( - "Enter dataset name", - value=default_ds_name, - max_chars=30, - key="cleaned_dataset_to_add", - help="""Save cleaned data as a new dataset. - You can then use this dataset to remove more outliers using - another model, add further annotations or download it.""", - ) - disabled = False - if (new_ds_name == "") or (new_ds_name == " "): - st.warning("Please enter a name.") - disabled = True - if new_ds_name in st.session_state["data_store"]: - st.warning("A dataset with this name already exists.") - disabled = True - st.button( - "Add dataset", on_click=add_cleaned_dataset, args=(dataset,), disabled=disabled - ) - - if not st.session_state["data_store"]: - return - with st.sidebar.expander("Download a dataset", expanded=True): - ds_to_download = st.selectbox( - "Select dataset to download", - options=sorted(st.session_state["data_store"].keys()), - index=list(st.session_state["data_store"].keys()).index( - st.session_state["current_dataset"] - ), - disabled=len(st.session_state["data_store"]) < 2, - ) - file_format = st.radio("Select output file format", options=["csv", "xlsx"]) - - file_name = f"{ds_to_download}.{file_format}" - - st.download_button( - "Download dataset", - file_name=file_name, - data=get_data_to_download(ds_to_download, file_format), - on_click=logging.info, - args=("A dataset was successfully downloaded.",), - ) - - -@st.cache -def get_data_to_download(dataset: str, file_format: str): - df_download = pd.DataFrame() - for df_series in st.session_state["data_store"][dataset].values(): - df_download = df_download.merge( - df_series, - left_index=True, - right_index=True, - how="outer", - ) - - df_download.dropna(how="all", inplace=True) - - if file_format == "csv": - return df_download.to_csv().encode("utf-8") - - elif file_format == "xlsx": - output = BytesIO() - writer = pd.ExcelWriter(output, engine="xlsxwriter") - df_download.to_excel(writer, sheet_name="Sheet1") - workbook = writer.book - worksheet = writer.sheets["Sheet1"] - format1 = workbook.add_format({"num_format": "0.00"}) - worksheet.set_column("A:A", None, format1) - writer.save() - return output.getvalue() - - -FUNC_MAPPING = { - "1. Outlier Annotation": outlier_annotation, - "2. Model Training": model_training, - "3. Model Prediction": model_prediction, - "Annotation Suggestion": annotation_suggestion, - "Data Download": data_download, - "Instructions": instructions, -} - -FUNC_IDX_MAPPING = {k: i for i, k in enumerate(FUNC_MAPPING.keys())} - - -def add_cleaned_dataset(original_dataset: str): - ds_name = st.session_state["cleaned_dataset_to_add"] - if (ds_name == "") or (ds_name == " ") or (ds_name in st.session_state["data_store"]): - return - - df_new = st.session_state[f"df_after_{original_dataset}"] - if "_cleaned_" in original_dataset: - stem = original_dataset.split("_cleaned")[0] - st.session_state["cleaned_dataset_counter"][stem] += 1 - else: - st.session_state["cleaned_dataset_counter"][original_dataset] += 1 - add_new_data(df_new, ds_name) - - -def remove_outliers(): - dataset: str = st.session_state["download_dataset"] - series: List = st.session_state["download_series"] - model: str = st.session_state["download_model"] - method: str = st.session_state["download_method"] - - st.session_state["prediction_data"] = defaultdict(list) - st.session_state["prediction_data"][dataset] = series - st.session_state["prediction_models"][model] = st.session_state["model_library"][model] - - get_predictions_callback() - set_session_state_items("page_index", FUNC_IDX_MAPPING["Data Download"]) - - # reconstruct a single dataframe from all series - df = pd.DataFrame() - for s in series: - df_to_add: pd.DataFrame = st.session_state["inference_results"][dataset][s] - model_columns = [c for c in df_to_add.columns if model in c] - df_to_add = df_to_add[model_columns + [s]] - df_to_add = df_to_add.rename(columns=lambda c: f"{s}_{c}" if c in model_columns else c) - df = df.merge( - df_to_add, - left_index=True, - right_index=True, - how="outer", - ) - - # add other series that belong to the dataset, but should not be cleaned - other_series = [s for s in st.session_state["data_store"][dataset] if s not in series] - for s in other_series: - df = df.merge( - st.session_state["data_store"][dataset][s], - left_index=True, - right_index=True, - how="outer", - ) - - df.sort_index(inplace=True, ascending=True) - df_before = df.copy(deep=True) - df_new = REMOVAL_METHODS[method](df) - - df_new.dropna(how="all", inplace=True, subset=series + other_series) - - make_removed_outliers_example_plots(df_before, df_new) - - st.session_state[f"df_before_{dataset}"] = df_before[series + other_series] - st.session_state[f"df_after_{dataset}"] = df_new[series + other_series] - - -def remove_all_outliers(df: pd.DataFrame) -> pd.DataFrame: - series: List = st.session_state["download_series"] - model: str = st.session_state["download_model"] - - for s in series: - model_column = f"{s}_{model}" - mask = df[model_column] == 1 - df.loc[mask, s] = np.nan - - return df - - -def linear_outlier_interpolation(df: pd.DataFrame) -> pd.DataFrame: - series: List = st.session_state["download_series"] - model: str = st.session_state["download_model"] - - for s in series: - model_column = f"{s}_{model}" - mask = df[model_column] == 1 - df.loc[mask, s] = np.nan - df[s] = df[s].interpolate() - - return df - - -REMOVAL_METHODS = { - "Delete outliers completely": remove_all_outliers, - "Linear interpolation": linear_outlier_interpolation, -} - - -def retrain_options(dataset: str, series: str): - deltas = get_annotation_deltas(dataset, series) - if ( - (f"last_model_name_{dataset}_{series}" not in st.session_state) - or (f"old_number_annotations_{dataset}_{series}" not in st.session_state) - or (not any(abs(v) > 0 for v in deltas.values())) - ): - c1, c2 = st.columns([1, 3]) - c2.info( - """Once you have added further annotations, you can use this button - to quickly retrain the last model and use it to generate new predictions.""" - ) - disabled = True - - else: - c1, c2, c3, c4, c5 = st.columns([2, 1, 1, 1, 1]) - - c2.metric("New marked Train Outlier", deltas["outlier"]) - c3.metric("New marked Train Normal", deltas["normal"]) - c4.metric("New marked Test Outlier", deltas["test_outlier"]) - c5.metric("New marked Test Normal", deltas["test_normal"]) - disabled = False - - was_retrained = c1.button( - "Retrain most recent model with new data", - key=f"retrain_{dataset}_{series}", - on_click=retrain_and_repredict, - args=(dataset, series), - disabled=disabled, - help="""Use this button to simplify your workflow and retrain the last model trained on this dataset & series - (using the added annotation data), generate new predictions and visualize them.""", - ) - return was_retrained - - -def dataset_choice_callback(): - ds = st.session_state["dataset_choice"] - st.session_state["current_dataset"] = ds - - -def series_choice_callback(dataset: str): - series = st.session_state["column_choice"] - st.session_state["current_series"][dataset] = series - set_session_state_items("expand_data_selection", False) - - -def data_selection(base_obj=None): - obj = base_obj or st - - with obj.expander("Data Selection", expanded=st.session_state["expand_data_selection"]): - - datasets = list(st.session_state["data_store"].keys()) - dataset_choice = st.selectbox( - label="Select dataset", - options=datasets, - index=datasets.index(st.session_state["current_dataset"]) - if st.session_state.get("current_dataset") is not None - else len(datasets) - 1, - disabled=len(st.session_state["data_store"]) < 2, - on_change=dataset_choice_callback, - key="dataset_choice", - ) - if not dataset_choice: - return - columns = sorted(st.session_state["data_store"][dataset_choice].keys()) - current_series = st.session_state["current_series"].get(dataset_choice) - column_choice = st.selectbox( - "Select Series", - columns, - index=columns.index(current_series) - if current_series is not None and current_series in columns - else len(columns) - 1, - disabled=len(st.session_state["data_store"][dataset_choice]) < 2, - on_change=series_choice_callback, - args=(dataset_choice,), - key="column_choice", - ) - - st.session_state[f"column_choice_{dataset_choice}"] = column_choice - - -def retrain_and_repredict(dataset: str, series: str, base_obj=None): - - obj = base_obj or st - fix_random_seeds() - - train_model(obj, dataset, series) - - get_predictions_callback(obj) - - st.session_state["suggested_points_with_annotation"][dataset][series].clear() - - -def show_all_callback(): - state = get_as() - - st.session_state["start_time"] = state.start - st.session_state["end_time"] = state.end - set_session_state_items("use_date_picker", False) - - -def back_to_selected_time_callback(): - state = get_as() - state.update_plot(st.session_state["start_time"], st.session_state["end_time"]) - set_session_state_items("use_date_picker", True) - - -def calendar_callback(): - dates = st.session_state["calendar"] - if len(dates) < 2: - return - state = get_as() - - current_start_time = state.start - current_end_time = state.end - start_date, end_date = dates - - new_start = current_start_time.replace( - year=start_date.year, month=start_date.month, day=start_date.day - ) - new_end = current_end_time.replace(year=end_date.year, month=end_date.month, day=end_date.day) - state.update_plot(new_start, new_end) - - -def time_callback(): - start_time = st.session_state["start_time_widget"] - end_time = st.session_state["end_time_widget"] - state = get_as() - - current_start_time = state.start - current_end_time = state.end - - new_start = current_start_time.replace(hour=start_time.hour, minute=start_time.minute) - new_end = current_end_time.replace(hour=end_time.hour, minute=end_time.minute) - state.update_plot(new_start, new_end) - - -def time_range_selection(base_obj=None) -> pd.DataFrame: - obj = base_obj or st - state = get_as() - if (state.df.index.max() - state.df.index.min()).days > 1: - obj.date_input( - "Graph Date Range", - value=(state.start.date(), state.end.date()), - min_value=state.df.index.min(), - max_value=state.df.index.max(), - on_change=calendar_callback, - key="calendar", - ) - inp_col_1, inp_col_2 = obj.columns(2) - - inp_col_1.time_input( - "Start time", - value=state.start.time(), - on_change=time_callback, - key="start_time_widget", - ) - inp_col_2.time_input( - "End time", - value=state.end.time(), - on_change=time_callback, - key="end_time_widget", - ) - if st.session_state["use_date_picker"]: - inp_col_1.button("Show All", on_click=show_all_callback) - c1, c2 = obj.columns(2) - c1.button( - "Shift back", - on_click=shift_plot_window, - args=("backwards",), - help="""Moves the displayed plot time range backwards, while keeping the range equal. - This can be used to iterate over the dataset in chunks of any size.""", - ) - c2.button( - "Shift forward", - on_click=shift_plot_window, - args=("forward",), - help="""Moves the displayed plot time range forward, while keeping the range equal. - This can be used to iterate over the dataset in chunks of any size.""", - ) - - else: - state.update_plot(state.df.index.min(), state.df.index.max()) - obj.button( - "Back to previous selection", - on_click=back_to_selected_time_callback, - ) - - form = obj.form(key="time_range_form") - - form.number_input( - "Set number of datapoints", - min_value=2, - max_value=len(state.df), - value=len(state.df_plot), - step=100, - help="""Control the number of datapoints shown in the plot window. - When changed, the currently set end timestamp will remain the same.""", - key="time_range_slider", - ) - form.form_submit_button("Update", on_click=shift_plot_window_number_of_points) - - -def shift_plot_window_number_of_points(): - state = get_as() - number_points = st.session_state["time_range_slider"] - current_end_time = state.end - current_end_idx = state.df.sort_index().index.get_loc(current_end_time) - new_start_idx = current_end_idx - number_points + 1 - new_start_time = state.df.sort_index().index[new_start_idx] - state.update_plot(start_time=new_start_time) - - -def shift_plot_window(direction: str): - state = get_as() - current_start = state.start - current_end = state.end - - current_range = current_end - current_start - - if direction == "forward": - new_start = current_start + current_range - new_end = current_end + current_range - else: - new_start = current_start - current_range - new_end = current_end - current_range - - if new_start < state.df.index.min(): - new_start = state.df.index.min() - new_end = new_start + current_range - if new_end > state.df.index.max(): - new_end = state.df.index.max() - new_start = new_end - current_range - - state.update_plot(new_start, new_end) - - -def generate_example_data(): - fix_random_seeds() - - dti = pd.date_range("2018-01-01", periods=5000, freq="10min") - - # generate some simple sine data - cycles = 50 - resolution = 5000 # how many datapoints to generate - length = np.pi * 2 * cycles - data = np.sin(np.arange(0, length, length / resolution)) - - # add some random noise, which is not outliers - data = data + np.random.normal(0, 0.005, data.shape) - - # add some randomly generated outliers - oulier_idc = np.random.randint(0, data.size, 200) - data[oulier_idc] *= np.random.normal(0, 2, oulier_idc.shape) - - data_2 = np.cos(np.arange(0, length, length / resolution)) - data_2 = data_2 + np.random.normal(0, 0.005, data_2.shape) - oulier_idc = np.random.randint(0, data_2.size, 500) - data_2[oulier_idc] *= np.random.normal(0, 1, oulier_idc.shape) - - df = pd.DataFrame({"Example Series 1": data, "Example Series 2": data_2}, index=dti) - - for c in df.columns: - st.session_state["data_store"]["Example Dataset"][c] = df[[c]] - an_st = AnnotationState("Example Dataset", c) - st.session_state["annotation_state_store"]["Example Dataset"][c] = an_st - - st.session_state["current_dataset"] = "Example Dataset" - st.session_state["current_series"]["Example Dataset"] = "Example Series 1" - - -def create_annotation_plot_buttons(base_obj=None): - obj = base_obj or st - - state = get_as() - - disabled = len(state.selection) < 1 - if disabled: - obj.info("Select points first, then annotate them using these buttons.") - - custom_text("Training Data", 15, True, base_obj=obj) - c_1, c_2 = obj.columns(2) - c_1.button( - "Mark selection Outlier", - on_click=state.update_data, - args=("outlier",), - disabled=disabled, - help=None - if disabled - else """Use this button to annotate all selected points as outliers - to be used for training.""", - ) - c_2.button( - "Mark selection Normal", - on_click=state.update_data, - args=("normal",), - disabled=disabled, - help=None - if disabled - else """Use this button to annotate all selected points as normal points - to be used for training.""", - ) - custom_text("Test Data", 15, True, base_obj=obj) - c_1, c_2 = obj.columns(2) - c_1.button( - "Mark selection Outlier", - on_click=state.update_data, - args=("test_outlier",), - key="mark_test_outlier", - disabled=disabled, - help=None - if disabled - else """Use this button to annotate all selected points as outliers - to be used in the test set. - This means they will not be used for training, but instead to evalutate the - performance of the trained model.""", - ) - c_2.button( - "Mark selection Normal", - on_click=state.update_data, - args=("test_normal",), - key="mark_test_normal", - disabled=disabled, - help=None - if disabled - else """Use this button to annotate all selected points as normal - points to be used in the test set. - This means they will not be used for training, but instead to evalutate the - performance of the trained model.""", - ) - obj.markdown("***") - c_1, c_2 = obj.columns(2) - - c_1.button("Clear Selection", on_click=state.clear_selection, disabled=len(state.selection) < 1) - c_2.button("Clear All", on_click=state.clear_all, disabled=len(state.all_indices) < 1) - - -def validate_uploaded_file_contents(base_obj=None): - obj = base_obj or st - uploaded_files = st.session_state["uploaded_annotation_files"] - if not uploaded_files: - return - state = get_as(return_all_columns=True) - data_to_load_if_file_ok = defaultdict(list) - # loop through files if there are multiple - for file_number, uploaded_file in enumerate(uploaded_files): - datapoints_tracker = {} - file_failed = False - data = pickle.loads(uploaded_file.getvalue()) - # loop through series of uploaded data - for column, data_dict in data.items(): - if file_failed: - break - total_datapoints_loaded = 0 - if column not in state: - obj.warning( - f"""File {uploaded_file.name}: - Column {column} not found in current dataset {st.session_state["dataset_choice"]}, skipping.""" - ) - continue - # loop through different annotation types (outlier, normal, etc.) - for df in data_dict.values(): - if df.empty: - continue - values_match = ( - df[column] - .astype(float) - .round(10) - .isin(state[column].df[column].round(10).values) - .all() - ) - index_match = df.index.isin(state[column].df.index).all() - - if (not values_match) or (not index_match): - file_failed = True - break - total_datapoints_loaded += len(df) - - if not file_failed: - datapoints_tracker[column] = total_datapoints_loaded - data_to_load_if_file_ok[column].append(data_dict) - - if file_failed: - obj.error( - f"""{uploaded_file.name}: Did not pass validation for loaded dataset. - Either index or values of loaded annotations do not match.""" - ) - else: - for c, count in datapoints_tracker.items(): - obj.success( - f"File {file_number + 1}: Loaded {count} annotations for series {c}.", icon="✅" - ) - - st.session_state["uploaded_annotation_data"][uploaded_file.name] = data - - st.session_state["uploaded_annotation_data"] = data_to_load_if_file_ok - - -def annotation_file_upload_callback(base_obj=None): - obj = base_obj or st - validate_uploaded_file_contents(obj) - - for column, data_list in st.session_state["uploaded_annotation_data"].items(): - state = get_as(column=column) - for data in data_list: - for key, df in data.items(): - state.update_data(key, df.index.to_list()) - - -def dev_options(base_obj=None): - if os.environ.get("TSOD_DEV_MODE", "false") == "false": - return nullcontext() - obj = base_obj or st - exp = obj.expander("Dev Options") - with exp: - dev_col_1, dev_col_2 = st.columns(2) - profile = dev_col_1.checkbox("Profile Code", value=False) - show_total_mem = dev_col_2.checkbox("Show total Memory Usage", value=False) - show_ss = dev_col_1.button("Show full Session State") - show_mem = dev_col_2.checkbox("Show mem. usage of dfs") - search_str = dev_col_1.text_input("Search SS", max_chars=25, value="") - - if len(search_str) > 1: - recursive_ss_search(search_str, base_obj=exp) - if show_mem: - st.write(ss_recursive_df_memory_usage()) - if show_total_mem: - show_memory_usage(exp) - - if show_ss: - st.write(st.session_state) - - return Profiler() if profile else nullcontext() - - -def create_save_load_buttons(base_obj=None): - obj = base_obj or st - state = get_as() - obj.info( - f"""Current dataset: - {state.dataset}""" - ) - c_1, c_2 = obj.columns(2) - - file_name = ( - f"{state.dataset}_Annotations_{datetime.datetime.now().strftime('%Y-%m-%d_%H:%M')}.bin" - ) - - if not state.all_indices: - c_1.warning("No Annotations have been added for this dataset.") - c_1.download_button( - "Download Annotations", - pickle.dumps( - { - k: v._download_data - for k, v in get_as(state.dataset, return_all_columns=True).items() - if (v._download_data) and (k != "selected") - } - ), - file_name=file_name, - disabled=not state.all_indices, - ) - - c_2.file_uploader( - "Upload Annotations", - key="uploaded_annotation_files", - type="bin", - on_change=annotation_file_upload_callback, - args=(obj,), - accept_multiple_files=True, - ) - obj.markdown("***") - - -def get_annotation_deltas(dataset: str, series: str): - annotation_keys = ["outlier", "normal", "test_outlier", "test_normal"] - state = get_as(dataset, series) - if f"old_number_annotations_{dataset}_{series}" in st.session_state: - deltas = { - k: len(state.data[k]) - - st.session_state[f"old_number_annotations_{dataset}_{series}"][k] - for k in annotation_keys - } - else: - deltas = {k: None for k in annotation_keys} - - return deltas - - -def show_annotation_summary(base_obj=None): - obj = base_obj or st - with obj.expander("Annotation Info", expanded=True): - state = get_as() - dataset = state.dataset - series = state.column - obj.subheader(f"Annotation summary - {state.dataset} - {state.column}") - c1, c2, c3 = st.columns([1, 1, 2]) - - custom_text("Training Data", 15, base_obj=c1, centered=False) - custom_text("Test Data", 15, base_obj=c2, centered=False) - - deltas = get_annotation_deltas(dataset, series) - - c1.metric( - "Total labelled Outlier", - len(state.outlier), - delta=deltas["outlier"], - delta_color="normal" if deltas["outlier"] != 0 else "off", - ) - c1.metric( - "Total labelled Normal", - len(state.normal), - delta=deltas["normal"], - delta_color="normal" if deltas["normal"] != 0 else "off", - ) - c2.metric( - "Total labelled Outlier", - len(state.test_outlier), - delta=deltas["test_outlier"], - delta_color="normal" if deltas["test_outlier"] != 0 else "off", - help="""Test data will not be used for training. - It can be used to measure how well a model performs on new data not seen during training. """, - ) - c2.metric( - "Total labelled Normal", - len(state.test_normal), - delta=deltas["test_normal"], - delta_color="normal" if deltas["test_normal"] != 0 else "off", - ) - - if not (len(state.data["outlier"]) and len(state.data["normal"]) > 1): - c3.warning( - """In order to train an outlier prediction model, please annotate at least one outlier - and two normal points as training data. - You can add annotations by - 1) Marking points in the 'Outlier Annotation' - page - 2) Uploading a previously created annotation file - 3) Correcting model predictions in the 'Model prediction' - page - Then choose a method and parameters and click on 'Train Outlier Model' in the sidebar.""" - ) - - if "classifier" in st.session_state: - obj.subheader(f"Current model: {st.session_state[f'last_model_name_{dataset}_{series}']}") - - -def save_method(): - method = st.session_state["current_method_choice"] - st.session_state["last_method_choice"] = method - - -def train_options(base_obj=None): - obj = base_obj or st - - state = get_as() - dataset = state.dataset - series = state.column - if not (len(state.data["outlier"]) and len(state.data["normal"]) > 1): - return - - with st.sidebar.expander("Modelling Options", expanded=True): - st.selectbox( - "Choose OD method", - options=list(MODEL_OPTIONS.keys()), - key="current_method_choice", - format_func=lambda x: MODEL_OPTIONS.get(x), - on_change=save_method, - help="Here you can choose what type of outlier detection approach to use.", - ) - - with st.sidebar.expander("Feature Options", expanded=True): - st.number_input( - "Points before", - min_value=1, - value=st.session_state.get("old_points_before") - if st.session_state.get("old_points_before") is not None - else 10, - key="number_points_before", - step=5, - help="How many points before each annotated point to include in its feature set.", - ) - st.number_input( - "Points after", - min_value=0, - value=st.session_state.get("old_points_after") - if st.session_state.get("old_points_after") is not None - else 0, - key="number_points_after", - step=5, - help="How many points after each annotated point to include in its feature set.", - ) - - auto_generate = st.sidebar.checkbox( - "Auto generate predictions for entire annotation series", - value=True, - help="""Often the next step after training a model is looking at its prediction distribution - over the entire annotated dataset-series combination. - By checking this box, predictions for the relevant series will be generated after training, - so the results can be viewed / compared straight away in the 'Model Prediction' - page.""", - ) - train_button = st.sidebar.button("Train Outlier Model", key="train_button") - - if train_button: - st.session_state["old_points_before"] = st.session_state["number_points_before"] - st.session_state["old_points_after"] = st.session_state["number_points_after"] - - train_model(obj) - if auto_generate: - get_predictions_callback(obj) - set_session_state_items("page_index", FUNC_IDX_MAPPING["2. Model Training"]) - st.experimental_rerun() - if f"last_model_name_{dataset}_{series}" in st.session_state: - st.sidebar.success( - f"{st.session_state[f'last_model_name_{dataset}_{series}']} finished training." - ) - if ( - st.session_state[f"last_model_name_{dataset}_{series}"] - in st.session_state["inference_results"][dataset][series].columns - ): - st.sidebar.success( - f"Predictions for model {st.session_state[f'last_model_name_{dataset}_{series}']} \ - have been generated and can be viewed on the 'Model Prediction' - page." - ) - if st.session_state["models_trained_this_session"]: - with st.sidebar.expander("Model Download", expanded=True): - model_choice = st.selectbox( - "Choose Model", - options=sorted(st.session_state["models_trained_this_session"], reverse=True), - ) - st.download_button( - "Download model", - pickle.dumps(st.session_state["model_library"][model_choice]), - f"{model_choice}.pkl", - ) - - -def get_predictions_callback(obj=None): - # set_session_state_items("hide_choice_menus", True) - set_session_state_items("page_index", FUNC_IDX_MAPPING["3. Model Prediction"]) - get_model_predictions(obj) - set_session_state_items("prediction_models", {}) - - -def add_uploaded_models(base_obj=None): - obj = base_obj or st - # set_session_state_items("prediction_models", {}) - if not st.session_state.current_uploaded_models: - return - for data in st.session_state.current_uploaded_models: - model_data = pickle.loads(data.read()) - model = model_data["model"] - if not hasattr(model, "predict"): - obj.error( - "The uploaded object can not be used for prediction (does not implement 'predict' method)." - ) - continue - - st.session_state["prediction_models"][data.name] = model_data - st.session_state["model_library"][data.name] = model_data - - -def add_session_models(): - to_add = st.session_state["session_models_to_add"] - uploaded_models = { - k: v for k, v in st.session_state["prediction_models"].items() if k.endswith(".pkl") - } - st.session_state["prediction_models"] = { - k: st.session_state["model_library"][k] for k in to_add - } - st.session_state["prediction_models"].update(uploaded_models) - - -def add_session_dataset(): - session_ds = st.session_state["pred_session_ds_choice"] - session_cols = st.session_state["pred_session_col_choice"] - - st.session_state["prediction_data"][session_ds] = session_cols - - -def prediction_options(base_obj=None): - obj = base_obj or st - _, c, _ = obj.columns([2, 5, 2]) - c.button("Generate Predictions", on_click=get_predictions_callback, args=(obj,)) - - with obj.expander("Model Choice", expanded=True): - st.subheader("Choose models for generating predictions") - st.multiselect( - "Select models trained this session", - options=sorted( - [k for k in st.session_state["model_library"].keys() if not k.endswith(".pkl")] - ), - default=[ - k for k in st.session_state["prediction_models"].keys() if not k.endswith(".pkl") - ], - on_change=add_session_models, - key="session_models_to_add", - help="When a new model is trained, it is automatically pre-selected.", - ) - st.file_uploader( - "Or / and select model from disk (optional)", - type="pkl", - on_change=add_uploaded_models, - key="current_uploaded_models", - args=(obj,), - accept_multiple_files=True, - ) - - st.subheader("Selected models:") - st.json(list(st.session_state.prediction_models.keys())) - if st.session_state.prediction_models: - st.button( - "Clear selection", on_click=set_session_state_items, args=("prediction_models", {}) - ) - with obj.expander("Data Choice", expanded=True): - st.subheader("Select data for generating predictions") - ds_options = list(st.session_state["data_store"].keys()) - if "most_recent_model" in st.session_state: - idx = ds_options.index( - st.session_state["model_library"][st.session_state["most_recent_model"]][ - "trained_on_dataset" - ] - ) - else: - if len(ds_options): - idx = len(ds_options) - 1 - else: - idx = 0 - ds_choice = st.selectbox( - label="Select datasets uploaded this session", - options=ds_options, - index=idx, - disabled=len(st.session_state["data_store"]) < 2, - key="pred_session_ds_choice", - ) - if ds_choice: - col_options = list(st.session_state["data_store"][ds_choice].keys()) - if "most_recent_model" in st.session_state: - if ( - ds_choice - == st.session_state["model_library"][st.session_state["most_recent_model"]][ - "trained_on_dataset" - ] - ): - default = st.session_state["model_library"][ - st.session_state["most_recent_model"] - ]["trained_on_series"] - else: - default = st.session_state["prediction_data"].get(ds_choice) - else: - default = st.session_state["prediction_data"].get(ds_choice) - - session_ds_columns = st.multiselect( - "Pick series", - options=col_options, - default=default, - on_change=add_session_dataset, - key="pred_session_col_choice", - ) - st.subheader("Selected Series:") - st.json(st.session_state["prediction_data"]) - # st.json({k: list(v.keys()) for k, v in st.session_state["prediction_data"].items()}) - if st.session_state["prediction_data"]: - st.button( - "Clear selection", - on_click=set_session_state_items, - args=("prediction_data", {}), - key="data_clear", - ) - _, c, _ = obj.columns([2, 5, 2]) - c.button( - "Generate Predictions", on_click=get_predictions_callback, args=(obj,), key="pred_btn_2" - ) - - -def remove_model_to_visualize(dataset_name, series, model_name): - st.session_state["models_to_visualize"][dataset_name][series].discard(model_name) - - # if not recursive_length_count(st.session_state["models_to_visualize"]): - # st.session_state["hide_choice_menus"] = False - - -def prediction_summary_table(dataset_name: str, series: str, base_obj=None): - obj = base_obj or st - - DEFAULT_MARKER_COLORS = ["#e88b0b", "#1778dc", "#1bd476", "#d311e6"] - model_names = sorted(st.session_state["models_to_visualize"][dataset_name][series]) - - if not model_names: - return - - if len(model_names) > len(DEFAULT_MARKER_COLORS): - obj.error( - f"Currently max. number of models is {len(DEFAULT_MARKER_COLORS)}, got {len(model_names)}" - ) - return - - c1, c2, c4, c5, c6, c7 = obj.columns([5, 3, 6, 3, 3, 3]) - custom_text("Model Name", base_obj=c1, font_size=15, centered=True) - custom_text("Params", base_obj=c4, font_size=15, centered=False) - custom_text("Predicted Outliers", base_obj=c5, font_size=15, centered=False) - custom_text("Predicted Normal", base_obj=c6, font_size=15, centered=False) - custom_text("Choose Plot Color", base_obj=c7, font_size=15, centered=False) - - obj.markdown("***") - - for i, model in enumerate(model_names): - _local_obj = obj.container() - c1, c2, c4, c5, c6, c7 = _local_obj.columns([5, 3, 6, 3, 3, 3]) - custom_text(model, base_obj=c1, font_size=15, centered=True) - c2.button( - "Remove", - key=f"remove_{model}_{dataset_name}_{series}", - on_click=remove_model_to_visualize, - args=(dataset_name, series, model), - ) - c4.json(st.session_state["model_library"][model]["params"], expanded=False) - custom_text( - st.session_state["number_outliers"][dataset_name][series][model], - base_obj=c5, - font_size=20, - centered=False, - ) - custom_text( - len(st.session_state["inference_results"][dataset_name][series]) - - st.session_state["number_outliers"][dataset_name][series][model], - base_obj=c6, - font_size=20, - centered=False, - ) - c7.color_picker( - model, - key=f"color_{model}_{dataset_name}_{series}", - label_visibility="collapsed", - value=DEFAULT_MARKER_COLORS[i], - ) - obj.markdown("***") - - -def test_metrics(base_obj=None): - obj = base_obj or st - - state = get_as() - dataset = state.dataset - series = state.column - if f"last_model_name_{dataset}_{series}" not in st.session_state: - return - - custom_text( - f"Most recent model: {st.session_state[f'last_model_name_{dataset}_{series}']}", - base_obj=obj, - ) - obj.info( - """All displayed metrics have a max value of 1 (best possible result) - and min value of 0 (worst possible result). Hover over the question mark next to the metrics to get info on what they mean.""" - ) - c1, c2, c3, c4 = obj.columns(4) - - current_train_metrics = st.session_state[f"current_model_train_metrics_{dataset}_{series}"] - prec = current_train_metrics["precision"] - rec = current_train_metrics["recall"] - f1 = current_train_metrics["f1"] - - if f"previous_model_train_metrics_{dataset}_{series}" in st.session_state: - old_metrics = st.session_state[f"previous_model_train_metrics_{dataset}_{series}"] - old_prec = old_metrics["precision"] - old_rec = old_metrics["recall"] - old_f1 = old_metrics["f1"] - - out_prec_diff = (prec[1] - old_prec[1]).round(3) - out_rec_diff = (rec[1] - old_rec[1]).round(3) - out_f1_diff = (f1[1] - old_f1[1]).round(3) - norm_prec_diff = (prec[0] - old_prec[0]).round(3) - norm_rec_diff = (rec[0] - old_rec[0]).round(3) - norm_f1_diff = (f1[0] - old_f1[0]).round(3) - - else: - out_prec_diff, out_rec_diff, out_f1_diff, norm_prec_diff, norm_rec_diff, norm_f1_diff = ( - None, - None, - None, - None, - None, - None, - ) - - with c1.expander("Train Set Outlier Metrics", expanded=True): - st.metric( - "Precision Score", - prec[1], - delta=out_prec_diff, - delta_color="normal" if out_prec_diff != 0.0 else "off", - help=f"""The ratio of true predicted positives to total predicted positives for the train set outliers. - Represents the ability not to classify a normal sample as an outlier. - Your score of {prec[1]} means that for the training set, {int(prec[1] * 100)}% of your model's predicted outliers were correct (not labelled as normal points).""", - ) - st.metric( - "Recall Score", - rec[1], - delta=out_rec_diff, - delta_color="normal" if out_rec_diff != 0.0 else "off", - help=f"""The ratio of true predicted positives to total positives for the train set outliers. - Represents the ability to correctly predict all the outliers. - Your score of {rec[1]} means that for the training set, your model has correctly predicted {int(rec[1] * 100)}% of the annotated outliers.""", - ) - st.metric( - "F1 Score", - f1[1], - delta=out_f1_diff, - delta_color="normal" if out_f1_diff != 0.0 else "off", - help="The harmonic mean of the precision and recall for the train set outliers.", - ) - with c2.expander("Train Set Normal Metrics", expanded=True): - st.metric( - "Precision Score", - prec[0], - delta=norm_prec_diff, - delta_color="normal" if norm_prec_diff != 0.0 else "off", - help=f"""The ratio of true predicted positives to total predicted positives for the train set normal points. - Represents the ability not to classify an outlier sample as a normal point. - Your score of {prec[0]} means that for the training set, {int(prec[0] * 100)}% of your model's predicted normal points were correct (not labelled as outliers).""", - ) - st.metric( - "Recall Score", - rec[0], - delta=norm_rec_diff, - delta_color="normal" if norm_rec_diff != 0.0 else "off", - help=f"""The ratio of true predicted positives to total positives for the train set normal points. - Represents the ability to correctly predict all the normal points. - Your score of {rec[0]} means that for the training set, your model has correctly predicted {int(rec[0] * 100)}% of the annotated normal points.""", - ) - st.metric( - "F1 Score", - f1[0], - delta=norm_f1_diff, - delta_color="normal" if norm_f1_diff != 0.0 else "off", - help="The harmonic mean of the precision and recall for the train set normal points.", - ) - - if f"current_model_test_metrics_{dataset}_{series}" not in st.session_state: - return - - current_metrics = st.session_state[f"current_model_test_metrics_{dataset}_{series}"] - prec = current_metrics["precision"] - rec = current_metrics["recall"] - f1 = current_metrics["f1"] - - if f"previous_model_test_metrics_{dataset}_{series}" in st.session_state: - old_metrics = st.session_state[f"previous_model_test_metrics_{dataset}_{series}"] - old_prec = old_metrics["precision"] - old_rec = old_metrics["recall"] - old_f1 = old_metrics["f1"] - - out_prec_diff = ( - (prec[1] - old_prec[1]).round(3) if (len(prec) == 2) and (len(old_prec) == 2) else None - ) - out_rec_diff = ( - (rec[1] - old_rec[1]).round(3) if (len(rec) == 2) and (len(old_rec) == 2) else None - ) - out_f1_diff = ( - (f1[1] - old_f1[1]).round(3) if (len(f1) == 2) and (len(old_f1) == 2) else None - ) - norm_prec_diff = ( - (prec[0] - old_prec[0]).round(3) if (len(prec) == 2) and (len(old_prec) == 2) else None - ) - norm_rec_diff = ( - (rec[0] - old_rec[0]).round(3) if (len(rec) == 2) and (len(old_rec) == 2) else None - ) - norm_f1_diff = ( - (f1[0] - old_f1[0]).round(3) if (len(f1) == 2) and (len(old_f1) == 2) else None - ) - - else: - out_prec_diff, out_rec_diff, out_f1_diff, norm_prec_diff, norm_rec_diff, norm_f1_diff = ( - None, - None, - None, - None, - None, - None, - ) - - if st.session_state[f"old_number_annotations_{dataset}_{series}"]["test_outlier"] > 0: - if st.session_state[f"old_number_annotations_{dataset}_{series}"]["test_normal"] > 0: - idx = 1 - else: - idx = 0 - with c3.expander("Test Set Outlier Metrics", expanded=True): - st.metric( - "Precision Score", - prec[idx], - delta=out_prec_diff, - delta_color="normal" if out_prec_diff != 0.0 else "off", - help=f"""The ratio of true predicted positives to total predicted positives for the test set outliers. - Represents the ability not to classify a normal sample as an outlier. - Your score of {prec[idx]} means that for the test set, {int(prec[idx] * 100)}% of your model's predicted outliers were correct (not labelled as normal points).""", - ) - st.metric( - "Recall Score", - rec[idx], - delta=out_rec_diff, - delta_color="normal" if out_rec_diff != 0.0 else "off", - help=f"""The ratio of true predicted positives to total positives for the test set outliers. - Represents the ability to correctly predict all the outliers. - Your score of {rec[idx]} means that for the test set, your model has correctly predicted {int(rec[idx] * 100)}% of the annotated outliers.""", - ) - st.metric( - "F1 Score", - f1[idx], - delta=out_f1_diff, - delta_color="normal" if out_f1_diff != 0.0 else "off", - help="The harmonic mean of the precision and recall for the outliers.", - ) - if st.session_state[f"old_number_annotations_{dataset}_{series}"]["test_normal"] > 0: - with c4.expander("Test Set Normal Metrics", expanded=True): - st.metric( - "Precision Score", - prec[0], - delta=norm_prec_diff, - delta_color="normal" if norm_prec_diff != 0.0 else "off", - help=f"""The ratio of true predicted positives to total predicted positives for the test set normal points. - Represents the ability not to classify an outlier sample as a normal point. - Your score of {prec[0]} means that for the test set, {int(prec[0] * 100)}% of your model's predicted normal points were correct (not labelled as outliers).""", - ) - st.metric( - "Recall Score", - rec[0], - delta=norm_rec_diff, - delta_color="normal" if norm_rec_diff != 0.0 else "off", - help=f"""The ratio of true predicted positives to total positives for the test set normal points. - Represents the ability to correctly predict all the normal points. - Your score of {rec[0]} means that for the test set, your model has correctly predicted {int(rec[0] * 100)}% of the annotated normal points.""", - ) - st.metric( - "F1 Score", - f1[0], - delta=norm_f1_diff, - delta_color="normal" if norm_f1_diff != 0.0 else "off", - help="The harmonic mean of the precision and recall for the normal points.", - ) - - -def model_choice_callback(dataset_name: str, series: str): - st.session_state["models_to_visualize"][dataset_name][series] = set( - st.session_state[f"model_choice_{dataset_name}_{series}"] - ) - - -def model_choice_options(dataset_name: str, series: str): - if (dataset_name == list(st.session_state["inference_results"].keys())[0]) and ( - series == list(st.session_state["inference_results"][dataset_name].keys())[0] - ): - st.info( - f"""Below you can choose from all models which have generated - predictions for this series. - Add them to the selection to visualize their results. - By default, the two most recently trained models are selected.""" - ) - st.multiselect( - "Choose models for dataset", - sorted(st.session_state["available_models"][dataset_name][series]), - key=f"model_choice_{dataset_name}_{series}", - default=sorted(st.session_state["models_to_visualize"][dataset_name][series]), - on_change=model_choice_callback, - args=(dataset_name, series), - max_selections=4, - ) - - st.markdown("***") - - -def outlier_visualization_options(dataset_name: str, series: str): - if not st.session_state["models_to_visualize"][dataset_name][series]: - return - - form = st.form( - f"form_{dataset_name}_{series}", - ) - form.info( - "Click on a bar in the distribution plot to view all outliers \ - in that time period. Each time period is chosen so it contains the same number of datapoints." - ) - c1, c2 = form.columns(2) - c1.slider( - "Number of datapoints per bar", - value=300, - min_value=10, - max_value=1000, - step=1, - key=f"num_outliers_{dataset_name}_{series}", - help="""Adjust the number of datapoints each bar represents.""", - ) - c2.slider( - "Height of figures (px)", - value=600, - min_value=100, - max_value=1500, - step=100, - key=f"figure_height_{dataset_name}_{series}", - ) - - form.checkbox( - "Only show time ranges containing outliers (predicted or annotated)", - key=f"only_show_ranges_with_outliers_{dataset_name}_{series}", - help="""Depending on how well a model is already trainied, there might be many - time ranges that do not contain any predictied outliers. - By setting this option, these ranges are not included on the x axis in the distribution plot.""", - ) - - state = get_as(dataset_name, series) - - c1, c2 = form.columns(2) - if state.test_outlier: - - c1.checkbox( - "Hightlight missed test set outliers", - value=True, - key=f"highlight_test_{dataset_name}_{series}", - help="""If this is set, time ranges that contain annotated test outliers - which the models did not classify as such are marked for easy identification.""", - ) - if state.outlier: - c2.checkbox( - "Hightlight missed train set outliers", - value=False, - key=f"highlight_train_{dataset_name}_{series}", - help="""If this is set, time ranges that contain annotated train outliers - which the models did not classify as such are marked for easy identification.""", - ) - - form.form_submit_button("Update Distribution Plot") - - -def show_feature_importances(base_obj=None): - obj = base_obj or st - state = get_as() - dataset = state.dataset - series = state.column - if f"last_model_name_{dataset}_{series}" not in st.session_state: - return - - with obj.expander("Feature Importances", expanded=True): - c1, c2 = st.columns([2, 1]) - feature_importance_plot(c1) - c2.dataframe(st.session_state[f"current_importances_{dataset}_{series}"]) - - -def add_slider_selected_points(dataset_name: str, model_name: str): - start, end = st.session_state[f"outlier_slider_{dataset_name}_{model_name}"] - coords = st.session_state[f"current_outlier_value_store"][dataset_name][model_name] - timestamps_to_add = {coords[i][0] for i in range(start, end + 1)} - state = get_as() - state.update_selected(timestamps_to_add) - - -def process_data_from_echarts_plot( - clicked_point: List | dict | None, dataset_name=None, series=None, base_obj=None -): - obj = base_obj or st - state = get_as(dataset_name, series) - was_updated = False - - if (clicked_point is None) or ((clicked_point[1] == "brush") and (not clicked_point[0])): - return - - # we want to select only the outlier series, not the datapoints series. - # This behaviour is set by a checkbox above the plot. It only effects area selection. - if ( - (clicked_point[1] == "brush") - and dataset_name - and series - and st.session_state[f"only_select_outliers_{dataset_name}_{series}"] - ): - model_names = sorted(st.session_state["models_to_visualize"][dataset_name][series]) - - relevant_outlier_idc = { - d["seriesName"]: d["dataIndex"] - for d in clicked_point[0] - if d["seriesName"] in model_names - } - relevant_data_points = [ - st.session_state["pred_outlier_tracker"][dataset_name][series][k] - .iloc[v] - .index.to_list() - for k, v in relevant_outlier_idc.items() - ] - relevant_data_points = set().union(*relevant_data_points) - was_updated = state.update_selected(relevant_data_points) - - else: - if clicked_point[1] == "click": - point_to_process = plot_return_value_as_datetime(clicked_point[0]) - if point_to_process: - if point_to_process not in state.selection: - was_updated = state.update_selected([point_to_process]) - else: - relevant_series = [s for s in clicked_point[0] if s["seriesName"] == "Datapoints"] - if not relevant_series: - return - relevant_data_idc = relevant_series[0]["dataIndex"] - was_updated = state.update_selected( - state.df_plot.iloc[relevant_data_idc].index.to_list() - ) - - if was_updated: - st.experimental_rerun() - - -def correction_options(dataset_name: str, series: str, base_obj=None): - obj = base_obj or st - obj.subheader("Prediction correction:") - - obj.info( - """Either select individual points in the above plot or use the area select options - (top right corner of the plot window) to select multiple points. Then add further - annotations to correct faulty model predictions.""" - ) - - # current_range = st.session_state[f"range_str_{dataset_name}"] - # df_current_counts = st.session_state[f"current_ranges_counts_{dataset_name}"] - - # model_names = sorted(st.session_state["models_to_visualize"][dataset_name]) - # for model_name in model_names: - # model_counts = df_current_counts.loc[current_range, model_name] - # if model_counts <= 1: - # continue - # # if not st.session_state[f"current_outlier_value_store"][dataset_name].get(model_name): - # # continue - # form = st.form(f"outlier_select_form_{dataset_name}_{model_name}") - # c1, c2, c3, c4 = form.columns([5, 7, 1, 2]) - # custom_text(model_name, 20, base_obj=c1) - # if model_counts > 1: - # c2.slider( - # model_name, - # min_value=1, - # max_value=model_counts, - # value=(1, model_counts), - # label_visibility="collapsed", - # key=f"outlier_slider_{dataset_name}_{model_name}", - # ) - # c4.form_submit_button( - # "Select Outlier points", - # on_click=add_slider_selected_points, - # args=(dataset_name, model_name), - # ) - - c1, c2, c3, c4 = obj.columns(4) - - state = get_as(dataset_name, series) - - c1.button( - "Mark Train Outlier", - on_click=state.update_data, - args=("outlier",), - kwargs={"base_obj": obj}, - key=f"pred_mark_outlier_{dataset_name}_{series}", - ) - c2.button( - "Mark Train Normal", - on_click=state.update_data, - args=("normal",), - kwargs={"base_obj": obj}, - key=f"pred_mark_normal_{dataset_name}_{series}", - ) - c3.button( - "Mark Test Outlier", - on_click=state.update_data, - args=("test_outlier",), - kwargs={"base_obj": obj}, - key=f"pred_mark_test_outlier_{dataset_name}_{series}", - ) - c4.button( - "Mark Test Normal", - on_click=state.update_data, - args=("test_normal",), - kwargs={"base_obj": obj}, - key=f"pred_mark_test_normal_{dataset_name}_{series}", - ) - c1.button( - "Clear Selection", - on_click=state.clear_selection, - key=f"pred_clear_selection_{dataset_name}_{series}", - ) - # c3.button("Clear All", on_click=state.clear_all) diff --git a/tsod/active_learning/data_structures.py b/tsod/active_learning/data_structures.py deleted file mode 100644 index 3d3ce33..0000000 --- a/tsod/active_learning/data_structures.py +++ /dev/null @@ -1,156 +0,0 @@ -from collections import defaultdict -import datetime -from typing import Sequence -import pandas as pd -import streamlit as st - - -class AnnotationState: - def __init__(self, dataset: str, column: str) -> None: - self.df = st.session_state["data_store"][dataset][column] - self.dataset = dataset - self.column = column - self.data = defaultdict(set) - self.df_selected = pd.DataFrame() - self.df_outlier = pd.DataFrame() - self.df_normal = pd.DataFrame() - self.df_test_outlier = pd.DataFrame() - self.df_test_normal = pd.DataFrame() - self.df_plot = pd.DataFrame() - self.start: datetime.datetime | None = None - self.end: datetime.datetime | None = None - self._download_data = {} - - start_time = self.df.sort_index().index[-min(200, len(self.df))] - end_time = self.df.index.max() - - self.update_plot(start_time, end_time) - - @property - def all_indices(self): - return self.selection.union(self.outlier, self.normal, self.test_outlier, self.test_normal) - - @property - def selection(self) -> set: - return self.data["selected"] - - @property - def outlier(self) -> set: - return self.data["outlier"] - - @property - def normal(self) -> set: - return self.data["normal"] - - @property - def test_outlier(self) -> set: - return self.data["test_outlier"] - - @property - def test_normal(self) -> set: - return self.data["test_normal"] - - def update_selected(self, data: Sequence): - to_add = {plot_return_value_as_datetime(e) for e in set(data)} - if not to_add.issubset(self.data["selected"]): - self.data["selected"].update(to_add) - self._update_df("selected") - self._update_plot_df("selected") - return True - return False - - def update_data(self, key: str, data_to_add: Sequence | None = None, base_obj=None): - obj = base_obj or st - - _data = ( - {plot_return_value_as_datetime(e) for e in set(data_to_add)} - if data_to_add - else self.selection - ) - - for k, stored_data in self.data.items(): - if (k == key) or (k == "selected"): - continue - if _data.intersection(stored_data): - obj.warning( - f"Some of the selected points have already been marked as {k} and were overwritten." - ) - self.data[k] = self.data[k] - _data - self._update_df(k) - self._update_plot_df(k) - - if not _data.issubset(self.data[key]): - self.data[key].update(_data) - self._update_df(key) - self._update_plot_df(key) - if not data_to_add: - self.clear_selection() - - def update_plot( - self, start_time: datetime.datetime | None = None, end_time: datetime.datetime | None = None - ): - if not start_time: - start_time = self.start - if not end_time: - end_time = self.end - if ( - (not self.start and not self.end) - or (start_time != self.start) - or (end_time != self.end) - ): - self.df_plot = self.df[self.df.index.to_series().between(start_time, end_time)] - self.start = self.df_plot.index.min() - self.end = self.df_plot.index.max() - - for key in self.data: - self._update_plot_df(key) - - def _update_plot_df(self, key: str): - if key not in self.data: - raise ValueError(f"Key {key} not found.") - - setattr(self, f"df_plot_{key}", self.df_plot[self.df_plot.index.isin(self.data[key])]) - - def clear_selection(self): - self.data["selected"].clear() - self._update_df("selected") - self._update_plot_df("selected") - - def clear_all(self): - for key in self.data: - self.data[key].clear() - self._update_df(key) - self._update_plot_df(key) - - def _update_df(self, key: str): - if key not in self.data: - raise ValueError(f"Key {key} not found.") - - new_df = self.df[self.df.index.isin(self.data[key])] - - setattr(self, f"df_{key}", new_df) - if new_df.empty: - self._download_data.pop(key, None) - else: - self._download_data[key] = new_df - - -def plot_return_value_as_datetime(value: str | int | datetime.datetime) -> datetime.datetime: - # Plotly sometimes returns selected points as timestamp - if isinstance(value, int): - return datetime.datetime.fromtimestamp(value / 1000) - # also sometimes as strings - elif isinstance(value, str): - try: - return datetime.datetime.strptime(value, "%Y-%m-%d %H:%M") # Plotly return format - except ValueError: - pass - try: - return datetime.datetime.strptime( - value, "%Y-%m-%d" - ) # Plotly return format for midnight values - except ValueError: - pass - return datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%S") # Pyecharts return format - else: - return value diff --git a/tsod/active_learning/instructions.py b/tsod/active_learning/instructions.py deleted file mode 100644 index 85dd7eb..0000000 --- a/tsod/active_learning/instructions.py +++ /dev/null @@ -1,310 +0,0 @@ -import streamlit as st -from tsod.active_learning import MEDIA_PATH - - -def general(): - st.markdown( - """ -Welcome to the Time Series Outlier Detection web app! -This project is a human-in-the-loop system for detecting outliers in time series, -allowing for the cleaning of noisy datasets, which is often a requirement before the dataset can be used further (e.g. for training ML-models). - -### General notes -- This project implements a quite simple workflow. If you are unsure about what the use of a widget / field is, either hover your mouse above the little question mark next to it, or check the documentation. -- This app is currently a simple web app without user identification or saving of intermediate results. That means that if you refresh the page, you will start from scratch again. However, the app does allow for the download and upload of annotations, models and datasets so you may use them again at a later time. -- It is possible to upload multiple datasets containing multiple series (columns) each. You can add annotations and train models on every individual series. Is is currently not possible to train a single model using features from multiple series (multivariate outlier prediction). However, this is one of the possible future improvements. -### Recommended workflow - """ - ) - - st.image(str(MEDIA_PATH / "workflow.png"), use_column_width=True) - - st.markdown( - """ - There are several ways of training your outlier detection models. Which one of those works best depends very much on your use case, however here are a few general guidelines. For more details on each step, please find the designated page instructions in the other tabs. - -1. Upload your data (under *Data Upload* in the sidebar in the *Outlier Annotation*-page). There are a number of formats supported, see the instructions on Outlier Annotation. -2. Add some annotations for one of your series. If you have previously annotated and saved that series, remember to upload your annotations file from disk (under *Save / load previous* in the sidebar in the *Outlier Annotation*-page). No need to add too many annotations in the first iteration, better to train a model quickly to gain insights into what the model has learned. -3. Head to the page *Model Training*, choose a modelling method (currently only Random Forest Classifier is implemented) and choose some parameters (most of the modelling choices are abstracted away on purpose). Click on *Train Outlier Model* on the bottom of the sidebar to train an initial model. -4. After a short amount of time you will see a brief training summary including train set metrics, (if defined: test set metrics) and feature importances. The next step is to head over to the *Model Prediction*-page to judge the quality of the model. -5. By default, model predictions for the entire training series are generated when training a model. On the prediction page, you can use any model to generate predictions on any of your datasets/series. Once you have some predictions, you will see them visualized in the main window. As there can be many predicted outliers (especially for earlier models), the predictions are summarized in the outlier distribution bar plot. Each bar represents a time window containing an equal number of datapoints and the height of the bar shows you how many outliers each model predicts in that window. Click on a bar of interest. -6. You can now see the predicted outliers in a new plot underneath. Try to identify patterns of faulty prediction and generate some new annotations by correcting them directly in the graph. You can also add individual annotations, just like in step 2). -7. Alternatively, you can also generate further annotations by heading the the *Annotation Suggestion*-page. There you are prompted to give simple yes or no answer for selected points (based on model uncertainty). -8. After having added some further annotations, you can train another model iteration. For this, either head back to the *Model Training*-page. If you don't want to change any model parameters, you can also click on 'Retain most recent model with new data' (available on both the *Model Prediction* and the *Annotation suggestion*-page). This will train a new model using the same parameters as before, generate new predictions and bring you to the prediction page for comparison. -9. Repeat the circle of adding annotations, retraining and evaluating the results until you are satisfied. -10. To remove outliers from any of your datasets/series, head to the *Data Download*-page. There you can create new datasets by removing predicted outliers, as well as download any dataset you have uploaded/created. - """ - ) - - -def outlier_annotation(): - - st.markdown( - """ - The *Outlier Annotation*-page is designated to the manual adding of annotations to any series. As the "entrypoint" of the app, it also holds the functionality to upload datasets. -The main window will always only contain an interactive plot window. In the sidebar, you'll find all widgets related to interacting with the annotation process. -""" - ) - st.markdown("***") - c1, c2 = st.columns([3, 1]) - - c1.markdown( - """ -### Uploading Data - -The first field in the sidebar allows you to upload your datasets. For trying out the app, you can also click on 'Add generated data' to add a toy dataset with two random series. - -Currently, the following file formats are supported for uploading your data from disk: -- CSV -- XLSX / XLS -- DFS0 - -If your dataset is split into multiple files, you select all files and they will be merged into a single dataset. However, the data needs to be consistent (may not contain multiple values for the same timestamp for the same series). A variety of different timestamp formats are supported. -Optionally, you can give your dataset a name for easy identification, otherwise it will receive a handle based on the names of the uploaded files. -To finish, click on 'Upload'. Once your files have been validated and merged, you will be able to select your dataset under *Data Selection* in the sidebar. - -""" - ) - c2.image( - str(MEDIA_PATH / "data_upload.png"), - use_column_width=True, - # width=400, - caption="The 'Data Upload' field.", - ) - st.markdown("***") - c1, c2 = st.columns([3, 1]) - c1.markdown( - """ -### Main plot window - -The main plot window will display the selected series for the selected time interval. By default, the plot will contain a series displaying only your actual datapoints, as well as a series containing the connecting line. This is purely for convenience reasons, individual points are always selected by clicking on the datapoints. -As soon as you selected at least one point, you will see your selection marked in purple (also a new entry will be added in the legend). -Once you add annotations, they will be marked as a new series as well. -To select multiple neighboring points at once, it is easier to use the 'Horizontally Select' or 'Box Select' - options by activating them in the top right corner of the plot window. - """ - ) - c2.image( - str(MEDIA_PATH / "selection_options.png"), - use_column_width=True, - caption="You can change data selection modes in the top right corner of the main plot window.", - ) - st.markdown("***") - c1, c2 = st.columns([3, 1]) - - c1.markdown( - """ -### Annotation Controls - -**Actions** - -This fields allows you to choose what to do with your selection. Your selection has two label options (Outlier or Normal) and can be assigned to either the train or the test set. -'Clear Selection' removes your entire selection, but keeps all points annotated thus far. 'Clear All' resets your annotation state, removing selected as well as annotated points. - -**Time Range Selection** - -This field offers control over the time range that is displayed in the main plot window. -In order to assure that the app works with datasets on any time scale (nanoseconds to decades), when first loading in a new dataset, a time range will automatically be chosen so that the main plot contains the last 200 points of data. -If your selected series spans a time range of more than a day, a calender widget will be available to select start & end date, as well as two time widgets for setting start & end time. -You can also directly set the number of datapoints the plot should contain, using the number input on the bottom of the field. -'Show All' will display the entire series (not recommended for large number of points). -The 'Shift back' and 'Shift forward' - buttons are useful for stepping through your data in equal time steps. As any initial visualization starts at the end of the timestamp index, clicking on 'Shift back' will determine the current range that is being displayed and then update the plot backwards in time, keeping the range equal (the previous start timestamp will become the new end timestamp). -**Recommended workflow for stepping through your dataset:** -Select an appropriate time range that makes outliers easily visible for you => step through the dataset using the Shift buttons and add annotations. - -**Save / load previous** - -To continue annotations in a different session, you might want to download your current progress. Click the 'Download Annotations' - button to save your annotations for the current dataset to disk as binary data. -Use the 'Upload Annotations' - uploader to add previously created annotations. This assumes that you already have the correct dataset loaded, the actual data is not saved together with the annotations. - """ - ) - c2.image( - str(MEDIA_PATH / "time_range_selection.png"), - use_column_width=True, - caption="Select a time range that fits your data and lets you easily identify outliers. Then step through your dataset while keeping that range.", - ) - - -def model_training(): - st.markdown( - """ - The *Model Training*-page is designated to choosing modelling methods & hyperparameters, training models and evaluating their performances. In the sidebar, you'll find all widgets related to the training process. - On top of the main window, you will see a short annotation summary. If you have already trained a model, you will also be able to see how many annotation points where added since the last model was trained. -After training a new model, train metrics will show underneath (& test metrics if defined). Precision, Recall & F1 scores are shown separately for annotated outliers and normal points. If you have already trained a model, you will also be able to see how the metrics have changed compared to the previous model. -Underneath that, you'll find a plot & table showing some of the feature importances of the last trained model. If you have already trained a model, you will also be able to see how the importances have changed compared to the previous model. -""" - ) - st.markdown("***") - st.markdown( - """ - ### Training Controls - -**Data Selection:** -If you have annotated multiple datasets or one dataset containing multiple series, you can choose your training series here. By default, the last series you have added annotations for is selected. - -**Modelling Options:** -Choose which modelling approach to use to predict outliers (more below). - -**Feature Options:** -Control some basic, model-specific feature parameters (more below): - -To start training, click on 'Train Outlier Model' on the bottom of the sidebar. -The checkbox 'Auto generate predictions for entire annotation series' (selected by default) simply means that after training, the newly trained model will be used to create predictions for the entire selected series. This is just a convenience option, as this would be the most common next step in the suggested workflow anyway. - -### Modelling Options - -Right now, only one supervised learning method is implemented for outlier prediction. More are to follow in the future. -For each annotated point (outlier or normal), a set of features is generated that is fed to the model to predict whether the point is an outlier or not. - - -**Random Forest Classifier** - -[scikit-learn docs - RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) -[scikit-learn docs - RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) - -The feature set is constructed by taking the x points before the annotated point, the y points after and normalizing them using the value of the annotated point. This way the model can only look at the relative changes of a series leading up to or after the point in question. -If you are interested in building any kind of 'real time' outlier detection model, set the number of points after to 0 (default). -The model using the feature set is a simple scikit-learn Random Forest Classifier. Behind the scenes, the random seeds are fixed for reproducible results and a small hyperparameter search is performed for cross validation. -Training times depend on the number of annotated points, but should generally never be longer than a minute. - """ - ) - - -def model_prediction(): - st.markdown( - """ - The *Model Prediction*-page is designated to creating & visualizing the predictions of any trained or uploaded model on any series. In the sidebar, you'll find all widgets related the generation of predictions. - The main window serves the visualizations for each selected series, while the sidebar is used to generate new predictions. -""" - ) - st.markdown("***") - st.markdown( - """ -## Prediction Controls (sidebar) - -It is recommended to generate predictions whenever training a new model (default). However, it is also possible to upload previously trained models and generate predictions for any uploaded series. -This allows for several potentially interesting workflows, such as uploading a test set as a separate dataset or evaluating the capabilities of a model on a completely different dataset. If you have previously created a capable model for your use case, you can skip the annotation and training processes and jump straight to the model prediction page, upload your model and clean your datasets. -Use the controls in the sidebar to select one or multiple models trained this session and/or upload models from disk. Then add individual series from any of your uploaded datasets for which you would like these models to generate predictions. -*** - """ - ) - - st.markdown( - """ - ## Prediction visualization (main window) - -Each series you have generated predictions for will appear underneath each other in main window, each with its own set of visualization options and graphs. - """ - ) - st.image( - str(MEDIA_PATH / "distribution_to_prediction_plot.png"), - use_column_width=True, - caption="The distribution plot (left) summarizes the predicted outlier distribution. Clicking on any of the bar groups will bring up the outlier plot (right) for that time range underneath.", - ) - - c1, c2 = st.columns([2, 1]) - - c1.markdown( - """ -### Visualization Options - - -1. In the multiselect at the top you will be able to choose up to 4 models that have been used to generate predictions for this series. By default (when a new set of predictions is added), the two most current models will be selected here, as this comparison is usually the most relevant (comparing the predictions of the most recent model to those of the previous one). - Underneath, you will then see a small summary table, showing for each selected model: - - The model name - - A 'Remove'-button to quickly remove that model from the selection - - The parameters of the model - - The number of predicted outliers for the series - - The number of predicted normal points for the series - - A color picker to choose which color that model should be shown in in the following graphs - -2. Each bar in the following distribution plot represents the number of predicted outliers for a given model. In order to not display too many points at once, the series is split into multiple parts containing an equal number of datapoints. That number can be adjusted using this slider. - -3. Set the height of both the distribution plot and the outlier plot (in pixel). - -4. If there are many segments of the distribution plot containing no predicted outliers, activate this checkbox to only display bars where there are predicted outliers. - -5. If any of these checkboxes are set, annotated train or test outliers that where not predicted to be outliers by the model will be highlighted by a blinking symbol in the distribution plot. This is useful for identifying at a glance where predictions are deviating from annotations. - - """ - ) - - c2.image( - str(MEDIA_PATH / "model_visualization.png"), - use_column_width=True, - caption="Customization options for visualizing model predictions.", - ) - c1, c2 = st.columns([2, 1]) - c1.markdown( - """ -### Prediction Correction - -You can use the outlier plots themselves to add further annotations, based on your model predictions. For this, you have several options to select points: - -- Click on predictions markers or normal datapoints for individual selection -- Utilize the Box- or Horizontal selection modes to select multiple prediction markers at once (default). This way when drawing a box, you will only select points where at least one model made a prediction. This is useful to quickly correct multiple faulty predictions -- By unchecking the checkbox 'Area select: Only select predicted outliers' above the outlier plot, you can also select any datapoints within the range of a Box- or Horizontal select. - -Underneath the outlier plot, you will find another set of annotation buttons which you can use to label your selected points. - """ - ) - - c1, c2 = st.columns([2, 1]) - c1.markdown( - """ - ### Retraining - -For convenience, the 'Retrain most recent model with new data' - button exists in order to retrain the most recent model (using the same parameters) and generate new predictions using that model. This allows for quicker iterations. - - """ - ) - c2.image( - str(MEDIA_PATH / "retrain.png"), - use_column_width=True, - caption="The retrain button allows for quicker iterations, without having to switch pages.", - ) - - -def annotaion_suggestion(): - st.markdown( - """ - The *Annotation Suggestion*-page is designated to presenting the user with interesting points to annotate, using a simple yes-no-dialogue. - For Random Forest Classifiers, the order of presented points is determined by the degree of disagreement of the individual decision trees (points with the most disagreement first). - In the sidebar, you can choose from any model trained in the current session. Points are always drawn from the series the model was trained on. - You can also set the number of neighboring points to display in the plot. - """ - ) - - st.image( - str(MEDIA_PATH / "annotation_suggestion.png"), - use_column_width=True, - caption="In the annotation suggestion page, you are prompted for annotations of specific points.", - ) - - -def data_download(): - st.markdown( - """ -The *Data Download*-page is designated to the removal of outliers from your datasets and the download of your cleaned data. -In the sidebar, choose a dataset. If the dataset has multiple series, you can then choose which of its series to clean. The final dataset will always contain all its original series, regardless of which series you choose. -Next pick any of your trained or uploaded models to use for the cleaning. -Currently, there are two methods available for handling the predicted outliers: Either deleting them from the series completely or performing linear interpolation. -Click on 'Preview' to generate and review 3 random samples that show you effect of removing the outliers in the chosen way. -After reviewing, you can then add your cleaned data as a new dataset, either to download right away or to clean further using another model. You could also start an new annotation process for the newly created dataset. -Finally, at the bottom of the sidebar, you'll have the option to download any of your session dataset to disk, either as a .csv or a .xlsx file. - """ - ) - st.image( - str(MEDIA_PATH / "data_download.png"), - use_column_width=True, - caption="Before saving your cleaned data, you may preview your chosen method of removing outliers.", - ) - - -INSTRUCTION_DICT = { - "General": general, - "Outlier Annotation": outlier_annotation, - "Model Training": model_training, - "Model Prediction": model_prediction, - "Annotation Suggestion": annotaion_suggestion, - "Data Download": data_download, -} diff --git a/tsod/active_learning/media/annotation_suggestion.png b/tsod/active_learning/media/annotation_suggestion.png deleted file mode 100644 index 6b55969..0000000 Binary files a/tsod/active_learning/media/annotation_suggestion.png and /dev/null differ diff --git a/tsod/active_learning/media/data_download.png b/tsod/active_learning/media/data_download.png deleted file mode 100644 index 8fe4c5f..0000000 Binary files a/tsod/active_learning/media/data_download.png and /dev/null differ diff --git a/tsod/active_learning/media/data_upload.png b/tsod/active_learning/media/data_upload.png deleted file mode 100644 index 0291e89..0000000 Binary files a/tsod/active_learning/media/data_upload.png and /dev/null differ diff --git a/tsod/active_learning/media/distribution_to_prediction_plot.png b/tsod/active_learning/media/distribution_to_prediction_plot.png deleted file mode 100644 index f71c8d0..0000000 Binary files a/tsod/active_learning/media/distribution_to_prediction_plot.png and /dev/null differ diff --git a/tsod/active_learning/media/model_visualization.png b/tsod/active_learning/media/model_visualization.png deleted file mode 100644 index def092a..0000000 Binary files a/tsod/active_learning/media/model_visualization.png and /dev/null differ diff --git a/tsod/active_learning/media/retrain.png b/tsod/active_learning/media/retrain.png deleted file mode 100644 index 11523ba..0000000 Binary files a/tsod/active_learning/media/retrain.png and /dev/null differ diff --git a/tsod/active_learning/media/selection_options.png b/tsod/active_learning/media/selection_options.png deleted file mode 100644 index 041e343..0000000 Binary files a/tsod/active_learning/media/selection_options.png and /dev/null differ diff --git a/tsod/active_learning/media/time_range_selection.png b/tsod/active_learning/media/time_range_selection.png deleted file mode 100644 index 6c2d1b9..0000000 Binary files a/tsod/active_learning/media/time_range_selection.png and /dev/null differ diff --git a/tsod/active_learning/media/workflow.png b/tsod/active_learning/media/workflow.png deleted file mode 100644 index 66d739a..0000000 Binary files a/tsod/active_learning/media/workflow.png and /dev/null differ diff --git a/tsod/active_learning/modelling.py b/tsod/active_learning/modelling.py deleted file mode 100644 index deb3d10..0000000 --- a/tsod/active_learning/modelling.py +++ /dev/null @@ -1,402 +0,0 @@ -import datetime -import logging -from typing import Dict, List -from zoneinfo import ZoneInfo - -import numpy as np -import pandas as pd -import streamlit as st -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import precision_recall_fscore_support -from sklearn.model_selection import RandomizedSearchCV - -from tsod.active_learning.utils import ( - get_as, - recursive_length_count, - recursive_round, - set_session_state_items, -) - - -def get_neighboring_points( - indices: List, - data_column: str, - points_before: int, - points_after: int, - full_df: pd.DataFrame, - column_for_normalization: str | None = None, -) -> List[List]: - """ - Given a list of datetime indices, returns the number_neighbors points before and - after each point (from full_df). - If column_for_normalization is set, every neighbor's value is divided by this columns value at the corresponding index. - """ - df: pd.DataFrame = full_df.reset_index(names="date") - - df = df[~df[data_column].isna()] - - if column_for_normalization and (column_for_normalization not in df.columns): - raise ValueError(f"Column {column_for_normalization} not found.") - - mask = df["date"].isin(indices) - - all_neighbors: List[List] = [] - - # Only access df where it matches indices for loop - for i in df.loc[mask].index: - sample_neighbors = [] - normalization_value = ( - df.loc[i, column_for_normalization] if column_for_normalization else 1.0 - ) - if abs(normalization_value) < 1e-4: - if normalization_value < 0: - normalization_value = -1e-4 - else: - normalization_value = 1e-4 - for i_2 in range(i - points_before, i + points_after + 1): - try: - if i_2 != i: - # Access full df for neighboring values - sample_neighbors.append(df.loc[i_2, data_column] / normalization_value) - except KeyError: - sample_neighbors.append(1.0) - - all_neighbors.append(sample_neighbors) - - return all_neighbors - - -def get_class_labels_RF(points_before: int, points_after: int): - class_labels = [f"t-{i}" for i in reversed(range(1, points_before + 1))] - class_labels.extend([f"t+{i}" for i in range(1, points_after + 1)]) - - return class_labels - - -def construct_training_data_RF(dataset: str | None = None, series: str | None = None): - - points_before = ( - st.session_state["number_points_before"] - if st.session_state.get("number_points_before") is not None - else st.session_state["model_library"][ - st.session_state[f"last_model_name_{dataset}_{series}"] - ]["params"]["points_before"] - ) - points_after = ( - st.session_state["number_points_after"] - if st.session_state.get("number_points_after") is not None - else st.session_state["model_library"][ - st.session_state[f"last_model_name_{dataset}_{series}"] - ]["params"]["points_after"] - ) - st.session_state["last_points_before"] = points_before - st.session_state["last_points_after"] = points_after - state = get_as(dataset, series) - outliers = state.df_outlier - if outliers.empty: - return - normal = state.df_normal - - features = [] - labels = [] - features.extend( - get_neighboring_points( - indices=outliers.index.to_list(), - data_column=state.column, - points_before=points_before, - points_after=points_after, - full_df=state.df, - column_for_normalization=state.column, - ) - ) - features.extend( - get_neighboring_points( - indices=normal.index.to_list(), - data_column=state.column, - points_before=points_before, - points_after=points_after, - full_df=state.df, - column_for_normalization=state.column, - ) - ) - - labels.extend([1] * len(outliers)) - labels.extend([0] * len(normal)) - - features = pd.DataFrame(features, columns=get_class_labels_RF(points_before, points_after)) - labels = np.array(labels) - - st.session_state["features"] = features - st.session_state["labels"] = labels - - -def construct_test_data_RF(dataset: str | None = None, series: str | None = None): - points_before = ( - st.session_state["number_points_before"] - if st.session_state.get("number_points_before") is not None - else st.session_state["model_library"][ - st.session_state[f"last_model_name_{dataset}_{series}"] - ]["params"]["points_before"] - ) - points_after = ( - st.session_state["number_points_after"] - if st.session_state.get("number_points_after") is not None - else st.session_state["model_library"][ - st.session_state[f"last_model_name_{dataset}_{series}"] - ]["params"]["points_after"] - ) - - state = get_as(dataset, series) - dataset = state.dataset - series = state.column - outliers = state.df_test_outlier - normal = state.df_test_normal - - if outliers.empty and normal.empty: - return - - features = [] - labels = [] - features.extend( - get_neighboring_points( - outliers.index.to_list(), - state.column, - points_before, - points_after, - state.df, - state.column, - ) - ) - features.extend( - get_neighboring_points( - normal.index.to_list(), - state.column, - points_before, - points_after, - state.df, - state.column, - ) - ) - labels.extend([1] * len(outliers)) - labels.extend([0] * len(normal)) - - class_labels = [f"t-{i}" for i in reversed(range(1, points_before + 1))] - class_labels.extend([f"t+{i}" for i in range(1, points_after + 1)]) - - features = pd.DataFrame(features, columns=class_labels) - labels = np.array(labels) - - st.session_state[f"test_features_{dataset}_{series}"] = features - st.session_state["test_labels"] = labels - - return True - - -def train_model(base_obj=None, dataset: str | None = None, series: str | None = None): - obj = base_obj or st - - state = get_as(dataset, series) - if recursive_length_count(state.data, exclude_keys="selected"): - st.session_state[f"old_number_annotations_{state.dataset}_{state.column}"] = { - k: len(v) for k, v in state.data.items() - } - - with st.spinner("Constructing features..."): - if st.session_state["last_method_choice"] == "RF_1": - construct_training_data_RF(dataset, series) - construct_test_data_RF(dataset, series) - with st.spinner("Training Model..."): - if st.session_state["last_method_choice"] == "RF_1": - train_random_forest_classifier(obj, dataset, series) - - logging.info("A new model was trained successfully.") - - -def train_random_forest_classifier( - base_obj=None, dataset: str | None = None, series: str | None = None -): - obj = base_obj or st - - state = get_as(dataset, series) - dataset = state.dataset - series = state.column - - if "features" not in st.session_state: - obj.warning("No features were created, not training a model.") - return - X = st.session_state["features"] - y = st.session_state["labels"] - - rfc = RandomForestClassifier(random_state=30) - - forest_params = { - "max_depth": [int(x) for x in np.linspace(10, 30, num=3)] + [None], - "max_features": ["sqrt", "log2"], - "n_estimators": [int(x) for x in np.linspace(start=100, stop=1000, num=10)], - "min_samples_split": [2, 4], - "bootstrap": [True, False], - } - clf = RandomizedSearchCV( - estimator=rfc, - param_distributions=forest_params, - cv=3, - n_iter=10, - n_jobs=-1, - verbose=0, - random_state=30, - ) - - clf.fit(X, y) - - model_name = f"RF_Classifier ({datetime.datetime.now(tz=ZoneInfo('Europe/Copenhagen')).strftime('%Y-%m-%d %H:%M:%S')})" - st.session_state[f"last_model_name_{dataset}_{series}"] = model_name - st.session_state["most_recent_model"] = model_name - st.session_state["models_trained_this_session"].update([model_name]) - # Update Test metrics - if f"current_model_test_metrics_{dataset}_{series}" in st.session_state: - st.session_state[f"previous_model_test_metrics_{dataset}_{series}"] = st.session_state[ - f"current_model_test_metrics_{dataset}_{series}" - ] - st.session_state[f"previous_model_train_metrics_{dataset}_{series}"] = st.session_state[ - f"current_model_train_metrics_{dataset}_{series}" - ] - - model = clf.best_estimator_ - - train_preds = model.predict(X) - train_prec, train_rec, train_f1, train_support = precision_recall_fscore_support( - y, train_preds, zero_division=1 - ) - st.session_state[f"current_model_train_metrics_{dataset}_{series}"] = recursive_round( - {"precision": train_prec, "recall": train_rec, "f1": train_f1} - ) - - if f"test_features_{dataset}_{series}" in st.session_state: - X_test = st.session_state[f"test_features_{dataset}_{series}"] - y_test = st.session_state["test_labels"] - test_preds = model.predict(X_test) - test_prec, test_rec, test_f1, support = precision_recall_fscore_support( - y_test, test_preds, zero_division=1 - ) - st.session_state[f"current_model_test_metrics_{dataset}_{series}"] = recursive_round( - {"precision": test_prec, "recall": test_rec, "f1": test_f1} - ) - - if f"current_importances_{dataset}_{series}" in st.session_state: - st.session_state[f"previous_importances_{dataset}_{series}"] = st.session_state[ - f"current_importances_{dataset}_{series}" - ] - - df_fi = pd.DataFrame( - [ - {"Feature": feat, "Feature importance": imp.round(3)} - for feat, imp in zip(model.feature_names_in_, model.feature_importances_) - ], - ).sort_values("Feature importance", ascending=False) - - st.session_state[f"current_importances_{dataset}_{series}"] = df_fi - - st.session_state["model_library"][model_name] = { - "model": model, - "type": st.session_state["last_method_choice"], - "params": { - "points_before": st.session_state["last_points_before"], - "points_after": st.session_state["last_points_after"], - }, - "trained_on_dataset": dataset, - "trained_on_series": series, - } - - st.session_state["prediction_models"][model_name] = st.session_state["model_library"][ - model_name - ] - - st.session_state["prediction_data"][dataset] = [series] - - -def get_model_predictions(base_obj=None): - obj = base_obj or st - models: Dict[str, RandomForestClassifier] = st.session_state["prediction_models"] - datasets: Dict[str, pd.DataFrame] = st.session_state["prediction_data"] - if (not models) or (not datasets): - obj.error("Please add at least one model and one data file.") - return - - get_model_predictions_RF() - - -def get_model_predictions_RF(): - model_dicts: Dict[str, Dict] = { - k: v for k, v in st.session_state["prediction_models"].items() if v["type"] == "RF_1" - } - datasets = st.session_state["prediction_data"] - - if (not model_dicts) or (not datasets): - return - - params = [d["params"] for d in model_dicts.values()] - start_features = max([p["points_before"] for p in params]) - end_features = max([p["points_after"] for p in params]) - - for dataset_name, series_list in datasets.items(): - # for dataset_name, ds in datasets.items(): - ds = st.session_state["data_store"][dataset_name] - for series in series_list: - df_series = ds[series] - if ( - (series not in st.session_state["uploaded_ds_features"][dataset_name]) - or (start_features > st.session_state["RF_features_computed_start"]) - or (end_features > st.session_state["RF_features_computed_end"]) - ): - with st.spinner(f"{dataset_name} - {series}: Constructing dataset features..."): - features = get_neighboring_points( - indices=df_series.index, - data_column=series, - points_before=start_features, - points_after=end_features, - full_df=df_series, - column_for_normalization=series, - ) - - feature_names = get_class_labels_RF(start_features, end_features) - features = pd.DataFrame(features, columns=feature_names) - st.session_state["uploaded_ds_features"][dataset_name][series] = features - st.session_state["RF_features_computed_start"] = start_features - st.session_state["RF_features_computed_end"] = end_features - - with st.spinner(f"{dataset_name} - {series}: Getting model results..."): - if series not in st.session_state["inference_results"][dataset_name]: - st.session_state["inference_results"][dataset_name][series] = df_series.copy( - deep=True - ) - for model_name, model_data in model_dicts.items(): - if model_name in st.session_state["inference_results"][dataset_name][series]: - continue - # st.session_state["models_to_visualize"][dataset_name].update([model_name]) - # select relevant subset of ds features, based on what the model needs - number_model_features_before = model_data["params"]["points_before"] - number_model_features_after = model_data["params"]["points_after"] - relevant_model_columns = get_class_labels_RF( - number_model_features_before, number_model_features_after - ) - model_feautures = st.session_state["uploaded_ds_features"][dataset_name][ - series - ][relevant_model_columns] - results = model_data["model"].predict(model_feautures) - probas = model_data["model"].predict_proba(model_feautures) - st.session_state["inference_results"][dataset_name][series][ - model_name - ] = results - st.session_state["inference_results"][dataset_name][series][ - f"certainty_{model_name}" - ] = np.abs( - probas[:, 0] - 0.5 - ) # lower = more uncertain - st.session_state["number_outliers"][dataset_name][series][model_name] = len( - results.nonzero()[0].tolist() - ) - st.session_state["available_models"][dataset_name][series].update([model_name]) - - st.session_state["models_to_visualize"][dataset_name][series] = set( - sorted(st.session_state["available_models"][dataset_name][series])[-2:] - ) diff --git a/tsod/active_learning/plotting.py b/tsod/active_learning/plotting.py deleted file mode 100644 index 4b25d21..0000000 --- a/tsod/active_learning/plotting.py +++ /dev/null @@ -1,626 +0,0 @@ -import copy -import datetime -from typing import List -import numpy as np -import pandas as pd -import plotly.express as px -import plotly.graph_objs as go -import streamlit as st -from pyecharts import options as opts -from pyecharts.charts import Line, Bar, Scatter, EffectScatter -from streamlit_echarts import st_pyecharts -from tsod.active_learning.utils import get_as - -ANNOTATION_COLORS = { - "selected": "#8c259a", - "outlier": "#e60b0b", - "normal": "#3fc762", - "test_outlier": "#fd7c99", - "test_normal": "#0fefc7", -} -MARKER_SIZES = {"selected": 10, "outlier": 12, "normal": 12, "test_outlier": 12, "test_normal": 12} -MARKER_VALUES = { - "selected": "S", - "outlier": "O", - "normal": "N", - "test_outlier": "TO", - "test_normal": "TN", -} -MARKER_HOVER = { - "selected": "Selected Point", - "outlier": "Training Outlier", - "normal": "Traning Normal", - "test_outlier": "Test Outlier", - "test_normal": "Test Normal", -} - - -@st.cache(persist=True, max_entries=100, show_spinner=False) -def cachable_get_outlier_counts( - dataset_name: str, - series: str, - model_names: List, - train_outliers: set, - test_outliers: set, - number_of_datapoints: int, -) -> pd.DataFrame: - with st.spinner("Creating new distribution plot..."): - state = get_as(dataset_name, series) - dataset: pd.DataFrame = st.session_state["inference_results"][dataset_name][series] - - dataset["outlier_group"] = range(len(dataset)) - dataset["outlier_group"] = (dataset["outlier_group"] // number_of_datapoints).astype( - np.int16 - ) - - threshold_timestamps = ( - dataset.reset_index().groupby("outlier_group")["index"].first().to_list() - ) - threshold_timestamps.append(dataset.index.max()) - ranges = [f"{i} - {j}" for i, j in zip(threshold_timestamps, threshold_timestamps[1:])] - - out_columns = copy.deepcopy(model_names) - for m in model_names: - out_columns.extend([f"{m} Missed Train Outliers", f"{m} Missed Test Outliers"]) - out_columns.extend(["Marked Train Outliers", "Marked Test Outliers"]) - df_out = pd.DataFrame(index=ranges, columns=out_columns) - - annotated_outliers = state.df_outlier - annotated_test_outliers = state.df_test_outlier - for group_index, (_, group) in enumerate(dataset.groupby("outlier_group")): - outliers_in_this_group = annotated_outliers[ - annotated_outliers.index.to_series().between(group.index[0], group.index[-1]) - ] - test_outliers_in_this_group = annotated_test_outliers[ - annotated_test_outliers.index.to_series().between(group.index[0], group.index[-1]) - ] - df_out.iat[group_index, -2] = len(outliers_in_this_group) - df_out.iat[group_index, -1] = len(test_outliers_in_this_group) - - for model_index, model in enumerate(model_names): - model_pred_outliers = group[group[model] == 1].index - df_out.iat[group_index, model_index] = len(model_pred_outliers) - df_out.at[ranges[group_index], f"{model} Missed Train Outliers"] = np.count_nonzero( - outliers_in_this_group.index.isin(model_pred_outliers) == False - ) - df_out.at[ranges[group_index], f"{model} Missed Test Outliers"] = np.count_nonzero( - test_outliers_in_this_group.index.isin(model_pred_outliers) == False - ) - - return df_out - - -def make_outlier_distribution_plot(dataset_name: str, series: str): - model_names = sorted(st.session_state["models_to_visualize"][dataset_name][series]) - if not model_names: - return None, None - state = get_as(dataset=dataset_name, column=series) - - df_counts = cachable_get_outlier_counts( - dataset_name, - series, - model_names, - state.outlier, - state.test_outlier, - number_of_datapoints=st.session_state[f"num_outliers_{dataset_name}_{series}"], - ) - - if st.session_state[f"only_show_ranges_with_outliers_{dataset_name}_{series}"]: - df_counts = df_counts[df_counts.any(axis=1)] - - bar = ( - Bar() - .add_xaxis(df_counts.index.to_list()) - .set_global_opts( - title_opts=opts.TitleOpts( - title="Distribution Plot - Number of outliers per model", - subtitle="Click on bar to isolate time range", - padding=15, - ), - xaxis_opts=opts.AxisOpts( - is_scale=True, - name="Time Range", - name_location="middle", - name_gap=30, - axistick_opts=opts.AxisTickOpts(is_inside=True, is_align_with_label=True), - ), - yaxis_opts=opts.AxisOpts( - type_="value", - name="Number of outliers", - name_rotate=90, - name_location="middle", - name_gap=50, - boundary_gap="30%", - ), - datazoom_opts=[ - opts.DataZoomOpts( - type_="slider", - range_start=0, - range_end=100, - ), - opts.DataZoomOpts( - type_="inside", - range_start=0, - range_end=100, - ), - ], - legend_opts=opts.LegendOpts(pos_top=10, pos_right=10, orient="vertical"), - tooltip_opts=opts.TooltipOpts( - axis_pointer_type="shadow", - trigger="axis", - ), - ) - ) - for ann_series in df_counts.columns: - if "missed" in ann_series.lower(): - continue - if df_counts[ann_series].any(): - bar = bar.add_yaxis( - ann_series, - df_counts[ann_series].to_list(), - stack=ann_series, - label_opts=opts.LabelOpts(is_show=False), - category_gap="40%", - ) - - colors = [st.session_state[f"color_{m}_{dataset_name}_{series}"] for m in model_names] - if state.outlier: - colors.append("#e60b0b") - if state.test_outlier: - colors.append("#fd7c99") - - for m in model_names: - if not ( - st.session_state.get(f"highlight_train_{dataset_name}_{series}") - or st.session_state.get(f"highlight_test_{dataset_name}_{series}") - ): - break - df_missed = df_counts[df_counts[f"{m} Missed Train Outliers"] > 0] - if st.session_state.get(f"highlight_train_{dataset_name}_{series}"): - df_missed = df_counts[df_counts[f"{m} Missed Train Outliers"] > 0] - if not df_missed.empty: - effect_scatter = (EffectScatter().add_xaxis(df_missed.index.tolist())).add_yaxis( - f"{m} Missed Training Outliers", - df_missed[m].tolist(), - label_opts=opts.LabelOpts(is_show=False), - tooltip_opts=opts.TooltipOpts(is_show=False), - symbol="triangle", - ) - bar = bar.overlap(effect_scatter) - colors.append(st.session_state[f"color_{m}_{dataset_name}_{series}"]) - if st.session_state.get(f"highlight_test_{dataset_name}_{series}"): - df_missed = df_counts[df_counts[f"{m} Missed Test Outliers"] > 0] - if not df_missed.empty: - effect_scatter = (EffectScatter().add_xaxis(df_missed.index.tolist())).add_yaxis( - f"{m} Missed Test Outliers", - df_missed[m].tolist(), - label_opts=opts.LabelOpts(is_show=False), - tooltip_opts=opts.TooltipOpts(is_show=False), - symbol_size=15, - ) - bar = bar.overlap(effect_scatter) - colors.append(st.session_state[f"color_{m}_{dataset_name}_{series}"]) - - bar.set_colors(colors) - - clicked_range = st_pyecharts( - bar, - height=f"{st.session_state[f'figure_height_{dataset_name}_{series}']}px", - theme="dark", - events={ - "click": "function(params) { return params.name }", - # "dblclick": "function(params) { console.log(params) } ", - }, - # key=f"distribution_plot_{dataset_name}", - ) - - def _get_start_and_end_date(clicked_range: str): - if not clicked_range: - return None, None - start_str, end_str = clicked_range.split(" - ") - start_time = datetime.datetime.strptime(start_str, "%Y-%m-%d %H:%M:%S") - end_time = datetime.datetime.strptime(end_str, "%Y-%m-%d %H:%M:%S") - st.session_state[f"last_clicked_range_{dataset_name}_{series}"] = start_time, end_time - # st.session_state[f"range_str_{dataset_name}"] = clicked_range - return start_time, end_time - - return _get_start_and_end_date(clicked_range) - - -def make_annotation_suggestion_plot( - start_time, end_time, dataset_name, series, point_to_highlight: tuple -): - state = get_as(dataset_name, series) - state.update_plot(start_time, end_time) - - x_data = state.df_plot.index.to_list() - y_data = state.df_plot[series].to_list() - plot = ( - Line() - .add_xaxis(x_data) - .add_yaxis( - series, - y_data, - color="yellow", - label_opts=opts.LabelOpts(is_show=False), - is_symbol_show=False, - ) - .set_global_opts( - title_opts=opts.TitleOpts( - title="Is this point an outlier?", - subtitle="Labels generated here will be added directly to the training data.", - padding=15, - ), - yaxis_opts=opts.AxisOpts( - type_="value", - name=series, - name_rotate=90, - name_location="middle", - name_gap=50, - ), - xaxis_opts=opts.AxisOpts( - type_="time", - is_scale=True, - name="Date & Time", - name_location="middle", - name_gap=-20, - ), - datazoom_opts=opts.DataZoomOpts(type_="inside", range_start=0, range_end=100), - legend_opts=opts.LegendOpts(pos_top=40, pos_right=10, orient="vertical"), - tooltip_opts=opts.TooltipOpts(axis_pointer_type="line", trigger="axis"), - ) - ) - - scatter = ( - Scatter() - .add_xaxis(x_data) - .add_yaxis( - "Datapoints", - y_data, - label_opts=opts.LabelOpts(is_show=False), - symbol_size=3, - itemstyle_opts=opts.ItemStyleOpts(color="#dce4e3"), - is_selected=len(x_data) < 10000, - tooltip_opts=opts.TooltipOpts(is_show=False), - ) - ) - plot = plot.overlap(scatter) - effect_scatter = ( - EffectScatter() - .add_xaxis([point_to_highlight[0]]) - .add_yaxis( - "Candidate", - [point_to_highlight[1]], - label_opts=opts.LabelOpts(is_show=False), - tooltip_opts=opts.TooltipOpts(is_show=False), - symbol_size=30, - symbol="pin", - ) - ) - plot = plot.overlap(effect_scatter) - - st_pyecharts(plot, theme="dark", height="600px") - - -def get_echarts_plot_time_range( - start_time, - end_time, - data_column, - include_annotations, - plot_title: str, - dataset_name: str = None, - series: str = None, -): - state = get_as(dataset_name, series) - state.update_plot(start_time, end_time) - - x_data = state.df_plot.index.to_list() - y_data = state.df_plot[data_column].to_list() - plot = ( - Line(init_opts=opts.InitOpts(animation_opts=opts.AnimationOpts(animation=False))) - .add_xaxis(x_data) - .add_yaxis( - data_column, - y_data, - color="yellow", - label_opts=opts.LabelOpts(is_show=False), - # markpoint_opts=opts.MarkPointOpts(data=markers), - is_symbol_show=False, - ) - .set_global_opts( - title_opts=opts.TitleOpts( - title=plot_title, - subtitle="Click on points or markers to select them. Activate different selection modes in the toolbar (top right). Zoom using the mouse wheel.", - padding=15, - ), - yaxis_opts=opts.AxisOpts( - type_="value", - name=data_column, - name_rotate=90, - name_location="middle", - name_gap=50, - ), - xaxis_opts=opts.AxisOpts( - type_="time", - is_scale=True, - name="Date & Time", - name_location="middle", - name_gap=-20, - ), - datazoom_opts=[ - opts.DataZoomOpts( - type_="slider", - range_start=0, - range_end=100, - ), - opts.DataZoomOpts( - type_="inside", - ), - ], - legend_opts=opts.LegendOpts(pos_top=40, pos_right=10, orient="vertical"), - brush_opts=opts.BrushOpts( - throttle_type="debounce", - throttle_delay=500, - brush_mode="multiple", - brush_type="lineX", - tool_box=["lineX", "rect", "clear"], - series_index="all", - out_of_brush={"colorAlpha": 0.1}, - ), - tooltip_opts=opts.TooltipOpts(axis_pointer_type="line", trigger="axis"), - ) - ) - - scatter = ( - Scatter() - .add_xaxis(x_data) - .add_yaxis( - "Datapoints", - y_data, - label_opts=opts.LabelOpts(is_show=False), - symbol_size=3, - itemstyle_opts=opts.ItemStyleOpts(color="#dce4e3"), - is_selected=len(x_data) < 10000, - tooltip_opts=opts.TooltipOpts(is_show=False), - ) - ) - plot = plot.overlap(scatter) - if not include_annotations: - return plot - - df_selected = getattr(state, "df_plot_selected", None) - - for series_name in state.data: - if not hasattr(state, f"df_plot_{series_name}"): - continue - - df = getattr(state, f"df_plot_{series_name}") - if df_selected is not None and series_name != "selected": - df = df[~df.index.isin(df_selected.index)] - if df.empty: - continue - plot.overlap( - Scatter() - .add_xaxis(df.index.to_list()) - .add_yaxis( - MARKER_HOVER[series_name], - df[state.column].to_list(), - label_opts=opts.LabelOpts(is_show=False), - symbol_rotate=0, - symbol="roundRect", - symbol_size=15, - # color="#dce4e3", - itemstyle_opts=opts.ItemStyleOpts(opacity=1, color=ANNOTATION_COLORS[series_name]), - tooltip_opts=opts.TooltipOpts(is_show=False), - ) - ) - return plot - - -def make_time_range_outlier_plot(dataset_name: str, series: str, start_time, end_time): - dataset: pd.DataFrame = st.session_state["inference_results"][dataset_name][series] - model_names = sorted(st.session_state["models_to_visualize"][dataset_name][series]) - - df_plot = dataset[dataset.index.to_series().between(start_time, end_time)] - state = get_as(dataset_name, series) - - plot = get_echarts_plot_time_range( - start_time, - end_time, - state.column, - plot_title=f"Outlier Plot {start_time} - {end_time}", - include_annotations=True, - dataset_name=dataset_name, - series=series, - ) - - pred_outlier_tracker = {} - for model_number, model_name in enumerate(model_names): - df_outlier = df_plot[df_plot[model_name] > 0] - pred_outlier_tracker[model_name] = df_outlier - if df_outlier.empty: - continue - plot.overlap( - Scatter() - .add_xaxis(df_outlier.index.to_list()) - .add_yaxis( - model_name, - df_outlier[state.column].to_list(), - label_opts=opts.LabelOpts(is_show=False), - symbol_rotate=-90 * model_number, - symbol="pin", - symbol_size=40, - itemstyle_opts=opts.ItemStyleOpts( - opacity=1, color=st.session_state[f"color_{model_name}_{dataset_name}_{series}"] - ), - tooltip_opts=opts.TooltipOpts(formatter="{a}
Outlier predicted"), - ) - ) - - st.session_state["pred_outlier_tracker"][dataset_name][series] = pred_outlier_tracker - - clicked_point = st_pyecharts( - plot, - height=f"{st.session_state[f'figure_height_{dataset_name}_{series}']}px", - theme="dark", - events={ - "click": "function(params) { return [params.data[0], 'click'] }", - # "brushselected": "function(params) { console.log(params) }", - "brushselected": "function(params) { return [params.batch[0].selected, 'brush'] }", - # "brushselected": "function(params) { return [params.batch[0].selected[1].dataIndex, 'brush'] }", - }, - # key=f"time_range_plot_{dataset_name}", - ) - - return clicked_point - - -def feature_importance_plot(base_obj=None): - obj = base_obj or st - - state = get_as() - dataset = state.dataset - series = state.column - - df_new: pd.DataFrame = st.session_state[f"current_importances_{dataset}_{series}"] - - if f"previous_importances_{dataset}_{series}" in st.session_state: - df_old: pd.DataFrame = st.session_state[f"previous_importances_{dataset}_{series}"] - df_plot = df_new.merge(df_old, how="left", on="Feature", suffixes=("", " before")) - df_plot["diff"] = ( - df_plot["Feature importance"] - df_plot["Feature importance before"] - ).round(3) - df_plot["diff_text"] = df_plot["diff"].apply(lambda x: str(x) if x <= 0 else f"+{x}") - else: - df_plot = df_new - - df_plot = df_plot.iloc[:10].sort_values("Feature importance") - - fig = go.Figure( - data=[ - go.Bar( - x=df_plot["Feature importance"], - y=df_plot["Feature"], - orientation="h", - name="Current Importances", - text=df_plot["Feature importance"], - ) - ] - ) - - if f"previous_importances_{dataset}_{series}" in st.session_state: - fig.add_trace( - go.Bar( - x=df_plot["diff"], - y=df_plot["Feature"], - orientation="h", - name="Change to previous model", - text=df_plot["diff_text"], - ) - ) - fig.update_layout( - margin=dict(l=10, r=10, t=50, b=0), - legend=dict(yanchor="bottom", y=1.0, xanchor="left", x=0.2, orientation="h"), - barmode="relative", - title={ - "text": f"Feature importances {st.session_state[f'last_model_name_{dataset}_{series}']}", - "y": 1.0, - "x": 0.5, - "xanchor": "center", - "yanchor": "top", - }, - ) - obj.plotly_chart(fig, use_container_width=True) - - -def make_removed_outliers_example_plots(df_before: pd.DataFrame, df_new: pd.DataFrame): - dataset: str = st.session_state["download_dataset"] - series: List = st.session_state["download_series"] - model: str = st.session_state["download_model"] - - st.header(f"Outlier removal preview - {model}") - - c1, c2 = st.columns(2) - c1.metric("Current number of total rows in dataset (all series)", len(df_before)) - c2.metric("Total number of rows after outlier removal (all series)", len(df_new)) - - for s in series: - outlier_mask = df_before[f"{s}_{model}"] == 1 - number_outlier = outlier_mask.sum() - number_to_show = min(number_outlier, 3) - changes_sample = ( - df_before[df_before[f"{s}_{model}"] == 1].sample(number_to_show, random_state=1).index - ) - - st.subheader(s) - - cols = st.columns(3) - cols[0].metric("Number of predicted outliers in this series", number_outlier) - cols[1].metric("Number of non-NaN entries before", len(df_before[~df_before[s].isna()])) - cols[2].metric("Number of non-NaN entries after", len(df_new[~df_new[s].isna()])) - - if not number_outlier: - continue - - for i in range(number_to_show): - int_idx = df_before.index.get_loc(changes_sample[i]) - start_idx = max(0, int_idx - 20) - end_idx = min(len(df_before) - 1, int_idx + 20) - start_time = df_before.index[start_idx] - end_time = df_before.index[end_idx] - df_plot_before = df_before[df_before.index.to_series().between(start_time, end_time)][ - [s] - ].dropna() - df_plot_new = df_new[df_new.index.to_series().between(start_time, end_time)][ - [s] - ].dropna() - - plot = ( - Line() - .add_xaxis(df_plot_before.index.tolist()) - .add_yaxis( - "Before", - df_plot_before[s].tolist(), - label_opts=opts.LabelOpts(is_show=False), - is_symbol_show=False, - ) - ) - plot = plot.overlap( - Line() - .add_xaxis(df_plot_new.index.tolist()) - .add_yaxis( - "After", - df_plot_new[s].tolist(), - label_opts=opts.LabelOpts(is_show=False), - is_symbol_show=False, - ) - ) - - plot.set_global_opts( - title_opts=opts.TitleOpts( - title=f"Sample # {i+1}", - padding=15, - ), - yaxis_opts=opts.AxisOpts( - type_="value", - name=s, - name_rotate=90, - name_location="middle", - name_gap=50, - ), - xaxis_opts=opts.AxisOpts( - type_="time", - is_scale=True, - name="Date & Time", - name_location="middle", - name_gap=-20, - ), - datazoom_opts=opts.DataZoomOpts(type_="inside", range_start=30, range_end=70), - legend_opts=opts.LegendOpts(pos_top=40, pos_right=10, orient="vertical"), - tooltip_opts=opts.TooltipOpts(axis_pointer_type="line", trigger="axis"), - ) - - plot.set_colors(["Yellow", "Green"]) - - with cols[i]: - st_pyecharts(plot, theme="dark") diff --git a/tsod/active_learning/test.py b/tsod/active_learning/test.py deleted file mode 100644 index c7db7a0..0000000 --- a/tsod/active_learning/test.py +++ /dev/null @@ -1,17 +0,0 @@ -from pathlib import Path -import streamlit as st -import mikeio - -uploaded_file = st.file_uploader("Choose a file") - -if uploaded_file: - data = uploaded_file.getvalue() - TEMP_FILE = Path("test.dfs0") - - with TEMP_FILE.open("wb") as f: - f.write(data) - - test = mikeio.read("test.dfs0") - - st.write(test.to_dataframe()) - TEMP_FILE.unlink() diff --git a/tsod/active_learning/upload_data.py b/tsod/active_learning/upload_data.py deleted file mode 100644 index 6ef458b..0000000 --- a/tsod/active_learning/upload_data.py +++ /dev/null @@ -1,325 +0,0 @@ -import logging -from pathlib import Path - -import dateutil -import mikeio -import numpy as np -import pandas as pd -import streamlit as st -from dateutil.parser._parser import ParserError - -from tsod.active_learning.data_structures import AnnotationState - - -def datetime_unififer(data=pd.DataFrame, date_column=str, base_obj=None): - obj = base_obj or st - try: - data[date_column] = pd.to_datetime(data[date_column], unit="ns", utc=False) - return data - except ParserError as e: - pass - try: - data[date_column] = pd.to_datetime( - data[date_column].str.replace("*", " "), unit="ns", utc=False - ) - return data - except ParserError: - pass - try: - data[date_column] = pd.to_datetime( - data[date_column].str.replace("'T'", " "), unit="ns", utc=False - ) - return data - except ParserError: - pass - try: - data[date_column] = pd.to_datetime( - data[date_column], - format="%d/%b/%Y:%H:%M:%S %z", - exact=False, - utc=False, - ) - return data - except ParserError: - pass - try: - tzmapping = { - "CET": dateutil.tz.gettz("Europe/Berlin"), - "CEST": dateutil.tz.gettz("Europe/Berlin"), - "PDT": dateutil.tz.gettz("US/Pacific"), - } - data[date_column] = ( - data[date_column].str.replace("*", " ").apply(dateutil.parser.parse, tzinfos=tzmapping) - ) - return data - except ParserError: - pass - try: - data[date_column] = pd.to_datetime( - data[date_column], - format="%Y-%m-%d*%H:%M:%S:%f", - exact=False, - utc=False, - ) - return data - except ParserError: - obj.error("Data does not contain recognized timestamp column.") - # except Exception as e: - # obj.error("Data does not contain recognized timestamp column.") - - -def data_upload_callback_old(base_obj=None): - obj = base_obj or st - datafiles = st.session_state["data_upload"] - if not datafiles: - return - dataframe = pd.DataFrame() - st.write(datafiles) - file_handle = "_".join(sorted([Path(f.name).stem for f in datafiles])) - for file in datafiles: - if ( - file.type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - or file.type == "application/vnd.ms-excel" - ): - # read the excel file - df = pd.read_excel(file) - # check which column contains the datetime - if len(df.select_dtypes(include=["datetime"]).columns) != 0: - date_column = df.select_dtypes(include=["datetime"]).columns[0] - elif len(df.select_dtypes(include=["object"]).columns) != 0: - date_column = df.select_dtypes(include=["object"]).columns[0] - else: - obj.error("Data does not contain recognized timestamp column.") - - df = datetime_unififer(df, date_column, obj) - # set datetime as index - df = df.set_index(date_column) - # append the list of uploaded dataframes - elif file.type == "text/csv": - df = pd.read_csv(file) - # check which column contains the datetime (object) - if len(df.select_dtypes(include=["datetime"]).columns) != 0: - date_column = df.select_dtypes(include=["datetime"]).columns[0] - elif len(df.select_dtypes(include=["object"]).columns) != 0: - date_column = df.select_dtypes(include=["object"]).columns[0] - else: - obj.error("Data does not contain recognized timestamp column.") - # standardize the datetime object - df = datetime_unififer(df, date_column) - # set datetime as index - df = df.set_index(date_column) - elif file.name.endswith(".dfs0"): - # elif file.type == "application/octet-stream": - data = file.getvalue() - TEMP_FILE = Path("test.dfs0") - - with TEMP_FILE.open("wb") as f: - f.write(data) - - data = mikeio.read("test.dfs0") - df = data.to_dataframe() - TEMP_FILE.unlink() - - df.index = df.index.tz_localize(tz="UTC") - - else: - obj.error("This datatype is not supported.") - - # check intersecting rows in dataframes - idx = dataframe.index.intersection(df.index) - clm = dataframe.columns.intersection(df.columns) - df_diff = dataframe[clm].eq(df[clm]).loc[idx] - df_null = pd.notnull(dataframe[clm].loc[idx]) - - # st.write(dataframe[dataframe.index.isin(idx)].dtypes) - # st.write(df[df.index.isin(idx)].dtypes) - - # st.write(len(idx)) - - # check if values differ and are not none - if False in df_diff.values and False in df_diff.eq(df_null).values: - obj.error( - f"File {file.name} contains different values for the same timestamps and cannot be integrated for training the model." - ) - # fill none values with real data if applicable - elif ( - len(df.index.difference(dataframe.index)) == 0 - and len(df.columns.difference(dataframe.columns)) == 0 - and True in pd.isnull(dataframe.loc[idx]).values - ): - dataframe.fillna(df, inplace=True) - # if indexes/columns are new, concat the dataframe with the new values - else: - if ( - df[[d for d in df.columns if d not in clm]] - .loc[ - df[[d for d in df.columns if d not in clm]].index.intersection(dataframe.index) - ] - .shape[1] - > 0 - ): - # concat by column, if there exist additional columns in one dataframe with the same index - dataframe = pd.concat( - [ - dataframe, - df[[d for d in df.columns if d not in clm]].loc[ - df[[d for d in df.columns if d not in clm]].index.intersection( - dataframe.index - ) - ], - ], - axis=1, - ) - - # concat dataframes with unique indexes by row - dataframe = pd.concat([dataframe, df.loc[df.index.difference(dataframe.index)]]) - obj.info(f"File {file.name} uploaded and concatenated.") - return dataframe - - -def data_uploader(base_obj=None): - obj = base_obj or st - form = obj.form("data_upload_form", clear_on_submit=True) - form.file_uploader( - label="Upload data from disk", - accept_multiple_files=True, - type=["csv", "dfs0", "xlsx", "xls"], - key="data_upload", - # on_change=data_upload_callback, - args=(obj,), - ) - - form.text_input( - "Optional: Enter dataset name", - max_chars=30, - key="data_name", - help="""This name will be used for data selection and when downloading annotation data. - If left empty, the dataset will be named based on the uploaded file names.""", - ) - - form.form_submit_button("Upload", on_click=data_upload_callback, args=(obj,)) - - # data_upload_callback(obj) - - -def data_upload_callback(base_obj=None): - obj = base_obj or st - datafiles = st.session_state["data_upload"] - if not datafiles: - obj.warning("No files selected.") - return - # st.write(datafiles) - dataframe = pd.DataFrame() - unique_columns = set() - if st.session_state["data_name"] != "": - file_handle = st.session_state["data_name"] - else: - file_handle = "_".join(sorted([Path(f.name).stem for f in datafiles])) - for i, file in enumerate(datafiles): - if file.name.endswith(".csv"): - df = pd.read_csv(file) - elif Path(file.name).suffix.lower() in (".xls", ".xlsx"): - df = pd.read_excel(file) - elif file.name.endswith(".dfs0"): - data = file.getvalue() - TEMP_FILE = Path("temp.dfs0") - - with TEMP_FILE.open("wb") as f: - f.write(data) - - data = mikeio.read("temp.dfs0") - df = data.to_dataframe().reset_index() - TEMP_FILE.unlink() - - # df.index = df.index.tz_localize(tz="UTC") - - if len(df.select_dtypes(include=["datetime"]).columns) != 0: - date_column = df.select_dtypes(include=["datetime"]).columns[0] - elif len(df.select_dtypes(include=["object"]).columns) != 0: - date_column = df.select_dtypes(include=["object"]).columns[0] - else: - obj.error(f"Data in file {file.name} does not contain recognized timestamp column.") - return - - df.rename(columns={date_column: "index_to_be"}, inplace=True) - df.drop_duplicates(inplace=True) - - df = datetime_unififer(df, "index_to_be") - - unique_columns.update({c for c in df.drop(columns="index_to_be").columns}) - - if dataframe.empty: - dataframe = df - continue - - dataframe = dataframe.merge(df, on="index_to_be", how="outer", suffixes=["", i]) - - # after merging a new df, we validate the added data - # for each unique target colum - for column in unique_columns: - # get all series that have been merged for that column - cols = sorted([c for c in dataframe.columns if c.startswith(column)]) - - # compare the newest entry to all previous ones - newest_column = cols[-1] - newest_mask = ~dataframe[newest_column].isna() - for c in cols[:-1]: - # get mask where other series is not null - compare_mask = ~dataframe[c].isna() - - # need to compare values for all rows where both columns have entries - to_compare = dataframe.loc[newest_mask & compare_mask] - - conflict_mask = ~np.isclose(to_compare[c], to_compare[newest_column]) - if conflict_mask.any(): - conflict_data = to_compare.loc[conflict_mask].iloc[0] - obj.error( - f"""Found conflicting data for series '{column}' in uploaded files (multiple values - for same timestamp). - First mismatch: - Timestamp: {conflict_data["index_to_be"]} - Value 1: {conflict_data[newest_column].item()} - Value 2: {conflict_data[c].item()}""" - ) - - return - - # combine all merged series into single one - for column in unique_columns: - cols = sorted([c for c in dataframe.columns if c.startswith(column)])[1:] - - for c in cols: - dataframe[column].fillna(dataframe[c], inplace=True) - - dataframe = dataframe.set_index("index_to_be", drop=True)[list(unique_columns)] - dataframe.index.name = None - if len(dataframe.columns) > 1: - st.session_state["expand_data_selection"] = True - - if len(st.session_state["data_store"]) > 1: - st.session_state["expand_data_selection"] = True - - obj.success("Data uploaded and validated", icon="✅") - obj.write(f"Total rows: {len(dataframe)}") - obj.write("NaN values:") - obj.write(dataframe.isna().sum()) - - add_new_data(dataframe, file_handle) - - # For deployment 'logging' - logging.info("A new dataset was successfully uploaded.") - - -def add_new_data(df: pd.DataFrame, dataset_name: str): - """Splits a dataframe into its individual series, adds those series - to the data store and instantiates new AnnotationState instances for them.""" - - for col in df: - sub_df = df[[col]] - sub_df = sub_df[~sub_df[col].isna()] - st.session_state["data_store"][dataset_name][col] = sub_df - an_st = AnnotationState(dataset_name, col) - st.session_state["annotation_state_store"][dataset_name][col] = an_st - - st.session_state["current_dataset"] = dataset_name - st.session_state["current_series"][dataset_name] = sorted(df.columns)[0] diff --git a/tsod/active_learning/utils.py b/tsod/active_learning/utils.py deleted file mode 100644 index 862fa16..0000000 --- a/tsod/active_learning/utils.py +++ /dev/null @@ -1,219 +0,0 @@ -from copy import deepcopy -import datetime -import random -import os -from typing import Any, Dict, List, Sequence -import streamlit as st -import pandas as pd -import numpy as np -from collections import defaultdict -from tsod.active_learning.data_structures import AnnotationState -import psutil - -MODEL_OPTIONS = {"RF_1": "Random Forest Classifier"} - - -def custom_text( - text: str, - font_size: int = 30, - centered: bool = True, - font: str = "sans-serif", - vertical_align: str = "baseline", - base_obj=None, -): - obj = base_obj or st - if centered: - text_align = "center" - else: - text_align = "left" - md = f'

{text}

' - - return obj.markdown(md, unsafe_allow_html=True) - - -def get_as( - dataset: str | None = None, column: str | None = None, return_all_columns: bool = False -) -> AnnotationState: - if not dataset: - dataset = st.session_state.get("current_dataset") - if not dataset: - return None - ds_dict = st.session_state["annotation_state_store"].get(dataset) - - if not column: - if return_all_columns: - return ds_dict - column = st.session_state["current_series"][dataset] - if not ds_dict: - return None - return ds_dict.get(column) - - -def _add_to_ss_if_not_in_it(key: str, init_value: Any): - if key not in st.session_state: - st.session_state[key] = init_value - - -def init_session_state(): - _add_to_ss_if_not_in_it("annotation_state_store", defaultdict(dict)) - _add_to_ss_if_not_in_it("current_series", {}) - - _add_to_ss_if_not_in_it("data_store", defaultdict(dict)) - _add_to_ss_if_not_in_it("annotation_data_loaded", True) - _add_to_ss_if_not_in_it("uploaded_annotation_data", {}) - _add_to_ss_if_not_in_it("prediction_models", {}) - _add_to_ss_if_not_in_it("prediction_data", defaultdict(list)) - _add_to_ss_if_not_in_it("use_date_picker", True) - _add_to_ss_if_not_in_it("number_outliers", defaultdict(lambda: defaultdict(dict))) - _add_to_ss_if_not_in_it("inference_results", defaultdict(lambda: defaultdict(dict))) - _add_to_ss_if_not_in_it("uploaded_ds_features", defaultdict(dict)) - _add_to_ss_if_not_in_it("hide_choice_menus", False) - _add_to_ss_if_not_in_it("models_to_visualize", defaultdict(lambda: defaultdict(set))) - _add_to_ss_if_not_in_it("RF_features_computed_start", 0) - _add_to_ss_if_not_in_it("RF_features_computed_end", 0) - _add_to_ss_if_not_in_it("model_library", {}) - _add_to_ss_if_not_in_it("available_models", defaultdict(lambda: defaultdict(set))) - _add_to_ss_if_not_in_it("current_outlier_value_store", {}) - _add_to_ss_if_not_in_it("page_index", 0) - _add_to_ss_if_not_in_it("expand_data_selection", False) - _add_to_ss_if_not_in_it("pred_outlier_tracker", defaultdict(dict)) - _add_to_ss_if_not_in_it("models_trained_this_session", set()) - _add_to_ss_if_not_in_it("last_method_choice", "RF_1") - _add_to_ss_if_not_in_it("cleaned_dataset_counter", defaultdict(lambda: 1)) - _add_to_ss_if_not_in_it( - "suggested_points_with_annotation", defaultdict(lambda: defaultdict(list)) - ) - - -def set_session_state_items( - key: str | List[str], value: Any | List[Any], add_if_not_present: bool = False -): - if [type(key), type(value)].count(list) == 1: - raise ValueError("Either both or neither of key and value should be list.") - - if isinstance(key, list): - assert len(key) == len(value) - for k, v in zip(key, value): - if add_if_not_present: - _add_to_ss_if_not_in_it(k, v) - st.session_state[k] = v - - else: - if add_if_not_present: - _add_to_ss_if_not_in_it(key, value) - st.session_state[key] = value - - -def recursive_length_count(data: Dict, exclude_keys: Sequence = None) -> int: - total = 0 - _exclude = exclude_keys or [] - for k, v in data.items(): - if k in _exclude: - continue - if isinstance(v, (set, list)): - total += len(v) - elif isinstance(v, dict): - total += recursive_length_count(v) - - return total - - -def recursive_sum(data: Dict) -> int | float: - total = 0 - for v in data.values(): - if isinstance(v, (int, float)): - total += v - elif isinstance(v, dict): - total += recursive_sum(v) - - return total - - -def recursive_round(data: Dict, decimals: int = 3): - def value_round(v): - if isinstance(v, np.ndarray): - return v.round(decimals) - elif isinstance(v, float): - return round(v, decimals) - - local_data = deepcopy(data) - - for k, v in local_data.items(): - if isinstance(v, (np.ndarray, float)): - local_data[k] = value_round(v) - elif isinstance(v, list): - local_data[k] = [value_round(e) for e in v] - elif isinstance(v, tuple): - local_data[k] = tuple( - [value_round(x) if isinstance(x, (float, np.ndarray)) else x for x in v] - ) - elif isinstance(v, set): - local_data[k] = {value_round(e) for e in v} - elif isinstance(v, dict): - local_data[k] = recursive_round(v) - - return local_data - - -def ss_recursive_df_memory_usage(base_dict=None): - if base_dict is None: - base_dict = st.session_state - out = {} - for k, v in base_dict.items(): - if isinstance(v, dict): - out[k] = ss_recursive_df_memory_usage(v) - else: - if isinstance(v, pd.DataFrame): - out[k] = f"{round(v.memory_usage(deep=True).sum() / (2**20), 2)} MB" - - return {k: v for k, v in out.items() if v != {}} - - -def recursive_ss_search(search_str: str, base_dict=None, base_obj=None, recursion_count: int = 1): - obj = base_obj or st - if base_dict is None: - base_dict = st.session_state - - matches = {k: v for k, v in base_dict.items() if search_str.lower() in k.lower()} - if not matches: - return - if len(matches) > 1: - st.write(matches) - return - - for k, v in matches.items(): - if not isinstance(v, dict): - if isinstance(v, pd.DataFrame): - st.write(k) - st.write(v.head(30)) - elif isinstance(v, AnnotationState): - st.write(k) - st.write(v.__dict__) - else: - st.write(k) - st.write(v) - return - - st.write({k: {k_2: type(v_2) for k_2, v_2 in v.items()}}) - sub_key = obj.text_input("Enter sub key", key=f"dev_input_{recursion_count}") - if len(sub_key) > 0: - recursive_ss_search(sub_key, v, obj, recursion_count + 1) - - -def show_memory_usage(base_obj=None): - obj = base_obj or st - mem = psutil.virtual_memory() - obj.write( - { - "Total memory": f"{round(mem.total / (1024.0 ** 3), 2)} GB", - "Used memory": f"{round(mem.used / (1024.0 ** 3), 2)} GB", - "Free memory": f"{round(mem.free / (1024.0 ** 3), 2)} GB", - "Percent used": f"{mem.percent} %", - } - ) - - -def fix_random_seeds(seed=30): - random.seed(seed) - os.environ["PYTHONHASHSEED"] = str(seed) - np.random.seed(seed) diff --git a/tsod/detectors.py b/tsod/detectors.py index 64abe71..1adcf1d 100644 --- a/tsod/detectors.py +++ b/tsod/detectors.py @@ -239,15 +239,15 @@ def _detect(self, data: pd.Series) -> pd.Series: rollmax = data.rolling(self._window_size, center=True).apply(np.nanmax) rollmin = data.rolling(self._window_size, center=True).apply(np.nanmin) anomalies = np.abs(rollmax - rollmin) < self._threshold - anomalies[0] = False # first element cannot be determined - anomalies[-1] = False + anomalies.iloc[0] = False # first element cannot be determined + anomalies.iloc[-1] = False idx = np.where(anomalies)[0] if idx is not None: # assuming window size = 3 # remove also points before and after each detected anomaly - anomalies[idx[idx > 0] - 1] = True + anomalies.iloc[idx[idx > 0] - 1] = True maxidx = len(anomalies) - 1 - anomalies[idx[idx < maxidx] + 1] = True + anomalies.iloc[idx[idx < maxidx] + 1] = True return anomalies