diff --git a/params.yaml b/params.yaml
index b132c7b..4df98ff 100644
--- a/params.yaml
+++ b/params.yaml
@@ -1,16 +1,16 @@
 featurize:
   columns: Channel_4_Data
   convert_timestamp_to_datetime: true
-  dataset: nova10_p8_10hz
+  dataset: nova9_m4-20240117115737
   overlap: 0
   timestamp_column: timestamp
-  window_size: 35
+  window_size: 10
 postprocess:
   min_segment_length: 1
 train:
-  annotations_dir: nova10_p8_10hz_annotations
+  annotations_dir: nova9_m4_10hz_annotations
   fix_predefined_centroids: false
   learning_method: minibatchkmeans
   max_iter: 100
-  n_clusters: 7
+  n_clusters: 6
   use_predefined_centroids: true
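For orientation on the params.yaml change above: featurize.window_size and featurize.overlap control how the raw series is sliced into feature-vector windows (the plotting code later uses step = window_size - overlap), so dropping window_size from 35 to 10 produces more, shorter windows per series. Below is a minimal sketch of that windowing arithmetic under the non-overlapping setup shown above; count_windows is a hypothetical helper written only for illustration, not a function in this repo.

import yaml

# Hypothetical helper: how many windows a series of n_samples yields for the
# featurize settings in params.yaml (step = window_size - overlap).
def count_windows(n_samples, window_size, overlap):
    step = window_size - overlap
    return max((n_samples - window_size) // step + 1, 0)

with open("params.yaml", "r") as f:
    params = yaml.safe_load(f)["featurize"]

print(count_windows(10_000, params["window_size"], params["overlap"]))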
diff --git a/src/api.py b/src/api.py
index 80ae201..cd8b50a 100644
--- a/src/api.py
+++ b/src/api.py
@@ -29,7 +29,9 @@ from plotly.subplots import make_subplots
 
 from clustermodel import ClusterModel
-from config import API_MODELS_PATH, DATA_PATH_RAW, METRICS_FILE_PATH
+from cluster_utils import create_event_log
+from config import API_MODELS_PATH, DATA_PATH_RAW, METRICS_FILE_PATH, LABELS_PATH, PLOTS_PATH, OUTPUT_PATH
+from postprocess import event_log_score
 from udava import Udava
 
 app = flask.Flask(__name__)
@@ -302,16 +304,38 @@ def post(self):
         subprocess.run(["dvc", "repro", "train"], check=True)
 
         if flask.request.form.get("plot"):
-            fig_div = cm.run_cluster_model(inference_df=inference_df, plot_results=True)
+            plot_results=True
+        else:
+            plot_results=False
+
+        print("Running cluster model...")
+        fig_div, timestamps, labels, distance_metric = cm.run_cluster_model(
+            inference_df=inference_df, plot_results=plot_results
+        )
+
+        # Evaluate event log score
+        print("Creating event log...")
+        event_log = create_event_log(labels, identifier=params["featurize"]["dataset"], feature_vector_timestamps=timestamps)
+        event_log.to_csv(OUTPUT_PATH / "event_log.csv")
+
+        try:
+            with open("assets/data/expectations/" + params["featurize"]["dataset"] + "/expectations.json", "r") as f:
+                # expectations = json.load(f)
+                expectations = eval(f.read())
+        except:
+            expectations = None
+            print("No expectations found.")
+
+        if expectations != None:
+            event_log_score(event_log, expectations)
 
+        # Plot results
+        if flask.request.form.get("plot"):
             if flask.request.form.get("plot_in_new_window"):
                 return flask.redirect("prediction")
             else:
                 return flask.redirect("inference_result")
         else:
-            timestamps, labels, distance_metric = cm.run_cluster_model(
-                inference_df=inference_df
-            )
             timestamps = np.array(timestamps, dtype=np.int32).reshape(-1, 1)
             labels = labels.reshape(-1, 1)
             distance_metric = distance_metric.reshape(-1, 1)
@@ -344,13 +368,11 @@ def post(self):
         """
 
-        input_json = flask.request.get_json()
-        model_id = str(input_json["param"]["modeluid"])
-
-        inference_df = pd.DataFrame(
-            input_json["scalar"]["data"],
-            columns=input_json["scalar"]["headers"],
-        )
+        file = flask.request.files['file']
+        model_id = flask.request.form["model_id"]
+        filename = "inference.csv"
+        file.save(filename)
+        inference_df = pd.read_csv(filename)
 
         models = get_models()
         model = models[model_id]
@@ -359,13 +381,14 @@ def post(self):
 
         timestamp_column_name = params["featurize"]["timestamp_column"]
         inference_df.set_index(timestamp_column_name, inplace=True)
 
+        cm = ClusterModel(params_file=params)
         # Run DVC to fetch correct assets.
         subprocess.run(["dvc", "repro", "train"], check=True)
 
-        timestamps, labels, distance_metric = cm.run_cluster_model(
-            inference_df=inference_df
+        fig, timestamps, labels, distance_metric = cm.run_cluster_model(
+            inference_df=inference_df, plot_results=True, return_fig=True, png_only=True
         )
         timestamps = np.array(timestamps).reshape(-1, 1)
         labels = labels.reshape(-1, 1)
@@ -373,12 +396,32 @@ def post(self):
         output_data = np.concatenate([timestamps, labels, distance_metric], axis=1)
         output_data = output_data.tolist()
 
+        # fig.write_image(str(PLOTS_PATH / "labels_over_time.png"), height=500, width=860)
+
+        # Evaluate event log score
+        print("Creating event log...")
+        event_log = create_event_log(labels, identifier=params["featurize"]["dataset"], feature_vector_timestamps=timestamps)
+        event_log.to_csv(OUTPUT_PATH / "event_log.csv")
+
+        try:
+            with open("assets/data/expectations/" + params["featurize"]["dataset"] + "/expectations.json", "r") as f:
+                expectations = eval(f.read())
+        except:
+            expectations = None
+            print("No expectations found.")
+
+        if expectations != None:
+            score, _ = event_log_score(event_log, expectations)
+
+        output = {}
+        # output["param"] = {"modeluid": model_id}
+        # output["scalar"] = {
+        #     "headers": ["date", "cluster", "metric"],
+        #     "data": output_data,
+        # }
         output = {}
-        output["param"] = {"modeluid": model_id}
-        output["scalar"] = {
-            "headers": ["date", "cluster", "metric"],
-            "data": output_data,
-        }
+        output["max_deviation_metric"] = {"value": distance_metric.max()}
+        output["event_log_score"] = {"value": score}
 
         return output
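A note on the expectations handling added in both handlers above: the file assets/data/expectations/<dataset>/expectations.json is read with eval(), and score is only assigned when expectations were actually loaded, so the later output["event_log_score"] line presumes the file exists. The diff never shows the file's schema; event_log_score() only reads a "label" key and a "duration" [min, max] range in seconds from each entry. The loader below is a hypothetical sketch under that assumed shape, using json.load instead of eval; the path and default handling are illustrative, not the repo's confirmed behaviour.

import json

# Hypothetical loader sketch: same path convention as the diff, but parsed with
# json.load; returns None when the expectations file is missing or malformed.
def load_expectations(path):
    try:
        with open(path, "r") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        print("No expectations found.")
        return None

expectations = load_expectations(
    "assets/data/expectations/nova9_m4-20240117115737/expectations.json"
)
if expectations is not None:
    for expectation in expectations:
        print(expectation["label"], expectation["duration"])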
diff --git a/src/cluster_utils.py b/src/cluster_utils.py
index e95e8ab..f0c6627 100644
--- a/src/cluster_utils.py
+++ b/src/cluster_utils.py
@@ -13,6 +13,7 @@ clustering results.
 
 """
 
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -471,6 +472,8 @@ def plot_labels_over_time(
     show_local_distance=False,
     reduce_plot_size=False,
     filename=None,
+    return_fig=False,
+    png_only=False,
 ):
     """Plot labels over time.
@@ -526,12 +529,45 @@ def plot_labels_over_time(
     if n_labels > 3000:
         reduce_plot_size = True
+        print("Reducing the plot size")
 
     # If reduce plot size, take only the nth data point, where nth is set to be
-    # a fraction of the window size. Large fraction of the window size is
+    # a fraction of the window size. Large fraction if the window size is
    # small, and small fraction if the window size is large.
     nth = min(int(window_size / np.log(window_size)), window_size)
-    nth = 1500
+    nth = 100000
+    print("=====")
+    print(n_labels)
+    print(len(original_data))
+
+    # # Reshape labels to match the DataFrame length
+    # expanded_labels = np.repeat(labels, window_size)[:len(original_data)]
+
+    # # Normalize the expanded_labels to range [0,1]
+    # normalized_labels = (expanded_labels - expanded_labels.min()) / (expanded_labels.max() - expanded_labels.min())
+
+    # # Create a custom color scale
+    # color_scale = [(label / max(expanded_labels), color) for label, color in enumerate(COLORS) if label in np.unique(expanded_labels)]
+
+    # Plot the data using scattergl for better performance with large datasets
+    # fig.add_trace(
+    #     go.Scattergl(
+    #         x=original_data.index,
+    #         y=original_data['Channel_4_Data'],
+    #         mode='markers+lines', # Use both markers and lines
+    #         marker=dict(
+    #             color=normalized_labels, # Set color of the markers as the normalized labels
+    #             colorscale=color_scale, # Define custom color scale
+    #             colorbar=dict(title='Labels'), # Optional: to show a color bar
+    #             size=3, # Optional: adjust marker size
+    #             cmin=0, # Set min for color scale
+    #             cmax=1, # Set max for color scale
+    #         ),
+    #         line=dict(shape='hv') # Use a horizontal-vertical step line
+    #     )
+    # )
+
+
 
     j = 0
@@ -559,7 +595,7 @@ def plot_labels_over_time(
             fig.add_trace(
                 go.Scatter(
                     x=t,
-                    y=original_data[columns[i]].iloc[start:stop],
+                    y=y,
                     line=dict(color=color),
                     showlegend=False,
                 ),
@@ -567,6 +603,11 @@ def plot_labels_over_time(
 
             j += 1
 
+    # if j % 100:
+    #     print(start)
+
+
+
     if show_local_distance and not reduce_plot_size:
         label_indeces = labels.reshape(len(labels), 1)
         local_distance = np.take_along_axis(dist, label_indeces, axis=1).flatten()
@@ -587,6 +628,8 @@ def plot_labels_over_time(
         secondary_y=True,
     )
 
+
+
     # Plot deviation metric
     fig.add_trace(
         go.Scatter(
@@ -605,10 +648,144 @@ def plot_labels_over_time(
     fig.update_yaxes(title_text="Sensor data unit", secondary_y=False)
 
     if filename is None:
-        fig.write_html(str(PLOTS_PATH / "labels_over_time.html"))
-        fig.write_html("src/templates/prediction.html")
         fig.write_image(str(PLOTS_PATH / "labels_over_time.png"), height=500, width=860)
+        if not png_only:
+            # fig.write_html(str(PLOTS_PATH / "labels_over_time.html"))
+            fig.write_html("src/templates/prediction.html")
     else:
         fig.write_html(filename)
 
-    return fig.to_html(full_html=False)
+    if return_fig:
+        return fig
+    else:
+        return fig.to_html(full_html=False)
+
+
+def plot_labels_over_time_matplotlib(
+    feature_vector_timestamps,
+    labels,
+    feature_vectors,
+    original_data,
+    model,
+    mark_outliers=False,
+    show_local_distance=False,
+    reduce_plot_size=False,
+    filename=None,
+    return_fig=False,
+):
+    """Plot labels over time.
+
+    This function plots the labels over time. It also plots the local
+    distance of each data point to its cluster center.
+
+    Args:
+        feature_vector_timestamps (np.array): Timestamps of feature vectors.
+        labels (np.array): Labels.
+        feature_vectors (np.array): Feature vectors.
+        original_data (pd.DataFrame): Original data.
+        model (sklearn.cluster): Cluster model.
+        mark_outliers (bool): If True, outliers will be marked with a grey
+            color.
+        show_local_distance (bool): If True, the local distance of each
+            data point to its cluster center will be plotted.
+        reduce_plot_size (bool): If True, the plot will be reduced in size.
+
+    Returns:
+        None.
+
+    """
+
+    with open("params.yaml", "r") as params_file:
+        params = yaml.safe_load(params_file)
+
+    window_size = params["featurize"]["window_size"]
+    overlap = params["featurize"]["overlap"]
+    columns = params["featurize"]["columns"]
+
+    cluster_centers = pd.read_csv(
+        OUTPUT_PATH / "cluster_centers.csv", index_col=0
+    ).to_numpy()
+
+    if type(columns) is str:
+        columns = [columns]
+
+    step = window_size - overlap
+
+    # dist = model.transform(feature_vectors)
+    dist = euclidean_distances(feature_vectors, cluster_centers)
+    sum_dist = dist.sum(axis=1)
+
+    if mark_outliers:
+        labels = filter_outliers(labels, dist)
+
+    fig, ax1 = plt.subplots(figsize=(10, 6))  # You can adjust the figure size
+    ax2 = ax1.twinx()  # Create a second y-axis to plot the deviation metric
+
+    n_features = len(columns)
+    n_labels = len(labels)
+
+    timestamps = original_data.index
+
+    if n_labels > 3000:
+        reduce_plot_size = True
+
+    # If reduce plot size, take only the nth data point, where nth is set to be
+    # a fraction of the window size. Large fraction of the window size is
+    # small, and small fraction if the window size is large.
+    nth = min(int(window_size / np.log(window_size)), window_size)
+    nth = 10000
+
+    j = 0
+
+    for i in range(n_features):
+        # for j in range(n_labels):
+        while j < n_labels:
+
+            start = j * step
+            stop = start + window_size
+            t = timestamps[start:stop]
+            y = original_data[columns[i]].iloc[start:stop]
+
+            cluster = labels[j]
+
+            if cluster == -1:
+                color = "grey"
+            else:
+                color = COLORS[cluster]
+
+            if reduce_plot_size:
+                t = t[::nth]
+                y = y[::nth]
+                # j += 10
+
+            ax1.plot(t, y, color=color)
+
+            j += 1
+
+    if show_local_distance and not reduce_plot_size:
+        label_indeces = labels.reshape(len(labels), 1)
+        local_distance = np.take_along_axis(dist, label_indeces, axis=1).flatten()
+        ax2.plot(feature_vector_timestamps, local_distance, color='blue')
+
+
+    # Plot distance to each cluster center
+    for i in range(dist.shape[1]):
+        ax2.plot(feature_vector_timestamps, dist[:, i], color=COLORS[i])
+
+
+    # Plot deviation metric
+    ax2.plot(feature_vector_timestamps, sum_dist, color='black', label="Deviation metric")
+
+    ax1.set_title("Cluster labels over time")
+    ax1.set_xlabel("Date")
+    ax1.set_ylabel("Sensor data unit")
+    ax2.set_ylabel("Deviation metric")
+
+    fig.tight_layout()  # Adjust the layout
+
+    plt.savefig(str(PLOTS_PATH / "labels_over_time.png"))  # Save the figure
+
+    if return_fig:
+        return fig
+    # else:
+    #     plt.show()  # Show the plot
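The new plot_labels_over_time_matplotlib() above mirrors the Plotly version: it walks the label sequence window by window, colors each window's slice of the raw signal by its cluster label, and thins dense plots by keeping only every nth sample. The snippet below is a minimal self-contained sketch of that per-window coloring idea on synthetic data; the signal, labels, window size and colors are made up for illustration and are not taken from the repo.

import matplotlib.pyplot as plt
import numpy as np

# Synthetic stand-ins for the real signal, window parameters and cluster labels.
window_size, step = 10, 10
signal = np.sin(np.linspace(0, 20, 200))
labels = np.random.randint(0, 3, size=len(signal) // step)
colors = ["tab:blue", "tab:orange", "tab:green"]

fig, ax = plt.subplots()
for j, label in enumerate(labels):
    # Color the samples of window j with the color of its cluster label.
    start, stop = j * step, j * step + window_size
    ax.plot(range(start, stop), signal[start:stop], color=colors[label])
fig.savefig("labels_over_time_sketch.png")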
diff --git a/src/clustermodel.py b/src/clustermodel.py
index ef491d3..e47e38b 100644
--- a/src/clustermodel.py
+++ b/src/clustermodel.py
@@ -21,10 +21,10 @@ from pandas.api.types import is_numeric_dtype
 from sklearn.preprocessing import MinMaxScaler, StandardScaler
 
-from cluster_utils import calculate_distances
+from cluster_utils import calculate_distances, plot_labels_over_time_matplotlib, plot_labels_over_time
 from config import *
 from featurize import *
-from postprocess import *
+from postprocess import filter_segments
 from preprocess_utils import find_files, move_column
 from train import *
@@ -70,7 +70,7 @@ def _check_assets_existence(self):
 
         assert check_ok, "Assets missing."
 
-    def run_cluster_model(self, inference_df, plot_results=False):
+    def run_cluster_model(self, inference_df, plot_results=False, return_fig=False, png_only=False):
         """Run cluster model.
 
         Args:
@@ -115,24 +115,23 @@ def run_cluster_model(self, inference_df, plot_results=False):
         )
         labels = filter_segments(labels, min_segment_length, distances_to_centers)
 
-        # Create event log
-        event_log = create_event_log(labels,
-                identifier=params["featurize"]["dataset"], feature_vector_timestamps=feature_vector_timestamps)
-        event_log.to_csv(OUTPUT_PATH / "event_log.csv")
-
         # plt.figure()
         # plt.plot(labels)
         # plt.show()
 
         if plot_results:
+            print("Plotting results...")
             # visualize_clusters(labels, feature_vectors, model)
-            fig_div = plot_labels_over_time(
-                feature_vector_timestamps, labels, feature_vectors, inference_df, model
+            fig = plot_labels_over_time(
+                feature_vector_timestamps, labels, feature_vectors, inference_df, model, return_fig=return_fig, png_only=png_only
            )
+            # fig = plot_labels_over_time_matplotlib(
+            #     feature_vector_timestamps, labels, feature_vectors, inference_df, model, return_fig=return_fig
+            # )
             # plot_cluster_center_distance(feature_vector_timestamps, feature_vectors, model)
-            return fig_div
+            return fig, feature_vector_timestamps, labels, sum_distance_to_centers
         else:
-            return feature_vector_timestamps, labels, sum_distance_to_centers
+            return None, feature_vector_timestamps, labels, sum_distance_to_centers
 
     def dbscan_predict(self, model, feature_vectors, metric=sp.spatial.distance.cosine):
         """Predict labels for cluster models without native method for
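One consequence of the clustermodel.py change above: run_cluster_model() now always returns a 4-tuple, with the figure slot set to None when plotting is disabled, so callers unpack it identically in both modes. The usage sketch below mirrors the API handler earlier in this diff; "inference.csv" is a placeholder input file and the snippet is illustrative rather than a tested entry point.

import pandas as pd
import yaml

from clustermodel import ClusterModel

with open("params.yaml", "r") as f:
    params = yaml.safe_load(f)

# Placeholder input file; the handler in api.py saves the uploaded CSV here.
inference_df = pd.read_csv("inference.csv")
inference_df.set_index(params["featurize"]["timestamp_column"], inplace=True)

cm = ClusterModel(params_file=params)
fig, timestamps, labels, distance_metric = cm.run_cluster_model(
    inference_df=inference_df, plot_results=False
)
print(fig is None, len(timestamps), distance_metric.max())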
diff --git a/src/postprocess.py b/src/postprocess.py
index 3638563..1e29bc3 100644
--- a/src/postprocess.py
+++ b/src/postprocess.py
@@ -418,17 +418,31 @@ def event_log_score(event_log, expectations):
     event_log["duration_correct"] = 0
     event_log["next_event_correct"] = 0
 
+    if isinstance(event_log["timestamp"][0], np.ndarray):
+        event_log["timestamp"] = event_log["timestamp"].apply(lambda x: x[0])
+
+    if isinstance(event_log["timestamp"][0], str):
+        event_log["timestamp"] = pd.to_datetime(event_log["timestamp"])
+
     # Loop through every second row in dataframe
     for i, event in event_log.iloc[1::2].iterrows():
         event_label = event["label"]
 
+        # Find duration
         event_duration = event["timestamp"] - event_log.iloc[i - 1]["timestamp"]
+        if isinstance(event_duration, np.float64):
+            seconds = event_duration
+        else:
+            seconds = event_duration.seconds
+        event_duration = timedelta(seconds=seconds)
+
         # Find expectations
-        # expected_duration_ranges = []
+        expected_duration_ranges = []
         expected_next_events = []
 
         for j, expectation in enumerate(expectations):
             if expectation["label"] == event_label:
+                # expected_duration_ranges.append(expectation["duration"])
                 expected_duration_range = expectation["duration"]
@@ -444,9 +458,9 @@ def event_log_score(event_log, expectations):
         min_duration = timedelta(seconds=expected_duration_range[0])
         max_duration = timedelta(seconds=expected_duration_range[1])
         if event_duration < min_duration or event_duration > max_duration:
-            print(
-                f"Event {event_label} has duration {event_duration} which is not in the expected range {expected_duration_range} (timestamp: {event['timestamp']})"
-            )
+            # print(
+            #     f"Event {event_label} has duration {event_duration} which is not in the expected range {expected_duration_range} (timestamp: {event['timestamp']})"
+            # )
             misses += 1
         else:
             hits += 1
@@ -464,9 +478,9 @@ def event_log_score(event_log, expectations):
                 hits += 1
                 event_log["next_event_correct"][i] = 1
             else:
-                print(
-                    f"Event {event_label} has next event {next_event_label} which is not in the expected events {expected_next_events} (timestamp: {event['timestamp']})"
-                )
+                # print(
+                #     f"Event {event_label} has next event {next_event_label} which is not in the expected events {expected_next_events} (timestamp: {event['timestamp']})"
+                # )
                 misses += 1
 
     score = hits / (hits + misses)
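The duration handling above normalizes whatever the timestamp subtraction produced, a plain float of seconds or a real timedelta, into a timedelta before comparing it against the expected [min, max] range in seconds, and the overall score remains hits / (hits + misses). Below is a small self-contained sketch of that check with made-up numbers; the expectation dict and durations are illustrative only.

from datetime import timedelta

import numpy as np

# Hypothetical expectation entry: label plus an allowed duration range in seconds.
expectation = {"label": "idle", "duration": [5, 30]}

hits = misses = 0
for raw_duration in [np.float64(12.0), timedelta(seconds=45)]:
    # Normalize to a timedelta, as the diff does for float and timedelta inputs.
    seconds = raw_duration if isinstance(raw_duration, np.float64) else raw_duration.seconds
    event_duration = timedelta(seconds=float(seconds))

    min_duration = timedelta(seconds=expectation["duration"][0])
    max_duration = timedelta(seconds=expectation["duration"][1])
    if min_duration <= event_duration <= max_duration:
        hits += 1
    else:
        misses += 1

print("score:", hits / (hits + misses))  # 0.5 for these example values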