From 4d97540b72bb2ffe59a7fa76c692d337a71418f6 Mon Sep 17 00:00:00 2001 From: komodovaran Date: Mon, 3 Feb 2020 10:17:14 -0500 Subject: [PATCH] Fixed directories, and more updates --- README.md | 89 ++++++++---- best_clusters.py | 41 ++++-- data/preprocessed/.gitkeep | 0 ksparse/__init__.py | 0 other/sparse_gam.py | 91 ++++++------ other/sparse_mnist.py | 9 +- other/sparse_mnist_new.py | 52 +++++++ .../train_flipped_classifier.py | 0 ts_to_img.py => other/ts_to_img.py | 0 overlay_data.py | 44 ++++++ prepare_data.py | 19 +-- resample_data.py | 24 ++++ st_predict.py | 132 +++++++++++++++--- test.py | 12 ++ test2.py | 125 ----------------- train_autoencoder.py | 18 +-- 16 files changed, 404 insertions(+), 252 deletions(-) create mode 100644 data/preprocessed/.gitkeep create mode 100644 ksparse/__init__.py create mode 100644 other/sparse_mnist_new.py rename train_flipped_classifier.py => other/train_flipped_classifier.py (100%) rename ts_to_img.py => other/ts_to_img.py (100%) create mode 100644 overlay_data.py create mode 100644 resample_data.py create mode 100644 test.py delete mode 100644 test2.py diff --git a/README.md b/README.md index cfcac3a..40fddcf 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,29 @@ ### Setup (tested on linux only!) -1. Install conda and a conda environment ("what?" "how?" - Google it!) -2. Install Tensorflow with `conda install tensorflow-gpu=2.0.0`. This **must** be installed as the first package. The contents -here are only tested with version 2.0, but it should work on later ones as well. If done correctly, check the script at -`/checks/test_tensorflow_gpu_is_working.py` -3. Install the rest of the conda requirements with - -````conda install -f -y -q --name py37 -c conda-forge --file conda_requirements.txt```` -3. Install everything else with `pip install -r requirements.txt` -4. If Tensorflow is installed correctly, run `checks/test_tensorflow_gpu_is_working`. If the device is correctly set up, +1. Install conda and a conda environment. Conda installation instructions for +Linux can be found on the website, as well as how to create an environment. +2. Install Tensorflow with `conda install tensorflow-gpu=2.0.0`. This **must** +be installed as the first package. The contents here are only tested with +version 2.0, but it should work on later ones as well. If done correctly, +check the script at `/checks/test_tensorflow_gpu_is_working.py` +3. Install the rest of the conda requirements with +```` +conda install -f -y -q --namepy37 -c conda-forge --file conda_requirements.txt +```` +4. Install everything else with `pip install -r requirements.txt` +5. If Tensorflow is installed correctly, run +`checks/test_tensorflow_gpu_is_working`. If the device is correctly set up, Tensorflow is working and you're good to go! +6. Conda and pip don't talk together, and this breaks some of the package +installations. If for some reason a package was not installed, try running a +script until you hit a `ModuleNotFound: no module named name_of_package` error, +and try installing the module with `pip install name_of_package`. ### Interactive scripts -Large parts run interactively in the Python-package [Streamlit](www.streamlit.io). If a script has `st_` in front of the -name, it must be run interactively through Streamlit. To launch these from the terminal, write `streamlit run st_myscript.py` +Large parts run interactively in the Python-package +[Streamlit](www.streamlit.io). 
If a script has `st_` in front of the name, it
+must be run interactively through Streamlit (or else it doesn't produce any
+visible output). To launch these from the terminal, write
+`streamlit run st_myscript.py`
 
 ### Naming convention
 
@@ -22,30 +33,56 @@ name, it must be run interactively through Streamlit. To launch these from the t
 
 * name: filename (`model_005.h5`)
 
-### Data format
-Every dataset is preprocessed so that eventually you'll have a `hdf` file and a corresponding `npz` file. All
-computations are done on the `npz` file because it's much faster, and compatible with tensorflow. However, group order
-must be preserved according to the parent `hdf` dataframe.
+* To access something one directory up, write `../` in front of the directory
+name. Two directories up is `../../`, and so on.
 
-A `npz` has just a single group index, whereas a dataframe may have both an `id` and `sub_id` if it's combined
-from multiple sources. In that case, the `id` will correspond to the `npz` index, and `sub_id` will be the actual
-group index in the sub-dataset. Group order is only preserved if dataframe groups are sorted by `['file', 'particle']`,
-or for combined dataframes `['source', 'file', 'particle']`. To combine dataframes, the inputs are stacked in loaded order
-(which must therefore also be sorted!). All of this is done automatically, if the right sequence of steps is taken.
+* All paths in use are defined in `lib/globals.py`, so they can be conveniently
+changed in one place, rather than everywhere in the code.
+
+### Data format
+Every dataset is preprocessed so that eventually you'll have a `hdf` file and a
+corresponding `npz` file. All computations are done on the `npz` file because
+it's much faster and compatible with Tensorflow. However, group order must be
+preserved according to the parent `hdf` dataframe.
+
+A `npz` has just a single group index (i.e. '512' means trace id 512; remember,
+Python counts from 0!), whereas a dataframe may have both an `id` and `sub_id`
+if it's combined from multiple sources. In that case, the `id` will correspond
+to the `npz` index (i.e. the order of appearance), and `sub_id` will be the
+actual group index in the sub-dataset (which is currently not used). Group order
+is only preserved if dataframe groups are sorted by `['file', 'particle']`, or
+for combined dataframes `['source', 'file', 'particle']`. To combine dataframes,
+the inputs are stacked in loaded order (which must therefore also be sorted!).
+All of this is done automatically if the right sequence of steps is taken.
 
 ### Scripts to run, step by step
 
 1. `get_cme_tracks.py` to convert from CME `.mat` files to a dataframe.
-2. `prepare_data.py` to filter out too short data (set it low initially to be safe), traces that would be cut by the
-tracking start/end, and
+2. `prepare_data.py` to filter out data that is too short (set the threshold low
+initially to be safe; the model works almost equally well regardless of the
+minimum length), as well as traces that would be cut off by the tracking
+start/end (i.e. if something starts at frame 0 of the video, it's removed,
+because you can't be sure whether the actual event started at "frame -10").
 3. `train_autoencoder.py` to train a model on the data.
-4. `st_predict.py` to predict and plot the data. Initially, a UMAP model is trained.
-5. `st_eval.py` once clustering is done and you want to explore the data.
+4. `st_predict.py` to predict and plot the data. Initially, a UMAP model is
+trained. This takes a while.
It might even time out your Streamlit session, but
+don't touch anything and it'll be ready eventually.
+5. Every cluster is saved as a combination of model + data names, and will be
+output to `results/cluster_indices/`. This file contains the index of every
+trace (see above on how indexing works) and which cluster it belongs to. Note
+that every change in the analysis **OVERWRITES** the automatically created file
+containing cluster indices. If you have reached a point where you want to save
+them, go to `results/cluster_indices/` and rename the file so you're sure it
+won't be overwritten.
+6. `st_eval.py` once clustering is done and you want to explore the data. It
+currently doesn't have much functionality beyond looking at one or more
+specific datasets/clusters.
 
 ### Things to avoid
 In order to preserve group ordering, the original dataframes must be run through
- `prepare_data.py` if they need to be filtered in some way. **DO NOT** run a combine dataframe through a filter,
- because this messes up the internal group ordering that was first established when creating the combined dataframe.
+ `prepare_data.py` if they need to be filtered in some way. **DO NOT** run a
+ combined dataframe through a filter, because this messes up the internal group
+ ordering that was first established when creating the combined dataframe.
 
 ### Troubleshooting
 #### Packages are missing
diff --git a/best_clusters.py b/best_clusters.py
index 019c498..7e2a7d8 100644
--- a/best_clusters.py
+++ b/best_clusters.py
@@ -8,13 +8,13 @@
 from sklearn.mixture import GaussianMixture
 from sklearn.cluster import MiniBatchKMeans
 from tqdm import tqdm
 
-from lib.utils import get_index
 import lib.globals
 import lib.globals
+from lib.utils import get_index
 
 
-def _plot_kmeans_scores(X, min, max, step):
+def _plot_kmeans_score(X, min, max, step):
     """
     Calculates scores for multiple values of kmeans
     Args:
@@ -25,9 +25,8 @@
     """
     rng = list(range(min, max, step))
 
-    def process(n):
+    def process_gaussian(n):
         clf = GaussianMixture(n_components = n, random_state = 42)
-        # clf = MiniBatchKMeans(n_clusters=n, random_state=42)
         labels = clf.fit_predict(X)
 
         s = silhouette_score(X, labels)
@@ -36,20 +35,35 @@ def process(n):
 
         return s, c, b
 
+    def process_kmeans(n):
+        clf = MiniBatchKMeans(n_clusters=n, random_state=42)
+        labels = clf.fit_predict(X)
+
+        s = silhouette_score(X, labels)
+        c = calinski_harabasz_score(X, labels)
+        return s, c
+
+
     n_jobs = len(rng)
-    results = Parallel(n_jobs=n_jobs)(delayed(process)(i) for i in tqdm(rng))
-    results = np.column_stack(results).T
+    results_kmeans = Parallel(n_jobs=n_jobs)(delayed(process_kmeans)(i) for i in tqdm(rng))
+    results_kmeans = np.column_stack(results_kmeans).T
+
+    # Gaussian mixture scores are computed separately so the right-hand column
+    # of subplots (including BIC, which only exists for GMM) plots its own results
+    results_gaussian = Parallel(n_jobs=n_jobs)(delayed(process_gaussian)(i) for i in tqdm(rng))
+    results_gaussian = np.column_stack(results_gaussian).T
+
+    fig, ax = plt.subplots(nrows=3, ncols = 2)
 
-    fig, ax = plt.subplots(nrows=3)
-    ax[0].plot(rng, results[:, 0], "o-", color="blue", label="Silhouette score")
-    ax[1].plot(rng, results[:, 1], "o-", color="orange", label="CH score")
-    ax[2].plot(rng, results[:, 2], "o-", color="red", label="BIC")
+    ax[0, 0].set_title("K-means")
+    ax[0, 0].plot(rng, results_kmeans[:, 0], "o-", color="blue", label="Silhouette score")
+    ax[1, 0].plot(rng, results_kmeans[:, 1], "o-", color="orange", label="CH score")
+
+    ax[0, 1].set_title("Gaussian Mixture")
+    ax[0, 1].plot(rng, results_gaussian[:, 0], "o-", color="blue", label="Silhouette score")
+    ax[1, 1].plot(rng, results_gaussian[:, 1], "o-", color="orange", label="CH score")
+    ax[2, 1].plot(rng, results_gaussian[:, 2], "o-", color="red", label="BIC")
 
-    for a in ax:
+    # ax is a 2D array when ncols > 1, so flatten it before adding legends
+    for a in ax.ravel():
a.legend(loc="upper right") plt.tight_layout() - plt.savefig("plots/best_k.pdf") + plt.savefig("plots/best_clusters.pdf") plt.show() @@ -61,11 +75,14 @@ def main(encodings_name): X, encodings = f["X_true"], f["features"] + X = X[0:1000] + encodings = encodings[0:1000] + arr_lens = np.array([len(xi) for xi in X]) (len_above_idx,) = np.where(arr_lens >= 30) X, encodings, = get_index((X, encodings), index=len_above_idx) - _plot_kmeans_scores(encodings, min=2, max=100, step=3) + _plot_kmeans_score(encodings, min=2, max=100, step=3) if __name__ == "__main__": diff --git a/data/preprocessed/.gitkeep b/data/preprocessed/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ksparse/__init__.py b/ksparse/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/other/sparse_gam.py b/other/sparse_gam.py index 08b96e2..27a7801 100644 --- a/other/sparse_gam.py +++ b/other/sparse_gam.py @@ -28,7 +28,7 @@ x_train, x_val = train_test_split(X) - input_shape = x_train.shape[1:] + shape = x_train.shape[1:] # Define autoencoder parameters EPOCHS = 10 @@ -43,52 +43,61 @@ INIT_SPARSITY, END_SPARSITY, EPOCHS ) - # k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(x) - - # Build the Autoencoder Model - inputs = Input(shape=input_shape, name="encoder_input") - x = inputs - - for filters in LAYER_FILTERS: - x = Conv2D( - filters=filters, - kernel_size=KERNEL_SIZE, - strides=2, - activation="relu", - padding="same", - data_format="channels_last", - )(x) + i = Input(shape) + x = Flatten()(i) + h = Dense(LATENT_DIM, activation = 'sigmoid')(x) - # Generate the latent vector - x = Flatten()(x) - x = Dense(LATENT_DIM, name="latent_vector")(x) - k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(x) + k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(h) - # Decoder - x = Dense(input_shape[0] * input_shape[1] * input_shape[2])(k_sparse) - x = Reshape(input_shape)(x) + x = Dense(x_train.shape[1] * x_train.shape[2] * x_train.shape[3])(k_sparse) + x = Reshape(shape)(x) + o = Activation("sigmoid")(x) - for filters in LAYER_FILTERS[::-1]: - x = Conv2DTranspose( - filters=filters, - kernel_size=KERNEL_SIZE, - strides=1, - activation="relu", - padding="same", - data_format="channels_last", - )(x) - - x = Conv2DTranspose( - filters=input_shape[-1], - kernel_size=KERNEL_SIZE, - padding="same", - data_format="channels_last", - )(x) + # k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(x) - outputs = Activation(None, name="decoder_output")(x) + # Build the Autoencoder Model + # i = Input(shape=input_shape, name= "encoder_input") + # # for filters in LAYER_FILTERS: + # # x = Conv2D( + # # filters=filters, + # # kernel_size=KERNEL_SIZE, + # # strides=2, + # # activation="relu", + # # padding="same", + # # data_format="channels_last", + # # )(x) + # + # # Generate the latent vector + # x = Flatten()(i) + # x = Dense(LATENT_DIM, activation = "sigmoid")(x) + # k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(x) + # + # + # # Decoder + # x = Dense(int(np.product(input_shape)), activation = "relu")(x) + # x = Reshape(input_shape)(x) + # + # outputs = Dense(x_train.shape[1], activation = 'sigmoid')(k_sparse) + + # for filters in LAYER_FILTERS[::-1]: + # x = Conv2DTranspose( + # filters=filters, + # kernel_size=KERNEL_SIZE, + # strides=1, + # activation="relu", + # padding="same", + # data_format="channels_last", + # )(x) + # + # x = Conv2DTranspose( + # filters=input_shape[-1], + # kernel_size=KERNEL_SIZE, + # 
padding="same", + # data_format="channels_last", + # )(x) # Autoencoder = Encoder + Decoder - autoencoder = Model(inputs, outputs, name="autoencoder") + autoencoder = Model(i, o, name= "autoencoder") autoencoder.summary() autoencoder.compile(loss="mse", optimizer="adam") diff --git a/other/sparse_mnist.py b/other/sparse_mnist.py index 5ed39ee..8bcc6b6 100644 --- a/other/sparse_mnist.py +++ b/other/sparse_mnist.py @@ -16,11 +16,9 @@ x_train, y_train = shuffle(x_train, y_train, random_state = 1) x_test, y_test = shuffle(x_test, y_test, random_state = 1) - def process(x): return x.reshape(x.shape[0], x.shape[1] ** 2) / 255 - x_train = process(x_train) x_test = process(x_test) @@ -35,11 +33,8 @@ def process(x): # Build the k-sparse autoencoder inputs = Input((x_train.shape[1],)) - x = Dense(embedding_size, activation = "relu")(inputs) - x = Dense(embedding_size, activation = "relu")(x) - k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(x) - x = Dense(embedding_size, activation = "relu")(k_sparse) - + h = Dense(embedding_size, activation = 'sigmoid')(inputs) + k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(h) outputs = Dense(x_train.shape[1], activation = 'sigmoid')(k_sparse) ae1 = Model(inputs, outputs) diff --git a/other/sparse_mnist_new.py b/other/sparse_mnist_new.py new file mode 100644 index 0000000..afc0750 --- /dev/null +++ b/other/sparse_mnist_new.py @@ -0,0 +1,52 @@ +import ksparse.utils.mnist.mnist_helper as mh +from ksparse.layers.linear_layer import LinearLayer +from ksparse.layers.sparse_layer import SparseLayer +from ksparse.nets.fcnn import * +from ksparse.utils.activations import * +from ksparse.utils.cost_functions import * +from tensorflow.keras.layers import Dense + +img_size = 28 +num_hidden = 100 +k = 70 +learning_rate = 0.01 +epochs = 10000 +batch_size = 256 +print_epochs = 1000 +num_test_examples = 10 + +helper = mh.mnist_helper() +train_lbl, train_img, test_lbl, test_img = helper.get_data() + +x_data = train_img.reshape(-1, img_size * img_size) / np.float32(256) +test_data = test_img.reshape(-1, img_size * img_size) / np.float32(256) + +layers = [ + # LinearLayer(name="input", n_in=x_data.shape[1], n_out=num_hidden, activation=sigmoid_function), + Dense(units = 32, activation = "relu", input_shape = x_data.shape[1]), + SparseLayer( + name="hidden 1", + n_in=x_data.shape[1], + n_out=num_hidden, + activation=sigmoid_function, + num_k_sparse=k, + ), + LinearLayer( + name="output", + n_in=num_hidden, + n_out=x_data.shape[1], + activation=sigmoid_function, + ), +] + +nn = FCNeuralNet(layers=layers, cost_func=subtract_err) +nn.print_network() + +nn.train( + x_data, + x_data, + learning_rate=learning_rate, + epochs=epochs, + batch_size=batch_size, + print_epochs=print_epochs, +) diff --git a/train_flipped_classifier.py b/other/train_flipped_classifier.py similarity index 100% rename from train_flipped_classifier.py rename to other/train_flipped_classifier.py diff --git a/ts_to_img.py b/other/ts_to_img.py similarity index 100% rename from ts_to_img.py rename to other/ts_to_img.py diff --git a/overlay_data.py b/overlay_data.py new file mode 100644 index 0000000..99ffcad --- /dev/null +++ b/overlay_data.py @@ -0,0 +1,44 @@ +import numpy as np +import matplotlib.pyplot as plt + +def generate_fake_tracks(): + data = [] + for i in range(16): + a = np.sin(np.arange(100) * 0.1) + a = abs(a) + + b = np.ones(len(a)) + + x = np.column_stack((a, b)) + + noise = np.random.normal(2, 0.2, len(x)) + + # Add noise + x[:, 0] *= noise + x[:, 1] *= 
noise + + # Add intensities + x[:, 0] *= np.random.uniform(200, 1000) + x[:, 1] *= np.random.uniform(100) + + data.append(x) + + data = np.array(data) + return data + + +def main(): + data = generate_fake_tracks() + fig, ax = plt.subplots(nrows=4, ncols=4) + ax = ax.ravel() + for i in range(len(ax)): + ax[i].plot(data[i]) + plt.show() + + overlay = data.mean(axis = 0) + fig, ax = plt.subplots() + ax.plot(overlay) + plt.show() + +if __name__ == "__main__": + main() diff --git a/prepare_data.py b/prepare_data.py index 7afb9fe..8839cf0 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -12,15 +12,16 @@ def _filter(args): Parallel filter function """ _, group = args - check1 = group["t"].values[0] == FIRST_FRAME # check that it's not first - check2 = group["t"].values[-1] == LAST_FRAME # or last frame - check3 = ( - len(group) >= LAST_FRAME - ) # check that it's not longer than maximum length - check4 = len(group) <= MIN_LEN # check that it's longer than minimum - if any( - [check1, check2, check3, check4] - ): # if any checks fail, discard trace + # check that it's not first + check1 = group["f"].values[0] == FIRST_FRAME + # or last frame + check2 = group["f"].values[-1] == LAST_FRAME + # check that it's not longer than maximum length + check3 = len(group) >= LAST_FRAME + # check that it's longer than minimum + check4 = len(group) <= MIN_LEN + # if any checks fail, discard trace + if any([check1, check2, check3, check4]): return None else: return group diff --git a/resample_data.py b/resample_data.py new file mode 100644 index 0000000..15219b2 --- /dev/null +++ b/resample_data.py @@ -0,0 +1,24 @@ +import lib.math +import parmap +import numpy as np +from tqdm import tqdm +import matplotlib.pyplot as plt + +def process(xi): + ts = lib.math.resample_timeseries(xi, new_length = 200) + return ts + +X = np.load("data/preprocessed/combined_filt20_var.npz", allow_pickle = True)["data"] + +mp_results = parmap.map(process, tqdm(X)) +newX = np.array(mp_results) + +print(newX.shape) + +fig, ax = plt.subplots(nrows = 3, ncols = 2) +for i in range(3): + ax[i, 0].plot(X[i]) + ax[i, 1].plot(newX[i]) +plt.show() + +np.savez("data/preprocessed/combined_filt20.npz", data = newX) \ No newline at end of file diff --git a/st_predict.py b/st_predict.py index 4050d47..1edbc43 100644 --- a/st_predict.py +++ b/st_predict.py @@ -4,6 +4,7 @@ from typing import Iterable import matplotlib.pyplot as plt + # noinspection PyUnresolvedReferences import mpl_scatter_density import numpy as np @@ -27,6 +28,8 @@ from tensorflow.keras.models import Model from tensorflow.python import keras from tqdm import tqdm +import matplotlib.colors +from sklearn.cluster import DBSCAN import lib.globals import lib.math @@ -215,8 +218,8 @@ def _predict(X_true, idx_true, model_path, savename, single_feature): latest_model_path = _latest_model(model_path) autoencoder = keras.models.load_model( - latest_model_path, custom_objects={"gelu": gelu, - "f1_m": lib.math.f1_m} + latest_model_path, + custom_objects={"gelu": gelu, "f1_m": lib.math.f1_m}, ) encoder = _get_encoding_layer(autoencoder) @@ -268,7 +271,6 @@ def _predict(X_true, idx_true, model_path, savename, single_feature): xi_pred = _xi_true ei = np.zeros(xi_true.shape[0]) - # Make sure they're numpy arrays now and not EagerTensors! 
X_true.extend(np.array(xi_true)) X_pred.extend(np.array(xi_pred)) @@ -344,7 +346,7 @@ def _cluster_kmeans(features, n_clusters): Cluster points using K-means Args: - features (np.array) + features (np.ndarray) n_clusters (int) """ clf = MiniBatchKMeans(n_clusters=n_clusters) @@ -352,6 +354,7 @@ def _cluster_kmeans(features, n_clusters): centers = clf.cluster_centers_ return labels, centers + @timeit @st.cache def _cluster_gmm(features, n_clusters): @@ -359,14 +362,35 @@ def _cluster_gmm(features, n_clusters): Cluster points using Gaussian Mixture Model Args: - features (np.array) + features (np.ndarray) n_clusters (int) """ - clf = GaussianMixture(n_components = n_clusters, covariance_type = "full") + clf = GaussianMixture(n_components=n_clusters, covariance_type="full") labels = clf.fit_predict(features) centers = clf.means_ return labels, centers +@timeit +@st.cache +def _cluster_dbscan(features, eps): + """ + Cluster points using DBSCAN + + Args: + features (np.ndarray) + eps (float) + """ + clf = DBSCAN(eps = eps, n_jobs = -1, min_samples = 200) + clf.fit_predict(features) + labels = clf.labels_ + + centers = [] + for l in set(labels): + f = features[labels == l] + centers.append(np.mean(f, axis = 0)) + centers = np.array(centers) + return labels, centers + def _resample_traces(traces, length, normalize): """ @@ -385,8 +409,7 @@ def _resample_traces(traces, length, normalize): if normalize: new = [] for trace in traces_re: - t = trace / trace.max(axis=0) - t -= np.min(t, axis=0) + t = trace / trace.max(axis=(0, 1)) new.append(t) traces_re = np.array(new) @@ -453,7 +476,7 @@ def _clath_aux_peakfinder(X): Not very tweaked and only used as a weak guidance. Args: - X (np.array): + X (np.ndarray) """ has_peak, has_no_peak = _find_peaks(X) @@ -542,6 +565,22 @@ def _plot_scatter(features, subsample, cluster_labels=None): st.write(fig) +def _plot_density(features): + """ + Plots density map of features + + Args: + features (np.ndarray) + """ + fig = plt.figure() + ax = fig.add_subplot(1, 1, 1, projection="scatter_density") + c = ax.scatter_density( + features[:, 0], features[:, 1], cmap="plasma", dpi=72 + ) + fig.colorbar(c, label="Datapoints per pixel") + st.write(fig) + + def _plot_mse( original_mse, mse_filter, ): @@ -599,6 +638,9 @@ def _plot_traces_preview( sg (float or np.ndarray) colors (list of str) """ + X_true, X_pred, mse, sample_indices = sklearn.utils.shuffle(X_true, X_pred, mse, sample_indices) + + fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 10)) axes = axes.ravel() for i, ax in enumerate(axes): @@ -905,26 +947,74 @@ def main(): st_min_length = st.sidebar.number_input( min_value=1, max_value=100, + value=20, label="Minimum length of data to cluster", key="st_min_length", ) - min_len_labels = np.zeros(len(umap_enc_orig)).astype(int) - min_len_labels.fill(0) + st_max_mse = st.sidebar.number_input( + min_value=0.0, + max_value=max(mse), + value=max(mse), + label="Maximum error of data to cluster", + key="st_max_mse", + ) + + pre_filtered = np.zeros(len(umap_enc_orig)).astype(int) + pre_filtered.fill(0) + # Remove all traces below minimum length from clustering (len_above_idx,) = np.where(arr_lens >= st_min_length) X_true, X_pred, encodings, umap_enc, mse, indices, = get_index( (X_true, X_pred, encodings, umap_enc, mse, indices), index=len_above_idx ) - min_len_labels[len_above_idx] = 1 + pre_filtered[len_above_idx] = 1 - st_clust_n = st.sidebar.number_input(label="Number of clusters", value=3) + # Remove all traces above max error from clustering + 
(mse_below_idx,) = np.where(mse <= st_max_mse) + X_true, X_pred, encodings, umap_enc, mse, indices, = get_index( + (X_true, X_pred, encodings, umap_enc, mse, indices), index=mse_below_idx + ) + pre_filtered[mse_below_idx] = 1 pca, explained_var = _pca(encodings, embed_into_n_components=4) _plot_explained_variance(explained_var) - # clabels, centers = _cluster_kmeans(features=pca, n_clusters=st_clust_n) - clabels, centers = _cluster_gmm(features = pca, n_clusters = st_clust_n) + clustering_methods = ("K-means", "Gaussian Mixture Model", "Density-based clustering") + + st_clust_method = st.sidebar.radio(label = "Clustering method", options = clustering_methods, index = 0) + + if st_clust_method == clustering_methods[0]: + st_clust_n = st.sidebar.number_input(label = "Number of clusters", value = 3) + clabels, centers = _cluster_kmeans(features=pca, n_clusters=st_clust_n) + + elif st_clust_method == clustering_methods[1]: + st_clust_n = st.sidebar.number_input(label="Number of clusters", value=3) + clabels, centers = _cluster_gmm(features=pca, n_clusters=st_clust_n) + + elif st_clust_method == clustering_methods[2]: + st_eps = st.sidebar.number_input(label = "eps", value = 0.5) + clabels, centers = _cluster_dbscan(features = pca, eps = st_eps) + + else: + raise NotImplementedError + + def _plot_cluster_label_distribution(data): + bins = np.arange(0 - 1.5, data.max() + 1.5) - 0.5 + + # then you plot away + fig, ax = plt.subplots() + ax.hist(data, bins) + ax.set_xticks(bins + 0.5) + + svg_write(fig) + + _plot_cluster_label_distribution(clabels) + + st.write(set(clabels)) + st.write(np.histogram(clabels)) + st.write(clabels) + st.write(centers) st.subheader("Lengths") _plot_length_dist( @@ -934,16 +1024,20 @@ def main(): ) st.subheader("UMAP displaying length cutoff filter") _plot_scatter( - features=umap_enc_orig, cluster_labels=min_len_labels, subsample=False + features=umap_enc_orig, cluster_labels=pre_filtered, subsample=False ) + + st.subheader("UMAP density on filtered") + _plot_density(umap_enc) + + st.subheader("UMAP after clustering on filtered") + _plot_scatter(features=umap_enc, cluster_labels=clabels, subsample=False) + st.subheader("PCA after clustering on filtered") _plot_scatter( features=pca[:, [0, 1]], cluster_labels=clabels, subsample=False ) - st.subheader("UMAP after clustering on filtered") - _plot_scatter(features=umap_enc, cluster_labels=clabels, subsample=False) - n_clusters = len(set(clabels)) len_X_true_postfilter = len(X_true) colormap = lib.plotting.get_colors("viridis", n_colors=n_clusters) diff --git a/test.py b/test.py new file mode 100644 index 0000000..b44e9ba --- /dev/null +++ b/test.py @@ -0,0 +1,12 @@ +import numpy as np +from mayavi import mlab + +mlab.options.offscreen = True + +n = 5000 +x = np.random.rand(n) +y = np.random.rand(n) +z = np.random.rand(n) +s = np.sin(x)**2 + np.cos(y) + +mlab.points3d(x, y, z, s, colormap="RdYlBu", scale_factor=0.02, scale_mode='none') diff --git a/test2.py b/test2.py deleted file mode 100644 index e632163..0000000 --- a/test2.py +++ /dev/null @@ -1,125 +0,0 @@ -import os.path -import matplotlib.pyplot as plt -# noinspection PyUnresolvedReferences -import mpl_scatter_density -import numpy as np -import sklearn.utils -import umap.umap_ as umap -from joblib import Parallel, delayed -from sklearn.cluster import MiniBatchKMeans -from sklearn.metrics import calinski_harabasz_score, silhouette_score -from sklearn.mixture import GaussianMixture -from tqdm import tqdm - -import lib.globals -import lib.globals -from lib.math import 
resample_timeseries - - -def _plot_kmeans_scores(X, min, max, step): - """ - Calculates scores for multiple values of kmeans - Args: - X (np.ndarray) - min (int) - max (int) - step (int) - """ - rng = list(range(min, max, step)) - - def process(n): - clf = GaussianMixture(n_components = n, random_state = 42) - # clf = MiniBatchKMeans(n_clusters=n, random_state=42) - labels = clf.fit_predict(X) - - s = silhouette_score(X, labels) - c = calinski_harabasz_score(X, labels) - b = clf.bic(X) - - return s, c, b - - n_jobs = len(rng) - results = Parallel(n_jobs=n_jobs)(delayed(process)(i) for i in tqdm(rng)) - results = np.column_stack(results).T - - fig, ax = plt.subplots(nrows=3) - ax[0].plot(rng, results[:, 0], "o-", color="blue", label="Silhouette score") - ax[1].plot(rng, results[:, 1], "o-", color="orange", label="CH score") - ax[2].plot(rng, results[:, 2], "o-", color="red", label="BIC") - - for a in ax: - a.legend(loc="upper right") - - plt.tight_layout() - plt.savefig("plots/best_k.pdf") - plt.show() - - -def main(encodings_name): - np.random.seed(1) - - f = np.load( - os.path.join(lib.globals.encodings_dir, encodings_name), - allow_pickle=True, - ) - - X = f["X_true"] - X = sklearn.utils.resample(X, n_samples = 10000, replace = False) - - X = np.array([lib.math.resample_timeseries(xi.ravel().reshape(-1, 1), new_length = 200) for xi in X]) - X = np.squeeze(X) - - u = umap.UMAP( - n_components = 2, - random_state = 42, - n_neighbors = 100, - min_dist = 0.0, - init = "spectral", - verbose = True, - ) - e = u.fit_transform(X) - - fig = plt.figure() - - ax = fig.add_subplot(1, 1, 1, projection = "scatter_density") - - clf = MiniBatchKMeans(n_clusters = 35) - labels = clf.fit_predict(e) - centers = clf.cluster_centers_ - - ax.scatter_density(e[:, 0], e[:, 1], c = labels, cmap = "viridis_r") - - for i in range(len(set(labels))): - m = centers[i] - ax.annotate( - xy = m, - s = i, - bbox = dict(boxstyle = "square", fc = "w", ec = "grey", alpha = 0.9), - ) - - Xa = X[labels == 26] - Xb = X[labels == 33] - - colors = ["black", "red"] - - for X in Xb, Xa: - fig, ax = plt.subplots(nrows = 5, ncols = 5) - ax = ax.ravel() - for i in range(len(ax)): - xi = np.zeros((100, 2)) - xi[:, 0] = X[i][:100] - xi[:, 1] = X[i][100:] - - for c in range(xi.shape[-1]): - ax[i].plot(xi[..., c], color = colors[c]) - - ax[i].set_xticks(()) - ax[i].set_yticks(()) - plt.tight_layout() - - plt.show() - - -if __name__ == "__main__": - NAME = "20200124-0206_lstm_vae_bidir_data=combined_filt5_var.npz_dim=128_act=None_bat=4_eps=0.1_zdim=8_anneal=20___pred__combined_filt20_var.npz" - main(NAME) diff --git a/train_autoencoder.py b/train_autoencoder.py index cd8fcb8..db96eb8 100644 --- a/train_autoencoder.py +++ b/train_autoencoder.py @@ -8,6 +8,7 @@ import sklearn.preprocessing import tensorflow as tf import parmap +from tqdm import tqdm import lib.math import lib.models @@ -78,7 +79,7 @@ def _preprocess(X, n_features, max_batch_size, train_size): if __name__ == "__main__": MODELF = (lib.models.lstm_vae_bidir,) - INPUT_NPZ = ("data/preprocessed/fake_tracks_type_3.npz",) + INPUT_NPZ = ("data/preprocessed/combined_filt5_var.npz",) N_TIMESTEPS = None EARLY_STOPPING = 3 @@ -93,10 +94,10 @@ def _preprocess(X, n_features, max_batch_size, train_size): # Best results have been found with keeping latent dim high and zdim low LATENT_DIM = (128,) ACTIVATION = (None,) - ZDIM = (16,) - EPS = (1,) + ZDIM = (8, 16,) + EPS = (0.1, 1,) KEEP_ONLY = (None,) - ANNEAL_TIME = (1, 5, 999,) + ANNEAL_TIME = (1, 5, 20) # Add iterables here for ( @@ 
-123,15 +124,6 @@ def _preprocess(X, n_features, max_batch_size, train_size): X_raw = _get_data(_input_npz) - - def process(xi): - ts = lib.math.resample_timeseries(xi, new_length = 32) - return ts - - mp_results = parmap.map(process, X_raw) - data = np.array(mp_results) - - if _keep_only is not None: X_raw = np.array([x[:, _keep_only].reshape(-1, 1) for x in X_raw])
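The hunk above cuts off right where `train_autoencoder.py` expands the hyperparameter tuples (`LATENT_DIM`, `ZDIM`, `EPS`, `ANNEAL_TIME`, ...) into individual training runs, so the actual loop is not visible in this patch. The sketch below is only a hypothetical illustration of how such tuples are typically consumed with `itertools.product`; the variable names and the `print` body are placeholders, not the repository's code.

```python
import itertools

# Values mirrored from the hunk above; each tuple lists the settings to sweep over.
LATENT_DIM = (128,)
ACTIVATION = (None,)
ZDIM = (8, 16)
EPS = (0.1, 1)
KEEP_ONLY = (None,)
ANNEAL_TIME = (1, 5, 20)

# itertools.product yields every combination, here 1 * 1 * 2 * 2 * 1 * 3 = 12 configurations.
for _latent_dim, _activation, _zdim, _eps, _keep_only, _anneal_time in itertools.product(
    LATENT_DIM, ACTIVATION, ZDIM, EPS, KEEP_ONLY, ANNEAL_TIME
):
    # One autoencoder would be trained per combination of settings.
    print(_latent_dim, _activation, _zdim, _eps, _keep_only, _anneal_time)
```

Expanding the grid this way is what lets tuples such as `ZDIM = (8, 16,)` and `EPS = (0.1, 1,)` produce several trained models from a single invocation.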