From 4d97540b72bb2ffe59a7fa76c692d337a71418f6 Mon Sep 17 00:00:00 2001 From: komodovaran Date: Mon, 3 Feb 2020 10:17:14 -0500 Subject: [PATCH] Fixed directories, and more updates --- README.md | 89 ++++++++---- best_clusters.py | 41 ++++-- data/preprocessed/.gitkeep | 0 ksparse/__init__.py | 0 other/sparse_gam.py | 91 ++++++------ other/sparse_mnist.py | 9 +- other/sparse_mnist_new.py | 52 +++++++ .../train_flipped_classifier.py | 0 ts_to_img.py => other/ts_to_img.py | 0 overlay_data.py | 44 ++++++ prepare_data.py | 19 +-- resample_data.py | 24 ++++ st_predict.py | 132 +++++++++++++++--- test.py | 12 ++ test2.py | 125 ----------------- train_autoencoder.py | 18 +-- 16 files changed, 404 insertions(+), 252 deletions(-) create mode 100644 data/preprocessed/.gitkeep create mode 100644 ksparse/__init__.py create mode 100644 other/sparse_mnist_new.py rename train_flipped_classifier.py => other/train_flipped_classifier.py (100%) rename ts_to_img.py => other/ts_to_img.py (100%) create mode 100644 overlay_data.py create mode 100644 resample_data.py create mode 100644 test.py delete mode 100644 test2.py diff --git a/README.md b/README.md index cfcac3a..40fddcf 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,29 @@ ### Setup (tested on linux only!) -1. Install conda and a conda environment ("what?" "how?" - Google it!) -2. Install Tensorflow with `conda install tensorflow-gpu=2.0.0`. This **must** be installed as the first package. The contents -here are only tested with version 2.0, but it should work on later ones as well. If done correctly, check the script at -`/checks/test_tensorflow_gpu_is_working.py` -3. Install the rest of the conda requirements with - -````conda install -f -y -q --name py37 -c conda-forge --file conda_requirements.txt```` -3. Install everything else with `pip install -r requirements.txt` -4. If Tensorflow is installed correctly, run `checks/test_tensorflow_gpu_is_working`. If the device is correctly set up, +1. Install conda and a conda environment. Conda installation instructions for +Linux can be found on the website, as well as how to create an environment. +2. Install Tensorflow with `conda install tensorflow-gpu=2.0.0`. This **must** +be installed as the first package. The contents here are only tested with +version 2.0, but it should work on later ones as well. If done correctly, +check the script at `/checks/test_tensorflow_gpu_is_working.py` +3. Install the rest of the conda requirements with +```` +conda install -f -y -q --namepy37 -c conda-forge --file conda_requirements.txt +```` +4. Install everything else with `pip install -r requirements.txt` +5. If Tensorflow is installed correctly, run +`checks/test_tensorflow_gpu_is_working`. If the device is correctly set up, Tensorflow is working and you're good to go! +6. Conda and pip don't talk together, and this breaks some of the package +installations. If for some reason a package was not installed, try running a +script until you hit a `ModuleNotFound: no module named name_of_package` error, +and try installing the module with `pip install name_of_package`. ### Interactive scripts -Large parts run interactively in the Python-package [Streamlit](www.streamlit.io). If a script has `st_` in front of the -name, it must be run interactively through Streamlit. To launch these from the terminal, write `streamlit run st_myscript.py` +Large parts run interactively in the Python-package +[Streamlit](www.streamlit.io). 
If a script has `st_` in front of the name, it
+must be run interactively through Streamlit (or else it doesn't produce any
+visible output). To launch these from the terminal, write
+`streamlit run st_myscript.py`
 
 ### Naming convention
 
@@ -22,30 +33,56 @@ name, it must be run interactively through Streamlit. To launch these from the t
 
 * name: filename (`model_005.h5`)
 
-### Data format
-Every dataset is preprocessed so that eventually you'll have a `hdf` file and a corresponding `npz` file. All
-computations are done on the `npz` file because it's much faster, and compatible with tensorflow. However, group order
-must be preserved according to the parent `hdf` dataframe.
+* To access something one directory up, write `../` in front of the directory
+name. Two directories up is `../../`, and so on.
 
-A `npz` has just a single group index, whereas a dataframe may have both an `id` and `sub_id` if it's combined
-from multiple sources. In that case, the `id` will correspond to the `npz` index, and `sub_id` will be the actual
-group index in the sub-dataset. Group order is only preserved if dataframe groups are sorted by `['file', 'particle']`,
-or for combined dataframes `['source', 'file', 'particle']`. To combine dataframes, the inputs are stacked in loaded order
-(which must therefore also be sorted!). All of this is done automatically, if the right sequence of steps is taken.
+* All paths in use are defined in `lib/globals.py`, so they can be conveniently
+changed in one place, rather than everywhere in the code.
+
+### Data format
+Every dataset is preprocessed so that eventually you'll have a `hdf` file and a
+corresponding `npz` file. All computations are done on the `npz` file because
+it's much faster and compatible with Tensorflow. However, group order must be
+preserved according to the parent `hdf` dataframe.
+
+A `npz` has just a single group index (i.e. '512' means trace id 512; remember,
+Python counts from 0!), whereas a dataframe may have both an `id` and `sub_id`
+if it's combined from multiple sources. In that case, the `id` will correspond
+to the `npz` index (i.e. the order of appearance), and `sub_id` will be the
+actual group index in the sub-dataset (which is currently not used). Group order
+is only preserved if dataframe groups are sorted by `['file', 'particle']`, or
+for combined dataframes `['source', 'file', 'particle']`. To combine dataframes,
+the inputs are stacked in loaded order (which must therefore also be sorted!).
+All of this is done automatically if the right sequence of steps is taken.
 
 ### Scripts to run, step by step
 
 1. `get_cme_tracks.py` to convert from CME `.mat` files to a dataframe.
-2. `prepare_data.py` to filter out too short data (set it low initially to be safe), traces that would be cut by the
-tracking start/end, and
+2. `prepare_data.py` to filter out data that is too short (set the threshold low
+initially to be safe; the model works almost equally well regardless of the
+minimum length), as well as traces that would be cut off by the tracking
+start/end (i.e. if something starts at frame 0 of the video, it's removed,
+because you can't be sure whether the actual event started at "frame -10").
 3. `train_autoencoder.py` to train a model on the data.
-4. `st_predict.py` to predict and plot the data. Initially, a UMAP model is trained.
-5. `st_eval.py` once clustering is done and you want to explore the data.
+4. `st_predict.py` to predict and plot the data. Initially, a UMAP model is
+trained. This takes a while.
It might even time out your Streamlit session, but
+don't touch anything and it'll be ready eventually.
+5. Every cluster is saved as a combination of model + data names, and will be
+output to `results/cluster_indices/`. This file contains the index of every
+trace (see above on how indexing works) and which cluster it belongs to. Note
+that every change in the analysis **OVERWRITES** the automatically created file
+containing cluster indices. If you have reached a point where you want to save
+them, go to `results/cluster_indices/` and rename the file so you're sure it
+won't be overwritten.
+6. `st_eval.py` once clustering is done and you want to explore the data. It
+currently doesn't have much functionality beyond looking at one or more
+specific datasets/clusters.
 
 ### Things to avoid
 In order to preserve group ordering, the original dataframes must be run through
- `prepare_data.py` if they need to be filtered in some way. **DO NOT** run a combine dataframe through a filter,
- because this messes up the internal group ordering that was first established when creating the combined dataframe.
+ `prepare_data.py` if they need to be filtered in some way. **DO NOT** run a
+ combined dataframe through a filter, because this messes up the internal group
+ ordering that was first established when creating the combined dataframe.
 
 ### Troubleshooting
 #### Packages are missing
diff --git a/best_clusters.py b/best_clusters.py
index 019c498..7e2a7d8 100644
--- a/best_clusters.py
+++ b/best_clusters.py
@@ -8,13 +8,13 @@
 from sklearn.mixture import GaussianMixture
 from sklearn.cluster import MiniBatchKMeans
 from tqdm import tqdm
 
-from lib.utils import get_index
 import lib.globals
 import lib.globals
+from lib.utils import get_index
 
 
-def _plot_kmeans_scores(X, min, max, step):
+def _plot_kmeans_score(X, min, max, step):
     """
     Calculates scores for multiple values of kmeans
     Args:
@@ -25,9 +25,8 @@
     """
     rng = list(range(min, max, step))
 
-    def process(n):
+    def process_gaussian(n):
         clf = GaussianMixture(n_components = n, random_state = 42)
-        # clf = MiniBatchKMeans(n_clusters=n, random_state=42)
         labels = clf.fit_predict(X)
 
         s = silhouette_score(X, labels)
@@ -36,20 +35,35 @@ def process(n):
 
         return s, c, b
 
+    def process_kmeans(n):
+        clf = MiniBatchKMeans(n_clusters=n, random_state=42)
+        labels = clf.fit_predict(X)
+
+        s = silhouette_score(X, labels)
+        c = calinski_harabasz_score(X, labels)
+        return s, c
+
+
     n_jobs = len(rng)
-    results = Parallel(n_jobs=n_jobs)(delayed(process)(i) for i in tqdm(rng))
-    results = np.column_stack(results).T
+    results_kmeans = Parallel(n_jobs=n_jobs)(delayed(process_kmeans)(i) for i in tqdm(rng))
+    results_kmeans = np.column_stack(results_kmeans).T
+
+    # Gaussian mixture scores are computed separately so the right-hand column
+    # of subplots (including BIC, which only exists for GMM) plots its own results
+    results_gaussian = Parallel(n_jobs=n_jobs)(delayed(process_gaussian)(i) for i in tqdm(rng))
+    results_gaussian = np.column_stack(results_gaussian).T
+
+    fig, ax = plt.subplots(nrows=3, ncols = 2)
 
-    fig, ax = plt.subplots(nrows=3)
-    ax[0].plot(rng, results[:, 0], "o-", color="blue", label="Silhouette score")
-    ax[1].plot(rng, results[:, 1], "o-", color="orange", label="CH score")
-    ax[2].plot(rng, results[:, 2], "o-", color="red", label="BIC")
+    ax[0, 0].set_title("K-means")
+    ax[0, 0].plot(rng, results_kmeans[:, 0], "o-", color="blue", label="Silhouette score")
+    ax[1, 0].plot(rng, results_kmeans[:, 1], "o-", color="orange", label="CH score")
+
+    ax[0, 1].set_title("Gaussian Mixture")
+    ax[0, 1].plot(rng, results_gaussian[:, 0], "o-", color="blue", label="Silhouette score")
+    ax[1, 1].plot(rng, results_gaussian[:, 1], "o-", color="orange", label="CH score")
+    ax[2, 1].plot(rng, results_gaussian[:, 2], "o-", color="red", label="BIC")
 
-    for a in ax:
+    # ax is a 2D array when ncols > 1, so flatten it before adding legends
+    for a in ax.ravel():
a.legend(loc="upper right") plt.tight_layout() - plt.savefig("plots/best_k.pdf") + plt.savefig("plots/best_clusters.pdf") plt.show() @@ -61,11 +75,14 @@ def main(encodings_name): X, encodings = f["X_true"], f["features"] + X = X[0:1000] + encodings = encodings[0:1000] + arr_lens = np.array([len(xi) for xi in X]) (len_above_idx,) = np.where(arr_lens >= 30) X, encodings, = get_index((X, encodings), index=len_above_idx) - _plot_kmeans_scores(encodings, min=2, max=100, step=3) + _plot_kmeans_score(encodings, min=2, max=100, step=3) if __name__ == "__main__": diff --git a/data/preprocessed/.gitkeep b/data/preprocessed/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ksparse/__init__.py b/ksparse/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/other/sparse_gam.py b/other/sparse_gam.py index 08b96e2..27a7801 100644 --- a/other/sparse_gam.py +++ b/other/sparse_gam.py @@ -28,7 +28,7 @@ x_train, x_val = train_test_split(X) - input_shape = x_train.shape[1:] + shape = x_train.shape[1:] # Define autoencoder parameters EPOCHS = 10 @@ -43,52 +43,61 @@ INIT_SPARSITY, END_SPARSITY, EPOCHS ) - # k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(x) - - # Build the Autoencoder Model - inputs = Input(shape=input_shape, name="encoder_input") - x = inputs - - for filters in LAYER_FILTERS: - x = Conv2D( - filters=filters, - kernel_size=KERNEL_SIZE, - strides=2, - activation="relu", - padding="same", - data_format="channels_last", - )(x) + i = Input(shape) + x = Flatten()(i) + h = Dense(LATENT_DIM, activation = 'sigmoid')(x) - # Generate the latent vector - x = Flatten()(x) - x = Dense(LATENT_DIM, name="latent_vector")(x) - k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(x) + k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(h) - # Decoder - x = Dense(input_shape[0] * input_shape[1] * input_shape[2])(k_sparse) - x = Reshape(input_shape)(x) + x = Dense(x_train.shape[1] * x_train.shape[2] * x_train.shape[3])(k_sparse) + x = Reshape(shape)(x) + o = Activation("sigmoid")(x) - for filters in LAYER_FILTERS[::-1]: - x = Conv2DTranspose( - filters=filters, - kernel_size=KERNEL_SIZE, - strides=1, - activation="relu", - padding="same", - data_format="channels_last", - )(x) - - x = Conv2DTranspose( - filters=input_shape[-1], - kernel_size=KERNEL_SIZE, - padding="same", - data_format="channels_last", - )(x) + # k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(x) - outputs = Activation(None, name="decoder_output")(x) + # Build the Autoencoder Model + # i = Input(shape=input_shape, name= "encoder_input") + # # for filters in LAYER_FILTERS: + # # x = Conv2D( + # # filters=filters, + # # kernel_size=KERNEL_SIZE, + # # strides=2, + # # activation="relu", + # # padding="same", + # # data_format="channels_last", + # # )(x) + # + # # Generate the latent vector + # x = Flatten()(i) + # x = Dense(LATENT_DIM, activation = "sigmoid")(x) + # k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(x) + # + # + # # Decoder + # x = Dense(int(np.product(input_shape)), activation = "relu")(x) + # x = Reshape(input_shape)(x) + # + # outputs = Dense(x_train.shape[1], activation = 'sigmoid')(k_sparse) + + # for filters in LAYER_FILTERS[::-1]: + # x = Conv2DTranspose( + # filters=filters, + # kernel_size=KERNEL_SIZE, + # strides=1, + # activation="relu", + # padding="same", + # data_format="channels_last", + # )(x) + # + # x = Conv2DTranspose( + # filters=input_shape[-1], + # kernel_size=KERNEL_SIZE, + # 
padding="same", + # data_format="channels_last", + # )(x) # Autoencoder = Encoder + Decoder - autoencoder = Model(inputs, outputs, name="autoencoder") + autoencoder = Model(i, o, name= "autoencoder") autoencoder.summary() autoencoder.compile(loss="mse", optimizer="adam") diff --git a/other/sparse_mnist.py b/other/sparse_mnist.py index 5ed39ee..8bcc6b6 100644 --- a/other/sparse_mnist.py +++ b/other/sparse_mnist.py @@ -16,11 +16,9 @@ x_train, y_train = shuffle(x_train, y_train, random_state = 1) x_test, y_test = shuffle(x_test, y_test, random_state = 1) - def process(x): return x.reshape(x.shape[0], x.shape[1] ** 2) / 255 - x_train = process(x_train) x_test = process(x_test) @@ -35,11 +33,8 @@ def process(x): # Build the k-sparse autoencoder inputs = Input((x_train.shape[1],)) - x = Dense(embedding_size, activation = "relu")(inputs) - x = Dense(embedding_size, activation = "relu")(x) - k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(x) - x = Dense(embedding_size, activation = "relu")(k_sparse) - + h = Dense(embedding_size, activation = 'sigmoid')(inputs) + k_sparse = KSparse(sparsity_levels = sparsity_levels, name = 'KSparse')(h) outputs = Dense(x_train.shape[1], activation = 'sigmoid')(k_sparse) ae1 = Model(inputs, outputs) diff --git a/other/sparse_mnist_new.py b/other/sparse_mnist_new.py new file mode 100644 index 0000000..afc0750 --- /dev/null +++ b/other/sparse_mnist_new.py @@ -0,0 +1,52 @@ +import ksparse.utils.mnist.mnist_helper as mh +from ksparse.layers.linear_layer import LinearLayer +from ksparse.layers.sparse_layer import SparseLayer +from ksparse.nets.fcnn import * +from ksparse.utils.activations import * +from ksparse.utils.cost_functions import * +from tensorflow.keras.layers import Dense + +img_size = 28 +num_hidden = 100 +k = 70 +learning_rate = 0.01 +epochs = 10000 +batch_size = 256 +print_epochs = 1000 +num_test_examples = 10 + +helper = mh.mnist_helper() +train_lbl, train_img, test_lbl, test_img = helper.get_data() + +x_data = train_img.reshape(-1, img_size * img_size) / np.float32(256) +test_data = test_img.reshape(-1, img_size * img_size) / np.float32(256) + +layers = [ + # LinearLayer(name="input", n_in=x_data.shape[1], n_out=num_hidden, activation=sigmoid_function), + Dense(units = 32, activation = "relu", input_shape = x_data.shape[1]), + SparseLayer( + name="hidden 1", + n_in=x_data.shape[1], + n_out=num_hidden, + activation=sigmoid_function, + num_k_sparse=k, + ), + LinearLayer( + name="output", + n_in=num_hidden, + n_out=x_data.shape[1], + activation=sigmoid_function, + ), +] + +nn = FCNeuralNet(layers=layers, cost_func=subtract_err) +nn.print_network() + +nn.train( + x_data, + x_data, + learning_rate=learning_rate, + epochs=epochs, + batch_size=batch_size, + print_epochs=print_epochs, +) diff --git a/train_flipped_classifier.py b/other/train_flipped_classifier.py similarity index 100% rename from train_flipped_classifier.py rename to other/train_flipped_classifier.py diff --git a/ts_to_img.py b/other/ts_to_img.py similarity index 100% rename from ts_to_img.py rename to other/ts_to_img.py diff --git a/overlay_data.py b/overlay_data.py new file mode 100644 index 0000000..99ffcad --- /dev/null +++ b/overlay_data.py @@ -0,0 +1,44 @@ +import numpy as np +import matplotlib.pyplot as plt + +def generate_fake_tracks(): + data = [] + for i in range(16): + a = np.sin(np.arange(100) * 0.1) + a = abs(a) + + b = np.ones(len(a)) + + x = np.column_stack((a, b)) + + noise = np.random.normal(2, 0.2, len(x)) + + # Add noise + x[:, 0] *= noise + x[:, 1] *= 
noise + + # Add intensities + x[:, 0] *= np.random.uniform(200, 1000) + x[:, 1] *= np.random.uniform(100) + + data.append(x) + + data = np.array(data) + return data + + +def main(): + data = generate_fake_tracks() + fig, ax = plt.subplots(nrows=4, ncols=4) + ax = ax.ravel() + for i in range(len(ax)): + ax[i].plot(data[i]) + plt.show() + + overlay = data.mean(axis = 0) + fig, ax = plt.subplots() + ax.plot(overlay) + plt.show() + +if __name__ == "__main__": + main() diff --git a/prepare_data.py b/prepare_data.py index 7afb9fe..8839cf0 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -12,15 +12,16 @@ def _filter(args): Parallel filter function """ _, group = args - check1 = group["t"].values[0] == FIRST_FRAME # check that it's not first - check2 = group["t"].values[-1] == LAST_FRAME # or last frame - check3 = ( - len(group) >= LAST_FRAME - ) # check that it's not longer than maximum length - check4 = len(group) <= MIN_LEN # check that it's longer than minimum - if any( - [check1, check2, check3, check4] - ): # if any checks fail, discard trace + # check that it's not first + check1 = group["f"].values[0] == FIRST_FRAME + # or last frame + check2 = group["f"].values[-1] == LAST_FRAME + # check that it's not longer than maximum length + check3 = len(group) >= LAST_FRAME + # check that it's longer than minimum + check4 = len(group) <= MIN_LEN + # if any checks fail, discard trace + if any([check1, check2, check3, check4]): return None else: return group diff --git a/resample_data.py b/resample_data.py new file mode 100644 index 0000000..15219b2 --- /dev/null +++ b/resample_data.py @@ -0,0 +1,24 @@ +import lib.math +import parmap +import numpy as np +from tqdm import tqdm +import matplotlib.pyplot as plt + +def process(xi): + ts = lib.math.resample_timeseries(xi, new_length = 200) + return ts + +X = np.load("data/preprocessed/combined_filt20_var.npz", allow_pickle = True)["data"] + +mp_results = parmap.map(process, tqdm(X)) +newX = np.array(mp_results) + +print(newX.shape) + +fig, ax = plt.subplots(nrows = 3, ncols = 2) +for i in range(3): + ax[i, 0].plot(X[i]) + ax[i, 1].plot(newX[i]) +plt.show() + +np.savez("data/preprocessed/combined_filt20.npz", data = newX) \ No newline at end of file diff --git a/st_predict.py b/st_predict.py index 4050d47..1edbc43 100644 --- a/st_predict.py +++ b/st_predict.py @@ -4,6 +4,7 @@ from typing import Iterable import matplotlib.pyplot as plt + # noinspection PyUnresolvedReferences import mpl_scatter_density import numpy as np @@ -27,6 +28,8 @@ from tensorflow.keras.models import Model from tensorflow.python import keras from tqdm import tqdm +import matplotlib.colors +from sklearn.cluster import DBSCAN import lib.globals import lib.math @@ -215,8 +218,8 @@ def _predict(X_true, idx_true, model_path, savename, single_feature): latest_model_path = _latest_model(model_path) autoencoder = keras.models.load_model( - latest_model_path, custom_objects={"gelu": gelu, - "f1_m": lib.math.f1_m} + latest_model_path, + custom_objects={"gelu": gelu, "f1_m": lib.math.f1_m}, ) encoder = _get_encoding_layer(autoencoder) @@ -268,7 +271,6 @@ def _predict(X_true, idx_true, model_path, savename, single_feature): xi_pred = _xi_true ei = np.zeros(xi_true.shape[0]) - # Make sure they're numpy arrays now and not EagerTensors! 
X_true.extend(np.array(xi_true)) X_pred.extend(np.array(xi_pred)) @@ -344,7 +346,7 @@ def _cluster_kmeans(features, n_clusters): Cluster points using K-means Args: - features (np.array) + features (np.ndarray) n_clusters (int) """ clf = MiniBatchKMeans(n_clusters=n_clusters) @@ -352,6 +354,7 @@ def _cluster_kmeans(features, n_clusters): centers = clf.cluster_centers_ return labels, centers + @timeit @st.cache def _cluster_gmm(features, n_clusters): @@ -359,14 +362,35 @@ def _cluster_gmm(features, n_clusters): Cluster points using Gaussian Mixture Model Args: - features (np.array) + features (np.ndarray) n_clusters (int) """ - clf = GaussianMixture(n_components = n_clusters, covariance_type = "full") + clf = GaussianMixture(n_components=n_clusters, covariance_type="full") labels = clf.fit_predict(features) centers = clf.means_ return labels, centers +@timeit +@st.cache +def _cluster_dbscan(features, eps): + """ + Cluster points using DBSCAN + + Args: + features (np.ndarray) + eps (float) + """ + clf = DBSCAN(eps = eps, n_jobs = -1, min_samples = 200) + clf.fit_predict(features) + labels = clf.labels_ + + centers = [] + for l in set(labels): + f = features[labels == l] + centers.append(np.mean(f, axis = 0)) + centers = np.array(centers) + return labels, centers + def _resample_traces(traces, length, normalize): """ @@ -385,8 +409,7 @@ def _resample_traces(traces, length, normalize): if normalize: new = [] for trace in traces_re: - t = trace / trace.max(axis=0) - t -= np.min(t, axis=0) + t = trace / trace.max(axis=(0, 1)) new.append(t) traces_re = np.array(new) @@ -453,7 +476,7 @@ def _clath_aux_peakfinder(X): Not very tweaked and only used as a weak guidance. Args: - X (np.array): + X (np.ndarray) """ has_peak, has_no_peak = _find_peaks(X) @@ -542,6 +565,22 @@ def _plot_scatter(features, subsample, cluster_labels=None): st.write(fig) +def _plot_density(features): + """ + Plots density map of features + + Args: + features (np.ndarray) + """ + fig = plt.figure() + ax = fig.add_subplot(1, 1, 1, projection="scatter_density") + c = ax.scatter_density( + features[:, 0], features[:, 1], cmap="plasma", dpi=72 + ) + fig.colorbar(c, label="Datapoints per pixel") + st.write(fig) + + def _plot_mse( original_mse, mse_filter, ): @@ -599,6 +638,9 @@ def _plot_traces_preview( sg (float or np.ndarray) colors (list of str) """ + X_true, X_pred, mse, sample_indices = sklearn.utils.shuffle(X_true, X_pred, mse, sample_indices) + + fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 10)) axes = axes.ravel() for i, ax in enumerate(axes): @@ -905,26 +947,74 @@ def main(): st_min_length = st.sidebar.number_input( min_value=1, max_value=100, + value=20, label="Minimum length of data to cluster", key="st_min_length", ) - min_len_labels = np.zeros(len(umap_enc_orig)).astype(int) - min_len_labels.fill(0) + st_max_mse = st.sidebar.number_input( + min_value=0.0, + max_value=max(mse), + value=max(mse), + label="Maximum error of data to cluster", + key="st_max_mse", + ) + + pre_filtered = np.zeros(len(umap_enc_orig)).astype(int) + pre_filtered.fill(0) + # Remove all traces below minimum length from clustering (len_above_idx,) = np.where(arr_lens >= st_min_length) X_true, X_pred, encodings, umap_enc, mse, indices, = get_index( (X_true, X_pred, encodings, umap_enc, mse, indices), index=len_above_idx ) - min_len_labels[len_above_idx] = 1 + pre_filtered[len_above_idx] = 1 - st_clust_n = st.sidebar.number_input(label="Number of clusters", value=3) + # Remove all traces above max error from clustering + 
(mse_below_idx,) = np.where(mse <= st_max_mse) + X_true, X_pred, encodings, umap_enc, mse, indices, = get_index( + (X_true, X_pred, encodings, umap_enc, mse, indices), index=mse_below_idx + ) + pre_filtered[mse_below_idx] = 1 pca, explained_var = _pca(encodings, embed_into_n_components=4) _plot_explained_variance(explained_var) - # clabels, centers = _cluster_kmeans(features=pca, n_clusters=st_clust_n) - clabels, centers = _cluster_gmm(features = pca, n_clusters = st_clust_n) + clustering_methods = ("K-means", "Gaussian Mixture Model", "Density-based clustering") + + st_clust_method = st.sidebar.radio(label = "Clustering method", options = clustering_methods, index = 0) + + if st_clust_method == clustering_methods[0]: + st_clust_n = st.sidebar.number_input(label = "Number of clusters", value = 3) + clabels, centers = _cluster_kmeans(features=pca, n_clusters=st_clust_n) + + elif st_clust_method == clustering_methods[1]: + st_clust_n = st.sidebar.number_input(label="Number of clusters", value=3) + clabels, centers = _cluster_gmm(features=pca, n_clusters=st_clust_n) + + elif st_clust_method == clustering_methods[2]: + st_eps = st.sidebar.number_input(label = "eps", value = 0.5) + clabels, centers = _cluster_dbscan(features = pca, eps = st_eps) + + else: + raise NotImplementedError + + def _plot_cluster_label_distribution(data): + bins = np.arange(0 - 1.5, data.max() + 1.5) - 0.5 + + # then you plot away + fig, ax = plt.subplots() + ax.hist(data, bins) + ax.set_xticks(bins + 0.5) + + svg_write(fig) + + _plot_cluster_label_distribution(clabels) + + st.write(set(clabels)) + st.write(np.histogram(clabels)) + st.write(clabels) + st.write(centers) st.subheader("Lengths") _plot_length_dist( @@ -934,16 +1024,20 @@ def main(): ) st.subheader("UMAP displaying length cutoff filter") _plot_scatter( - features=umap_enc_orig, cluster_labels=min_len_labels, subsample=False + features=umap_enc_orig, cluster_labels=pre_filtered, subsample=False ) + + st.subheader("UMAP density on filtered") + _plot_density(umap_enc) + + st.subheader("UMAP after clustering on filtered") + _plot_scatter(features=umap_enc, cluster_labels=clabels, subsample=False) + st.subheader("PCA after clustering on filtered") _plot_scatter( features=pca[:, [0, 1]], cluster_labels=clabels, subsample=False ) - st.subheader("UMAP after clustering on filtered") - _plot_scatter(features=umap_enc, cluster_labels=clabels, subsample=False) - n_clusters = len(set(clabels)) len_X_true_postfilter = len(X_true) colormap = lib.plotting.get_colors("viridis", n_colors=n_clusters) diff --git a/test.py b/test.py new file mode 100644 index 0000000..b44e9ba --- /dev/null +++ b/test.py @@ -0,0 +1,12 @@ +import numpy as np +from mayavi import mlab + +mlab.options.offscreen = True + +n = 5000 +x = np.random.rand(n) +y = np.random.rand(n) +z = np.random.rand(n) +s = np.sin(x)**2 + np.cos(y) + +mlab.points3d(x, y, z, s, colormap="RdYlBu", scale_factor=0.02, scale_mode='none') diff --git a/test2.py b/test2.py deleted file mode 100644 index e632163..0000000 --- a/test2.py +++ /dev/null @@ -1,125 +0,0 @@ -import os.path -import matplotlib.pyplot as plt -# noinspection PyUnresolvedReferences -import mpl_scatter_density -import numpy as np -import sklearn.utils -import umap.umap_ as umap -from joblib import Parallel, delayed -from sklearn.cluster import MiniBatchKMeans -from sklearn.metrics import calinski_harabasz_score, silhouette_score -from sklearn.mixture import GaussianMixture -from tqdm import tqdm - -import lib.globals -import lib.globals -from lib.math import 
resample_timeseries - - -def _plot_kmeans_scores(X, min, max, step): - """ - Calculates scores for multiple values of kmeans - Args: - X (np.ndarray) - min (int) - max (int) - step (int) - """ - rng = list(range(min, max, step)) - - def process(n): - clf = GaussianMixture(n_components = n, random_state = 42) - # clf = MiniBatchKMeans(n_clusters=n, random_state=42) - labels = clf.fit_predict(X) - - s = silhouette_score(X, labels) - c = calinski_harabasz_score(X, labels) - b = clf.bic(X) - - return s, c, b - - n_jobs = len(rng) - results = Parallel(n_jobs=n_jobs)(delayed(process)(i) for i in tqdm(rng)) - results = np.column_stack(results).T - - fig, ax = plt.subplots(nrows=3) - ax[0].plot(rng, results[:, 0], "o-", color="blue", label="Silhouette score") - ax[1].plot(rng, results[:, 1], "o-", color="orange", label="CH score") - ax[2].plot(rng, results[:, 2], "o-", color="red", label="BIC") - - for a in ax: - a.legend(loc="upper right") - - plt.tight_layout() - plt.savefig("plots/best_k.pdf") - plt.show() - - -def main(encodings_name): - np.random.seed(1) - - f = np.load( - os.path.join(lib.globals.encodings_dir, encodings_name), - allow_pickle=True, - ) - - X = f["X_true"] - X = sklearn.utils.resample(X, n_samples = 10000, replace = False) - - X = np.array([lib.math.resample_timeseries(xi.ravel().reshape(-1, 1), new_length = 200) for xi in X]) - X = np.squeeze(X) - - u = umap.UMAP( - n_components = 2, - random_state = 42, - n_neighbors = 100, - min_dist = 0.0, - init = "spectral", - verbose = True, - ) - e = u.fit_transform(X) - - fig = plt.figure() - - ax = fig.add_subplot(1, 1, 1, projection = "scatter_density") - - clf = MiniBatchKMeans(n_clusters = 35) - labels = clf.fit_predict(e) - centers = clf.cluster_centers_ - - ax.scatter_density(e[:, 0], e[:, 1], c = labels, cmap = "viridis_r") - - for i in range(len(set(labels))): - m = centers[i] - ax.annotate( - xy = m, - s = i, - bbox = dict(boxstyle = "square", fc = "w", ec = "grey", alpha = 0.9), - ) - - Xa = X[labels == 26] - Xb = X[labels == 33] - - colors = ["black", "red"] - - for X in Xb, Xa: - fig, ax = plt.subplots(nrows = 5, ncols = 5) - ax = ax.ravel() - for i in range(len(ax)): - xi = np.zeros((100, 2)) - xi[:, 0] = X[i][:100] - xi[:, 1] = X[i][100:] - - for c in range(xi.shape[-1]): - ax[i].plot(xi[..., c], color = colors[c]) - - ax[i].set_xticks(()) - ax[i].set_yticks(()) - plt.tight_layout() - - plt.show() - - -if __name__ == "__main__": - NAME = "20200124-0206_lstm_vae_bidir_data=combined_filt5_var.npz_dim=128_act=None_bat=4_eps=0.1_zdim=8_anneal=20___pred__combined_filt20_var.npz" - main(NAME) diff --git a/train_autoencoder.py b/train_autoencoder.py index cd8fcb8..db96eb8 100644 --- a/train_autoencoder.py +++ b/train_autoencoder.py @@ -8,6 +8,7 @@ import sklearn.preprocessing import tensorflow as tf import parmap +from tqdm import tqdm import lib.math import lib.models @@ -78,7 +79,7 @@ def _preprocess(X, n_features, max_batch_size, train_size): if __name__ == "__main__": MODELF = (lib.models.lstm_vae_bidir,) - INPUT_NPZ = ("data/preprocessed/fake_tracks_type_3.npz",) + INPUT_NPZ = ("data/preprocessed/combined_filt5_var.npz",) N_TIMESTEPS = None EARLY_STOPPING = 3 @@ -93,10 +94,10 @@ def _preprocess(X, n_features, max_batch_size, train_size): # Best results have been found with keeping latent dim high and zdim low LATENT_DIM = (128,) ACTIVATION = (None,) - ZDIM = (16,) - EPS = (1,) + ZDIM = (8, 16,) + EPS = (0.1, 1,) KEEP_ONLY = (None,) - ANNEAL_TIME = (1, 5, 999,) + ANNEAL_TIME = (1, 5, 20) # Add iterables here for ( @@ 
-123,15 +124,6 @@ def _preprocess(X, n_features, max_batch_size, train_size): X_raw = _get_data(_input_npz) - - def process(xi): - ts = lib.math.resample_timeseries(xi, new_length = 32) - return ts - - mp_results = parmap.map(process, X_raw) - data = np.array(mp_results) - - if _keep_only is not None: X_raw = np.array([x[:, _keep_only].reshape(-1, 1) for x in X_raw])
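The hunk above cuts off right where `train_autoencoder.py` expands the hyperparameter tuples (`LATENT_DIM`, `ZDIM`, `EPS`, `ANNEAL_TIME`, ...) into individual training runs, so the actual loop is not visible in this patch. The sketch below is only a hypothetical illustration of how such tuples are typically consumed with `itertools.product`; the variable names and the `print` body are placeholders, not the repository's code.

```python
import itertools

# Values mirrored from the hunk above; each tuple lists the settings to sweep over.
LATENT_DIM = (128,)
ACTIVATION = (None,)
ZDIM = (8, 16)
EPS = (0.1, 1)
KEEP_ONLY = (None,)
ANNEAL_TIME = (1, 5, 20)

# itertools.product yields every combination, here 1 * 1 * 2 * 2 * 1 * 3 = 12 configurations.
for _latent_dim, _activation, _zdim, _eps, _keep_only, _anneal_time in itertools.product(
    LATENT_DIM, ACTIVATION, ZDIM, EPS, KEEP_ONLY, ANNEAL_TIME
):
    # One autoencoder would be trained per combination of settings.
    print(_latent_dim, _activation, _zdim, _eps, _keep_only, _anneal_time)
```

Expanding the grid this way is what lets tuples such as `ZDIM = (8, 16,)` and `EPS = (0.1, 1,)` produce several trained models from a single invocation.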