Commit: Updates

komodovaran committed Jan 30, 2020
1 parent 88e4f99 commit ea44f83
Showing 20 changed files with 1,210 additions and 174 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
 /data
 /results
 /venv
-/models
+/models
+/data/
29 changes: 22 additions & 7 deletions README.md
@@ -1,9 +1,12 @@
 ### Setup (tested on linux only!)
 1. Install conda and create a conda environment ("what?" "how?" - Google it!)
-2. Install Tensorflow with `conda install tensorflow-gpu`. This **must** be installed as the first package. The contents
-   here are only tested with version 2.0, but it should work on later ones as well. If done correctly, check
+2. Install Tensorflow with `conda install tensorflow-gpu=2.0.0`. This **must** be installed as the first package. The contents
+   here are only tested with version 2.0, but they should work on later ones as well. If done correctly, check the script at
 `/checks/test_tensorflow_gpu_is_working.py`
-3. Install everything else with `pip install requirements.txt -r`
+3. Install the rest of the conda requirements with
+
+````conda install -f -y -q --name py37 -c conda-forge --file conda_requirements.txt````
+3. Install everything else with `pip install -r requirements.txt`
 4. If Tensorflow is installed correctly, run `checks/test_tensorflow_gpu_is_working`. If the device is correctly set up,
    Tensorflow is working and you're good to go!
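(Editor's aside, not part of the commit: if you want a quick sanity check before running the repository's full test script, a minimal sketch of what such a GPU check typically verifies on TF 2.0 could look like this. The assertion message is the editor's, not the repository's.)

```python
# Minimal GPU sanity check (editor's sketch, not the repo's actual script).
# TF 2.0 exposes this under tf.config.experimental; later versions also
# provide tf.config.list_physical_devices.
import tensorflow as tf

print("TensorFlow version:", tf.__version__)
gpus = tf.config.experimental.list_physical_devices("GPU")
print("GPUs visible:", gpus)
assert gpus, "No GPU visible - check the CUDA/cuDNN install"
```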

@@ -33,10 +36,10 @@ or for combined dataframes `['source', 'file', 'particle']`. To combine datafram

 ### Scripts to run, step by step
 1. `get_cme_tracks.py` to convert from CME `.mat` files to a dataframe.
-2. `prepare_data.py` to filter out too short data (set it low initially to be safe).
-3. `autoencoder_train.py` to train a model on the data.
-4. `st_predict.py` to predict and plot the data. This also requires training a
-   UMAP model, which can take ~10 min for each refresh.
+2. `prepare_data.py` to filter out data that is too short (set the threshold low initially to be safe) and traces
+   that would be cut off by the tracking start/end.
+3. `train_autoencoder.py` to train a model on the data.
+4. `st_predict.py` to predict and plot the data. Initially, a UMAP model is trained.
+5. `st_eval.py` once clustering is done and you want to explore the data.

### Things to avoid
@@ -45,6 +48,18 @@ In order to preserve group ordering, the original dataframes must be run through
because this messes up the internal group ordering that was first established when creating the combined dataframe.
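(Editor's aside, not part of the commit: the group keys mentioned above are easiest to see in a small pandas sketch; the dataframe below is hypothetical.)

```python
# Editor's sketch with made-up data: single-file dataframes group on
# ['file', 'particle']; combined dataframes add 'source'. sort=False
# keeps the group order in which the combined dataframe was built,
# which is the ordering the pipeline relies on.
import pandas as pd

df = pd.DataFrame({
    "source": ["exp_a", "exp_a", "exp_b", "exp_b"],
    "file": ["f1", "f1", "f2", "f2"],
    "particle": [0, 0, 1, 1],
    "intensity": [1.0, 2.0, 3.0, 4.0],
})

for key, trace in df.groupby(["source", "file", "particle"], sort=False):
    print(key, len(trace))
```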

### Troubleshooting
+#### Packages are missing
+If any scripts complain about a package I may have missed, install it with
+`pip install packagename`.
+
+
+#### The interface is slow!
+Streamlit was never designed for heavy computation. The underlying calculations are as fast as possible, but
+because of the way Streamlit re-executes the whole script, the interface can appear slow. Rest assured: once you
+have put in the parameters, Streamlit will get there eventually. Just don't touch anything until it's done, because
+the script re-runs whenever any parameter is changed.
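(Editor's aside, not part of the commit: one common mitigation is Streamlit's `st.cache` decorator, which memoizes expensive calls so that a re-run triggered by a parameter change only recomputes what actually depends on that parameter. A minimal sketch, with a stand-in function in place of the repo's real computations:)

```python
# Editor's sketch: cache a heavy computation across Streamlit re-runs.
# st.cache is the caching decorator in Streamlit of this era (circa 2020).
import numpy as np
import streamlit as st

@st.cache
def heavy_computation(n_points):
    # Stand-in for e.g. fitting a UMAP model; only re-runs when
    # n_points changes, otherwise the cached result is returned.
    rng = np.random.RandomState(0)
    return rng.normal(size=(n_points, 2)).mean(axis=0)

n = st.slider("Number of points", 1000, 100000, 10000)
st.write(heavy_computation(n))
```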


#### What to do if Streamlit doesn't finish running:

1. Hit `Ctrl+Z` in the terminal
73 changes: 73 additions & 0 deletions best_clusters.py
@@ -0,0 +1,73 @@
import os.path

import matplotlib.pyplot as plt
import numpy as np
from joblib import Parallel, delayed
from sklearn.metrics import calinski_harabasz_score, silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.cluster import MiniBatchKMeans
from tqdm import tqdm

import lib.globals
from lib.utils import get_index


def _plot_kmeans_scores(X, min_, max_, step):
    """
    Scores a range of cluster counts with a GaussianMixture
    (a MiniBatchKMeans alternative is left commented out).

    Args:
        X (np.ndarray)
        min_ (int)
        max_ (int)
        step (int)
    """
    rng = list(range(min_, max_, step))

    def process(n):
        clf = GaussianMixture(n_components=n, random_state=42)
        # clf = MiniBatchKMeans(n_clusters=n, random_state=42)  # no .bic() if used
        labels = clf.fit_predict(X)

        # Higher silhouette/CH is better; lower BIC is better
        s = silhouette_score(X, labels)
        c = calinski_harabasz_score(X, labels)
        b = clf.bic(X)

        return s, c, b

    n_jobs = len(rng)
    results = Parallel(n_jobs=n_jobs)(delayed(process)(i) for i in tqdm(rng))
    results = np.column_stack(results).T

    fig, ax = plt.subplots(nrows=3)
    ax[0].plot(rng, results[:, 0], "o-", color="blue", label="Silhouette score")
    ax[1].plot(rng, results[:, 1], "o-", color="orange", label="CH score")
    ax[2].plot(rng, results[:, 2], "o-", color="red", label="BIC")

    for a in ax:
        a.legend(loc="upper right")

    plt.tight_layout()
    plt.savefig("plots/best_k.pdf")
    plt.show()


def main(encodings_name):
    f = np.load(
        os.path.join(lib.globals.encodings_dir, encodings_name),
        allow_pickle=True,
    )

    X, encodings = f["X_true"], f["features"]

    # Keep only traces with at least 30 frames
    arr_lens = np.array([len(xi) for xi in X])
    (len_above_idx,) = np.where(arr_lens >= 30)
    X, encodings = get_index((X, encodings), index=len_above_idx)

    _plot_kmeans_scores(encodings, min_=2, max_=100, step=3)


if __name__ == "__main__":
    NAME = "20200124-0206_lstm_vae_bidir_data=combined_filt5_var.npz_dim=128_act=None_bat=4_eps=0.1_zdim=8_anneal=20___pred__combined_filt20_var.npz"
    main(NAME)
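(Editor's aside, not part of the commit: the plot above leaves the choice of cluster count to the eye. A minimal companion sketch could pick the BIC minimum over the same scan range programmatically; the helper name is the editor's.)

```python
# Editor's sketch: re-fit over the same range and return the component
# count with the lowest BIC (lower is better for BIC).
import numpy as np
from sklearn.mixture import GaussianMixture

def best_n_by_bic(X, n_min=2, n_max=100, step=3):
    candidates = list(range(n_min, n_max, step))
    bics = [
        GaussianMixture(n_components=n, random_state=42).fit(X).bic(X)
        for n in candidates
    ]
    return candidates[int(np.argmin(bics))]
```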
131 changes: 131 additions & 0 deletions checks/test_mnist_autoencoder.py
@@ -0,0 +1,131 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import keras
from keras.layers import Activation, Dense, Input
from keras.layers import Conv2D, Flatten
from keras.layers import Reshape, Conv2DTranspose
from keras.models import Model
from keras import backend as K
from keras.datasets import mnist
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

np.random.seed(1337)

# MNIST dataset
(x_train, _), (x_test, _) = mnist.load_data()

image_size = x_train.shape[1]
x_train = np.reshape(x_train, [-1, image_size, image_size, 1])
x_test = np.reshape(x_test, [-1, image_size, image_size, 1])
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

# Generate corrupted MNIST images by adding noise with normal dist
# centered at 0.5 and std=0.5
noise = np.random.normal(loc=0.5, scale=0.5, size=x_train.shape)
x_train_noisy = x_train + noise
noise = np.random.normal(loc=0.5, scale=0.5, size=x_test.shape)
x_test_noisy = x_test + noise

x_train_noisy = np.clip(x_train_noisy, 0., 1.)
x_test_noisy = np.clip(x_test_noisy, 0., 1.)

# Network parameters
input_shape = (image_size, image_size, 1)
batch_size = 128
kernel_size = 3
latent_dim = 16
# Encoder/Decoder number of CNN layers and filters per layer
layer_filters = [32, 64]

# Build the Autoencoder Model
# First build the Encoder Model
inputs = Input(shape=input_shape, name='encoder_input')
x = inputs
# Stack of Conv2D blocks
# Notes:
# 1) Use Batch Normalization before ReLU on deep networks
# 2) Use MaxPooling2D as alternative to strides>1
# - faster but not as good as strides>1
for filters in layer_filters:
    x = Conv2D(filters=filters,
               kernel_size=kernel_size,
               strides=2,
               activation='relu',
               padding='same')(x)

# Shape info needed to build Decoder Model
shape = K.int_shape(x)

# Generate the latent vector
x = Flatten()(x)
latent = Dense(latent_dim, name='latent_vector')(x)

# Instantiate Encoder Model
encoder = Model(inputs, latent, name='encoder')
encoder.summary()

# Build the Decoder Model
latent_inputs = Input(shape=(latent_dim,), name='decoder_input')
x = Dense(shape[1] * shape[2] * shape[3])(latent_inputs)
x = Reshape((shape[1], shape[2], shape[3]))(x)

# Stack of Transposed Conv2D blocks
# Notes:
# 1) Use Batch Normalization before ReLU on deep networks
# 2) Use UpSampling2D as alternative to strides>1
# - faster but not as good as strides>1
for filters in layer_filters[::-1]:
    x = Conv2DTranspose(filters=filters,
                        kernel_size=kernel_size,
                        strides=2,
                        activation='relu',
                        padding='same')(x)

x = Conv2DTranspose(filters=1,
                    kernel_size=kernel_size,
                    padding='same')(x)

outputs = Activation('sigmoid', name='decoder_output')(x)

# Instantiate Decoder Model
decoder = Model(latent_inputs, outputs, name='decoder')
decoder.summary()

# Autoencoder = Encoder + Decoder
# Instantiate Autoencoder Model
autoencoder = Model(inputs, decoder(encoder(inputs)), name='autoencoder')
autoencoder.summary()

autoencoder.compile(loss='mse', optimizer='adam')

# Train the autoencoder
autoencoder.fit(x_train_noisy,
                x_train,
                validation_data=(x_test_noisy, x_test),
                epochs=30,
                batch_size=batch_size)

# Predict the Autoencoder output from corrupted test images
x_decoded = autoencoder.predict(x_test_noisy)

# Display a grid of original, corrupted, and denoised test images
rows, cols = 10, 30
num = rows * cols
imgs = np.concatenate([x_test[:num], x_test_noisy[:num], x_decoded[:num]])
imgs = imgs.reshape((rows * 3, cols, image_size, image_size))
imgs = np.vstack(np.split(imgs, rows, axis=1))
imgs = imgs.reshape((rows * 3, -1, image_size, image_size))
imgs = np.vstack([np.hstack(i) for i in imgs])
imgs = (imgs * 255).astype(np.uint8)
plt.figure()
plt.axis('off')
plt.title('Original images: top rows, '
          'Corrupted Input: middle rows, '
          'Denoised Input: bottom rows')
plt.imshow(imgs, interpolation='none', cmap='gray')
Image.fromarray(imgs).save('corrupted_and_denoised.png')
plt.show()
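(Editor's aside, not part of the commit: this same pattern is what the repository's pipeline relies on; after training, the standalone `encoder` model turns inputs into latent vectors, which is what clustering then runs on. Continuing from the script above:)

```python
# Editor's sketch, reusing `encoder` and `x_test` from the script above:
# encode images to latent vectors, then cluster them, mirroring how the
# repo clusters trace encodings.
from sklearn.mixture import GaussianMixture

latents = encoder.predict(x_test)  # shape: (n_samples, latent_dim)
labels = GaussianMixture(n_components=10, random_state=42).fit_predict(latents)
print(labels[:20])
```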