Improvements.
komodovaran committed Jan 29, 2020
1 parent 29aadee commit 88e4f99
Showing 15 changed files with 839 additions and 505 deletions.
25 changes: 16 additions & 9 deletions README.md
@@ -4,6 +4,8 @@
here are only tested with version 2.0, but it should work on later ones as well. If done correctly, check
`/checks/test_tensorflow_gpu_is_working.py`
3. Install everything else with `pip install -r requirements.txt`
4. If Tensorflow is installed correctly, run `checks/test_tensorflow_gpu_is_working.py`. If the device is correctly set up,
   Tensorflow is working and you're good to go!
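
For reference, a GPU check can be as small as the sketch below; this is an assumption about what `checks/test_tensorflow_gpu_is_working.py` does, since the script itself isn't shown in this commit:

```python
# Hypothetical minimal GPU check (TF 2.0-era API).
import tensorflow as tf

print(tf.__version__)
print("GPU available:", tf.test.is_gpu_available())
```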

### Interactive scripts
Large parts run interactively in the Python package [Streamlit](https://www.streamlit.io). If a script has `st_` in front of the
@@ -24,38 +26,43 @@
must be preserved according to the parent `hdf` dataframe.

An `npz` has just a single group index, whereas a dataframe may have both an `id` and a `sub_id` if it's combined
from multiple sources. In that case, the `id` will correspond to the `npz` index, and `sub_id` will be the actual
-group index in the sub-dataset. Group order is only preserved if dataframe groups are sorted by `['file', 'particle']`, or
-for combined dataframes `['source', 'file', 'particle']`. To combine dataframes, the inputs are stacked in loaded order
+group index in the sub-dataset. Group order is only preserved if dataframe groups are sorted by `['file', 'particle']`,
+or for combined dataframes `['source', 'file', 'particle']`. To combine dataframes, the inputs are stacked in loaded order
(which must therefore also be sorted!). All of this is done automatically if the right sequence of steps is taken.
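
To illustrate the ordering rules, here is a minimal pandas sketch; the file names and the combining code are assumptions for illustration, not taken from this repo:

```python
# Hypothetical combination of two track dataframes with stable group order.
import pandas as pd

paths = sorted(["tracks-A.h5", "tracks-B.h5"])   # inputs must be loaded in sorted order
dfs = [pd.read_hdf(p) for p in paths]
for source_id, df in enumerate(dfs):
    df["source"] = source_id                     # index of the parent dataset
combined = pd.concat(dfs, ignore_index=True)
# Group order is only preserved when sorted like this:
combined = combined.sort_values(["source", "file", "particle"])
groups = combined.groupby(["source", "file", "particle"])
```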


### Scripts to run, step by step
1. `get_cme_tracks.py` to convert from CME `.mat` files to a dataframe.
2. `prepare_data.py` to filter out data that is too short (set the length cutoff low initially, to be safe).
3. `autoencoder_train.py` to train a model on the data.
-4. `st_predict.py` to predict and plot the data. This also requires training a UMAP model, which can take ~10 min for each refresh.
+4. `st_predict.py` to predict and plot the data. This also requires training a
+   UMAP model, which can take ~10 min for each refresh.
5. `st_eval.py` once clustering is done and you want to explore the data.
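
As a sketch, the whole sequence could be driven like this (script arguments are omitted; whether any are required isn't shown in this diff):

```python
# Hypothetical driver for the pipeline steps above.
import subprocess

for script in ["get_cme_tracks.py", "prepare_data.py", "autoencoder_train.py"]:
    subprocess.run(["python", script], check=True)

# The st_* scripts are Streamlit apps and are launched through streamlit instead:
subprocess.run(["streamlit", "run", "st_predict.py"], check=True)
```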

### Things to avoid
-In order to preserve group ordering, the original dataframes must be run through `prepare_data.py` if they need to be
-filtered in some way. **DO NOT** run a combine dataframe through a filter, because this messes up the internal group
-ordering that was first established when creating the combined dataframe.
+In order to preserve group ordering, the original dataframes must be run through
+`prepare_data.py` if they need to be filtered in some way. **DO NOT** run a combined dataframe through a filter,
+because this messes up the internal group ordering that was first established when creating the combined dataframe.

### Troubleshooting
-What to do if Streamlit doesn't finish running:
+#### What to do if Streamlit doesn't finish running:

1. Hit `Ctrl+Z` in the terminal
2. If the above doesn't work, write `killall streamlit`

-If, after force-quitting Streamlit you get an error like
+If, after using Streamlit, you get an error like
````
tensorflow/core/kernels/cudnn_rnn_ops.cc:1624] Check failed: stream->parent()->GetRnnAlgorithms(&algorithms)
````
-It means that Streamlit was improperly exited and a Tensorflow GPU session is still active.
+It means that a Tensorflow GPU session is still active from the Streamlit session.
To fix this, open a terminal and write
1. `nvidia-smi`, and search for the `pid` of the Streamlit process.
2. `kill -9 pid`, where `pid` is the number found above.

#### What to do if you can't delete a directory:
Tensorflow by default creates directories with incorrect permissions for PyCharm.
To fix this and make them deletable from PyCharm, navigate to the base directory and run
`sudo chmod -R 777 models`.

### Still having problems?
Open an issue and I'll see what I can do...
6 changes: 4 additions & 2 deletions get_cme_tracks.py
@@ -61,6 +61,8 @@ def main(names, input, output):
    input = input.format(name)
    output = output.format(name)

    print(input)

    experiment_name = (
        input.split("/")[-3].replace(" ", "_").replace("-", "_").upper()
    )
@@ -94,12 +96,12 @@


if __name__ == "__main__":
-    INPUT = "/media/tklab/linux-data/Data/{}/**/ProcessedTracks.mat"
+    INPUT = "../../../Data/{}/**/ProcessedTracks.mat"
    OUTPUT = "data/preprocessed/tracks-{}.h5"
    NAMES = (
        "CLTA-TagRFP EGFP-Aux1-A7D2 EGFP-Gak-F6",
        "CLTA-TagRFP EGFP-Aux1-A7D2",
        "CLTA-TagRFP EGFP-Gak-A8",
    )

-    main(names=NAMES, input=INPUT, output=OUTPUT)
\ No newline at end of file
+    main(names=NAMES, input=INPUT, output=OUTPUT)
5 changes: 4 additions & 1 deletion lib/globals.py
@@ -1,9 +1,12 @@
groupby = ["file", "particle"]
groupby_src = ["file", "particle", "source"]

top_video_dir = "/media/linux-data/Data/"

models_dir = "models/"
data_preprocessed_dir = "data/preprocessed/"
results_base_dir = "results/"
cluster_idx_dir = "results/cluster_indices/"
encodings_dir = "results/encodings/"
-umap_dir = "results/umap/"
\ No newline at end of file
+umap_dir = "results/umap/"
labels_dir = "results/saved_labels/"
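
For context, these are plain module-level path constants; a hypothetical consumer (the importing code below is an assumption, not part of this commit):

```python
# Hypothetical usage of the constants in lib/globals.py.
import os

from lib import globals

os.makedirs(globals.labels_dir, exist_ok=True)  # e.g. ensure results/saved_labels/ exists
```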
31 changes: 27 additions & 4 deletions lib/math.py
@@ -1,11 +1,15 @@
import numpy as np
import parmap
import scipy.interpolate
-from scipy import fftpack
-from lib.utils import timeit, est_proc
import sklearn.mixture
+from scipy import fftpack
+from scipy.cluster import hierarchy
+from scipy.spatial import distance
+from sklearn.metrics.pairwise import euclidean_distances
from tqdm import tqdm
+
+from lib.utils import est_proc, timeit


def div0(a, b):
"""Converts all zero-division results to zeros"""
@@ -182,10 +186,15 @@ def modified_z_score(x):
    return modified_z


-def standardize(X: np.ndarray, mu: float, sigma: float) -> np.ndarray:
+def standardize(X, mu, sigma):
    """
    Standardizes given samples individually to (0, 1) normal distribution.
    Works on unevenly sized arrays too.
+    Args:
+        X (np.ndarray)
+        mu (float)
+        sigma (float)
    """
    return np.array([((xi - mu) / sigma) for xi in X])
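
A usage sketch for the relaxed signature; how `mu` and `sigma` are computed elsewhere in the repo is an assumption here:

```python
# Hypothetical usage of standardize().
import numpy as np

traces = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
mu, sigma = traces.mean(), traces.std()  # assumed: global statistics over all samples
standardized = standardize(traces, mu, sigma)
```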

@@ -492,4 +501,18 @@ def mean_abs_dev_outlier(array, cutoff = 3.5):
    """
    med = np.median(array)
    modified_std = np.median(np.abs(array - med))
-    return modified_std, med+cutoff*modified_std
\ No newline at end of file
+    return modified_std, med+cutoff*modified_std
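
The returned pair is the MAD-based spread and an upper cutoff; a hypothetical check:

```python
# Hypothetical outlier check with mean_abs_dev_outlier().
import numpy as np

arr = np.array([1.0, 1.1, 0.9, 1.05, 8.0])   # 8.0 is an obvious outlier
mad, upper = mean_abs_dev_outlier(arr, cutoff=3.5)
outliers = arr[arr > upper]                  # -> array([8.])
```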


def hierachical_linkage(points):
    """
    Converts distances between points to a condensed hierarchical distance
    array for the scipy dendrogram.
    Args:
        points (np.ndarray)
    """
    square_distance_mat = np.round(euclidean_distances(points), 2)
    condensed_dist = distance.squareform(square_distance_mat)
    z = hierarchy.linkage(condensed_dist, method="single", metric="euclidean")
    return z
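
For context, a hypothetical use of the new function with scipy's dendrogram (the plotting code is an assumption):

```python
# Hypothetical dendrogram built from hierachical_linkage().
import matplotlib.pyplot as plt
import numpy as np
from scipy.cluster import hierarchy

points = np.random.default_rng(0).normal(size=(10, 2))  # 10 points in 2D
z = hierachical_linkage(points)
hierarchy.dendrogram(z)
plt.show()
```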
