-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepare_data.py
115 lines (91 loc) · 3.15 KB
/
prepare_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import numpy as np
import pandas as pd
from tqdm import tqdm
import parmap
import re
# Let pandas print up to 100 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)
def _filter(args):
    """
    Parallel filter function: decide whether a single trace is kept.

    Note:
    t column: actual real time (seconds, milliseconds, etc.)
    f column: relative appearance time in video [26, 27, 28...]

    The function reads the module-level globals ``FIRST_FRAME``,
    ``LAST_FRAME``, ``MIN_LEN`` and ``REMOVE_EDGES``; they must be set
    before the function is called.

    :param args: ``(key, group)`` pair, as produced by iterating over a
        pandas groupby. The key is ignored. ``group`` must support
        ``len()`` and, when ``REMOVE_EDGES`` is truthy, ``group["f"].values``.
    :return: ``group`` unchanged if the trace passes every check,
        otherwise ``None``.
    """
    _, group = args
    n_rows = len(group)

    # Discard traces that are at least as long as the last frame index
    # or that do not exceed the minimum length.
    if n_rows >= LAST_FRAME or n_rows <= MIN_LEN:
        return None

    if REMOVE_EDGES:
        # Discard traces touching the first or last frame of the video.
        # NOTE(review): this looks only at the first and last row, so it
        # presumably expects rows ordered by "f" -- confirm with the caller.
        frames = group["f"].values
        if frames[0] == FIRST_FRAME or frames[-1] == LAST_FRAME:
            return None

    return group
def _traces_to_array(traces):
    """Pack a list of per-trace 2D arrays into one array np.savez can store."""
    if len({trace.shape for trace in traces}) <= 1:
        # All traces share one shape: a regular stacked array is possible.
        return np.asarray(traces)
    # Traces of different lengths cannot form a regular array, and recent
    # NumPy versions refuse to build a ragged one implicitly, so fill an
    # object array element by element.
    packed = np.empty(len(traces), dtype=object)
    for idx, trace in enumerate(traces):
        packed[idx] = trace
    return packed


def _process_to_arrays(df, path, filter):
    """
    Save as filtered arrays, to make downstream processing easier.

    Groups ``df`` by ("file", "particle"), runs ``_filter`` on every group
    in parallel, and writes the surviving rows to
    ``<path stem>_filt<filter>_var.h5`` (key "df") and the per-trace value
    arrays to ``<path stem>_filt<filter>_var.npz`` (entry "data").
    When traces differ in shape, "data" is an object array and must be
    read back with ``np.load(..., allow_pickle=True)``.

    :param df: DataFrame with "file" and "particle" columns and string
        column names; columns matching the "int_" pattern are exported.
    :param path: path of the input file; its extension is replaced to
        build the output file names.
    :param filter: value inserted into the output file names (the caller
        passes the minimum trace length).
    :return: None. Prints a message and returns early, writing nothing,
        if no trace survives filtering.
    """
    import os

    by = ["file", "particle"]

    # NOTE(review): "int_c*" means "int_" followed by zero or more "c", so
    # it selects every column containing "int_". Confirm whether "int_c.*"
    # was intended; the pattern is kept unchanged here.
    columns = [
        name
        for name in sorted(df.columns)
        if re.search(pattern="int_c*", string=name) is not None
    ]

    grouped_df = df.groupby(by)
    # Run the filter outside the try block so that a ValueError raised in a
    # worker is not mistaken for "nothing left to concatenate".
    filtered_groups = parmap.map(_filter, tqdm(grouped_df), pm_processes=16)
    try:
        # pd.concat raises ValueError when every entry is None.
        df_filtered = pd.concat(filtered_groups, sort=False)
    except ValueError:
        print(
            "Couldn't concatenate anything!\n"
            "Maybe all traces were removed in filtering?"
        )
        return

    arrays = [g[columns].values for _, g in tqdm(df_filtered.groupby(by))]

    # splitext drops the extension whatever its length (".h5", ".hdf5", ...).
    stem, _ = os.path.splitext(path)
    out_prefix = "{}_filt{}_var".format(stem, filter)

    df_filtered.to_hdf(out_prefix + ".h5", key="df")
    # allow_pickle is deliberately not passed: NumPy versions whose savez
    # lacks that parameter would store it as an extra array named
    # "allow_pickle".
    np.savez(out_prefix + ".npz", data=_traces_to_array(arrays))

    print("File used: {}".format(path))
    print("{} traces kept".format(len(arrays)))
    print("{} traces discarded".format(grouped_df.ngroups - len(arrays)))
    print("Example check: ", arrays[-1].shape)
def main(input, min_len):
    """
    Filter every input file and save the surviving traces.

    For each path the file is read with ``pd.read_hdf``, the first and
    last frame are taken from its "f" column, and the data is handed to
    ``_process_to_arrays``.

    The parallel filter function reads its settings from module-level
    globals, so this function publishes them: ``MIN_LEN`` is set to
    ``min_len``, ``FIRST_FRAME`` / ``LAST_FRAME`` are set per file, and
    ``REMOVE_EDGES`` defaults to ``False`` if it has not been defined.

    :param input: iterable of HDF file paths (a single string is treated
        as one path).
    :param min_len: minimum trace length; also used in output file names.
    :raises ValueError: if the largest value in a file's "f" column is 0.
    """
    # Need to globally declare for the parallel function to easily grab
    global FIRST_FRAME
    global LAST_FRAME
    global MIN_LEN

    # Keep the threshold used by the filter in sync with the value that
    # ends up in the output file names.
    MIN_LEN = min_len
    # Respect a value set by the caller, otherwise fall back to the default.
    globals().setdefault("REMOVE_EDGES", False)

    # A bare string would otherwise be iterated character by character.
    paths = (input,) if isinstance(input, str) else input

    for path in paths:
        df = pd.DataFrame(pd.read_hdf(path))

        FIRST_FRAME = df["f"].min()
        LAST_FRAME = df["f"].max()

        if LAST_FRAME == 0:
            raise ValueError(
                "Couldn't determine max frame, as last timepoint in 'f' column "
                "is zero. Fix the 'f' column before attempting to run this "
                "script"
            )

        _process_to_arrays(df=df, path=path, filter=min_len)
if __name__ == "__main__":
    # Filter settings; _filter reads both names as module-level globals.
    MIN_LEN = 5  # default 5
    REMOVE_EDGES = False  # default False

    # Files to process. Uncomment a path to include it in the run.
    INPUT_DF = (
        # "data/preprocessed/tracks-CLTA-TagRFP EGFP-Aux1-A7D2 EGFP-Gak-F6_filt5_var.h5",
        # "data/preprocessed/tracks-CLTA-TagRFP EGFP-Aux1-A7D2_filt5_var.h5",
        # "data/preprocessed/tracks-CLTA-TagRFP EGFP-Gak-A8_filt5_var.h5",
        "data/preprocessed/test-CS2_CAV-GFP_VLP-CF640_filt5_var.h5",
    )

    main(INPUT_DF, MIN_LEN)