-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_cme_tracks_guiseppe.py
96 lines (70 loc) · 2.51 KB
/
get_cme_tracks_guiseppe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import re
from glob import glob
import h5py
import numpy as np
import pandas as pd
import parmap
def index(m, track_id, column):
    """
    Resolve one per-track entry of a matlab h5 struct into a flat array.

    Looking up ``m[column][track_id]`` in the matlab h5 file only yields a
    reference, not the data. The first element of that entry is therefore
    used as a key into ``m`` again to fetch the values it points to, which
    are returned flattened to one dimension.
    """
    reference = m[column][track_id][0]
    referenced = m[reference]
    # Slice everything first so the values are read out, then flatten.
    values = referenced[:]
    return np.ravel(values)
def cme_tracks_to_pandas(mat_path, project_name):
    """
    Converts CME-derived ProcessedTracks.mat to Pandas DataFrame format.

    This version was specifically created for matlab 7.3 files, which are
    opened here through h5py. Add extra columns as required.

    Parameters
    ----------
    mat_path : str
        Path to the ProcessedTracks.mat file.
    project_name : str
        Pattern passed to ``re.search`` (typically just the project directory
        name) that is looked up in ``mat_path``. Everything in front of the
        first match is dropped from the path stored in the "file" column.

    Returns
    -------
    pandas.DataFrame
        One row per track point with the columns "file", "particle",
        "int_c0", "f", "t", "x", "y" and "z". Missing values are
        forward-filled within each track. The per-track frames are
        concatenated as they are, so the index restarts at 0 for every track.
        If the file holds no tracks, an empty frame with these columns is
        returned.

    Raises
    ------
    ValueError
        If ``project_name`` is not found in ``mat_path``.
    """
    COLUMNS = "A", "f", "t", "x", "y", "z"
    OUTPUT_COLUMNS = ["file", "particle", "int_c0", "f", "t", "x", "y", "z"]

    # The stored path only depends on the arguments, so work it out once
    # instead of once per track, and fail early with a readable message.
    real_dir = re.search(pattern=project_name, string=mat_path)
    if real_dir is None:
        raise ValueError(
            "Project name {!r} not found in path {!r}".format(
                project_name, mat_path
            )
        )
    # Create path from actual directory (parent dirs are skipped)
    filepath = mat_path[real_dir.start() :]

    groups = []
    # The context manager makes sure the file handle is released, also when
    # reading one of the tracks fails.
    with h5py.File(mat_path, "r") as h5file:
        m = h5file["tracks"]
        n_tracks = len(m["A"])

        for i in range(n_tracks):
            A, f, t, x, y, z = [
                index(m=m, track_id=i, column=c) for c in COLUMNS
            ]
            track_len = len(A)

            group = pd.DataFrame(
                {
                    "file": np.repeat(filepath, track_len),
                    "particle": np.repeat(i, track_len),
                    "int_c0": A,
                    "f": f,
                    "t": t,
                    "x": x,
                    "y": y,
                    "z": z,
                }
            )
            # ffill() replaces the deprecated fillna(method="ffill").
            groups.append(group.ffill())

    # pd.concat refuses an empty list, so a file without tracks yields an
    # empty frame with the expected columns instead of an exception.
    if not groups:
        return pd.DataFrame(columns=OUTPUT_COLUMNS)
    return pd.concat(groups)
def main(names, input, output):
    """
    Collect the tracks of every project into one HDF5 file per project.

    For each project name, the name is substituted into the ``input``
    template, the resulting glob pattern is searched recursively, every
    matching file is converted with ``cme_tracks_to_pandas`` (mapped over the
    files with ``parmap.map``) and the concatenated result is written under
    the key "df" to the path obtained by substituting the name into
    ``output``.

    Parameters
    ----------
    names : iterable of str
        Project names to process.
    input : str
        Glob template with one "{}" placeholder for the project name.
    output : str
        Output path template with one "{}" placeholder for the project name.

    Raises
    ------
    ValueError
        If no file matches the search pattern of a project.
    """
    for name in names:
        search_pattern = input.format(name)
        output_path = output.format(name)

        files = sorted(glob(search_pattern, recursive=True))

        print("\nFound files:")
        for path in files:
            print(path)
        print()

        # Without this check pd.concat fails with the generic message
        # "No objects to concatenate", which hides the actual cause.
        if not files:
            raise ValueError(
                "No files found matching pattern {!r}".format(search_pattern)
            )

        df = pd.concat(
            parmap.map(cme_tracks_to_pandas, files, project_name=name)
        )
        print("Number of files in df: {}".format(len(df["file"].unique())))

        # All traces
        df.to_hdf(output_path, key="df")

        # Nothing to show as an example if the combined frame has no rows.
        if len(df) > 0:
            print("Each trace will be tagged with 'file' like:")
            print(df["file"].values[0])
if __name__ == "__main__":
    # Output file template; the curly bracket is filled with the project name
    OUTPUT_NAME = "data/preprocessed/test-{}.h5"

    # Glob template used to look for tracks. The curly bracket is filled with
    # the project name and ** stands for any number of nested subdirectories
    SEARCH_PATTERN = "../../../Data/{}/**/ProcessedTracks.mat"

    # Every name listed here is substituted into both templates above
    PROJECT_NAMES = ("Test",)

    main(input=SEARCH_PATTERN, output=OUTPUT_NAME, names=PROJECT_NAMES)