clean_dataset.py
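
"""Clean every file in a dataset directory in parallel.

Walks the dataset tree, applies utils_tesis.dataset_creator.clean_file to
each file with shared settings, and reports progress with tqdm.
"""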
from utils_tesis.dataset_creator import clean_file
from utils.relay_list import signals
from tqdm import tqdm
import concurrent.futures
import os

# Keep the time column alongside the relay signal columns.
signals.append("time")


def repeated_clean(file):
    """Clean a single capture file with the shared settings."""
    clean_file(
        file,
        downsampling=8,
        # keep_types=["time", "I"],
        keep_columns=signals,
        rmv_cycles_start=2,
        # rmv_cycles_end=2,
        frequency=60,
    )


def safe_repeated_clean(file_path):
    """Run repeated_clean, logging failures instead of aborting the batch."""
    try:
        repeated_clean(file_path)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")


# Requires tqdm and pyarrow; update utils-tesis to the latest version.
if __name__ == "__main__":
    # Leave one core free; cpu_count() can return None, so guard against that.
    cores = max(1, (os.cpu_count() or 2) - 1)
    dataset_dir = "D:/PaperLSTM/database/DB1_nueva_full/DB1"
    # dataset_dir = "D:/PaperLSTM/database/DB1_nueva"
    file_set = set()
    for dir_, _, files in os.walk(dataset_dir):
        for file_name in files:
            file_set.add(os.path.join(dir_, file_name))
    csv_list = list(file_set)
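    # Note: despite its name, csv_list holds every file found under
    # dataset_dir; filter by extension here if other file types are present.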
    with concurrent.futures.ProcessPoolExecutor(max_workers=cores) as executor:
        # executor.map is lazy; draining it through list() inside tqdm
        # drives the work and renders the progress bar.
        list(
            tqdm(
                executor.map(safe_repeated_clean, csv_list),
                total=len(csv_list),
                desc="Processing Files",
            )
        )
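
# Tuning note (an assumption, not part of the original script): executor.map
# sends one file per task by default; for many small files, a larger chunksize
# can cut inter-process overhead, e.g.
#     executor.map(safe_repeated_clean, csv_list, chunksize=8)
# where chunksize=8 is an illustrative value, not one tuned for this dataset.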