-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdata_pre_processing.py
40 lines (31 loc) · 1.24 KB
/
data_pre_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from sklearn.model_selection import train_test_split
def clean_data(X):
"""
This function is used to clean the data in the dataset
:param X: the dataframe containing the features of the datapoints
:return: the dataframe with the clean, processed values of the features
"""
X_cleaned = []
for i in X:
X_cleaned.append([i[0], i[1], i[2].timestamp(), i[3], i[4].timestamp(), i[5]])
return X_cleaned
def clean_labels(Y):
"""
This function is used to clean the labels in the dataset
:param Y: the array containing the labels of the datapoints
:return: the array of cleaned labels of the datapoints
"""
Y_cleaned = []
for i in range(Y.shape[0]):
Y_cleaned.append([Y.iloc[i][0].total_seconds()])
return Y_cleaned
def split_train_test(X, Y, ratio=0.3):
"""
This function is used to split the dataset into training and testing dataset
:param X: the feature set of the datapoints for regression prediction
:param Y: the labels of the datapoints
:param ratio: the test dataset ratio
:return: the training and testing datasets
"""
# Splitting the dataset into the Training set and Test set
return train_test_split(X, Y, test_size = ratio, random_state = 0)