-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexperimentsData.py~
96 lines (76 loc) · 3.68 KB
/
experimentsData.py~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pandas as pd
def gender_headset(df, name, n_train, n_test):
    """Build the gender experiment restricted to headset recordings.

    Only rows whose 'Microphone' is 'headset' or 'logitech_headset' are used.
    The training sample is drawn from female speakers, the first test sample
    from male speakers and the second test sample from the female speakers
    that were not already picked.

    - Input: the log dataframe (needs 'Gender', 'Microphone' and 'Name'
      columns), the experiment name, and the train / test sample sizes.
    - Output: nothing is returned; the sampled names are written by
      save_names under the given experiment name.
    """
    uses_headset = df['Microphone'].isin(['headset', 'logitech_headset'])
    females = df.loc[(df['Gender'] == 'f') & uses_headset]
    # The male pool used to be filtered on 'f' as well (copy-paste slip), which
    # made both test samples female.
    # NOTE(review): assumes male speakers are coded 'm' in the log - confirm.
    males = df.loc[(df['Gender'] == 'm') & uses_headset]
    save_names(make_samples(females, n_train, males, females, n_test),
               name)
def gender(df, name, n_train, n_test):
    """Build the gender experiment over all microphones.

    The training sample is drawn from female speakers, the first test sample
    from male speakers and the second test sample from the female speakers
    that were not already picked.

    - Input: the log dataframe (needs 'Gender' and 'Name' columns), the
      experiment name, and the train / test sample sizes.
    - Output: nothing is returned; the sampled names are written by
      save_names under the given experiment name.
    """
    females = df.loc[df['Gender'] == 'f']
    # The male pool used to be filtered on 'f' as well (copy-paste slip), which
    # made both test samples female.
    # NOTE(review): assumes male speakers are coded 'm' in the log - confirm.
    males = df.loc[df['Gender'] == 'm']
    save_names(make_samples(females, n_train, males, females, n_test),
               name)
def microphone_simple(df, name, n_train, n_test):
    """Build the microphone experiment: headset vs. computer vs. phone.

    Training names come from headset recordings, the first test sample from
    'iMac' recordings and the second test sample from phone recordings.
    Nothing is returned; save_names writes the three name lists.
    """
    microphone = df['Microphone']
    phone_pattern = "Samsung|iphone|iPhone|oneplus|samsung|HUAWEI"

    headset_rows = df.loc[microphone.isin(['headset', 'logitech_headset'])]
    computer_rows = df.loc[microphone == 'iMac']
    # Comparing with True maps the NaN that str.contains gives for missing
    # values to False, so the result is a usable boolean mask.
    phone_rows = df.loc[microphone.str.contains(phone_pattern) == True]

    samples = make_samples(headset_rows, n_train, computer_rows, phone_rows, n_test)
    save_names(samples, name)
def microphone_double(df, name, n_train, n_test):
    """Build the microphone experiment: computer + headset vs. phone.

    Training names and the first test sample both come from the combined
    'iMac' / headset recordings; the second test sample comes from phone
    recordings. Nothing is returned; save_names writes the three name lists.
    """
    microphone = df['Microphone']
    # One membership test covers the computer and both headset labels.
    desk_rows = df.loc[microphone.isin(['iMac', 'headset', 'logitech_headset'])]
    # Comparing with True maps the NaN that str.contains gives for missing
    # values to False, so the result is a usable boolean mask.
    is_phone = microphone.str.contains("Samsung|iphone|iPhone|oneplus|samsung|HUAWEI") == True
    phone_rows = df.loc[is_phone]

    samples = make_samples(desk_rows, n_train, desk_rows, phone_rows, n_test)
    save_names(samples, name)
def accent_n_to_nn(df, name, n_train, n_test):
    """Build the accent experiment trained on native speakers.

    Rows whose 'Accent' contains "NA" are treated as native and rows whose
    'Accent' contains "NN" as non-native. Training names come from the native
    rows, the first test sample from the non-native rows and the second test
    sample from the remaining native rows. Nothing is returned; save_names
    writes the three name lists.
    """
    accent = df['Accent']
    # Comparing with True maps the NaN that str.contains gives for missing
    # values to False, so the result is a usable boolean mask.
    native_rows = df.loc[accent.str.contains("NA") == True]
    non_native_rows = df.loc[accent.str.contains("NN") == True]

    samples = make_samples(native_rows, n_train, non_native_rows, native_rows, n_test)
    save_names(samples, name)
def accent_nn_to_n(df, name, n_train, n_test):
    """Build the accent experiment trained on non-native speakers.

    Rows whose 'Accent' contains "NA" are treated as native and rows whose
    'Accent' contains "NN" as non-native. Training names and the first test
    sample both come from the non-native rows; the second test sample comes
    from the native rows. Nothing is returned; save_names writes the three
    name lists.
    """
    accent = df['Accent']
    # Comparing with True maps the NaN that str.contains gives for missing
    # values to False, so the result is a usable boolean mask.
    native_rows = df.loc[accent.str.contains("NA") == True]
    non_native_rows = df.loc[accent.str.contains("NN") == True]

    samples = make_samples(non_native_rows, n_train, non_native_rows, native_rows, n_test)
    save_names(samples, name)
def general(df, name, n_train, n_test):
    """Build an experiment without any filtering.

    The training sample and both test samples are all drawn from the full
    dataframe. Nothing is returned; save_names writes the three name lists.
    """
    samples = make_samples(df, n_train, df, df, n_test)
    save_names(samples, name)
def make_samples(df_train, n_train, df_test, df_test_2, n_test):
    """Draw one training sample and two test samples that share no rows.

    - Input: the pool for training and the number of rows to draw from it,
      the pools for the first and second test sample, and the number of rows
      to draw for each test sample.
    - Output: a list [train sample, test sample, second test sample].
    """
    train_sample = df_train.sample(n=n_train)

    # The first test pool must not contain rows already used for training.
    first_pool = remove_sample(df_test, train_sample)
    test_sample = first_pool.sample(n=n_test)

    # The second test pool excludes both earlier samples, test rows first.
    second_pool = df_test_2
    for already_taken in (test_sample, train_sample):
        second_pool = remove_sample(second_pool, already_taken)

    return [train_sample, test_sample, second_pool.sample(n=n_test)]
def remove_sample(df, sample):
    """ Return the rows of df that do not also occur in sample.

    Both frames are outer-merged on their shared columns; the merge indicator
    then tells which rows were found in df only.
    """
    merged = df.merge(sample, how='outer', indicator=True)
    only_in_df = merged[merged['_merge'] == 'left_only']
    # The helper column is dropped so the result has the same columns as df.
    return only_in_df.drop(columns='_merge')
def save_names(dfs, experiment):
    """ Saving the names of the training and testing dataframes.

    - Input: a sequence [Train DF, Test DF, Test-2 DF] and the experiment
      name used as the file-name prefix.
    - Output: nothing is returned; the 'Name' column of each dataframe is
      written, one name per line without header or index, to
      experiment-data/<experiment>-train.txt, -test.txt and -test-2.txt.
    """
    import os  # local import: only this function needs it

    out_dir = 'experiment-data'
    # to_csv does not create missing directories, so make sure the target
    # exists instead of failing after the samples were already drawn.
    os.makedirs(out_dir, exist_ok=True)

    suffixes = ['train', 'test', 'test-2']
    for n, suffix in enumerate(suffixes):
        path = os.path.join(out_dir, experiment + '-' + suffix + '.txt')
        dfs[n]['Name'].to_csv(path, header=False, index=False)
def sanity_check(df, good_users_file):
    """Drop the rows of df whose 'Name' is listed in the given file.

    - Input: the log dataframe (needs a 'Name' column) and the path of a text
      file holding one user name per line.
    - Output: a dataframe with every row whose 'Name' appears in that file
      removed. Note that, despite the parameter name, the listed users are
      the ones that get excluded.
    """
    # The context manager closes the file even if reading fails.
    with open(good_users_file, 'r') as users_file:
        # Strip only the line terminator: slicing off the last character
        # would truncate the final name when the file has no trailing newline.
        users = [line.rstrip('\n') for line in users_file]
    return df[~df['Name'].isin(users)]
def main():
    """Generate the name lists for every experiment.

    Reads 'log.csv', drops the users listed in 'sanity_users.txt' and then
    runs each experiment builder in turn; every experiment uses a test sample
    size of 10.
    """
    log = sanity_check(pd.read_csv('log.csv'), 'sanity_users.txt')

    # (builder, experiment name, training sample size), in execution order.
    experiments = (
        (gender_headset, '1-gender-headset', 20),
        (gender, '2-gender', 20),
        (microphone_simple, '3-microphone-simple', 20),
        (microphone_double, '4-microphone-double', 20),
        (accent_n_to_nn, '5-accent-native-nn', 20),
        (accent_nn_to_n, '6-accent-nn-native', 20),
        (general, '7-hmm-50', 50),
        (general, '8-hmm-20', 20),
    )
    for build, label, n_train in experiments:
        build(log, label, n_train, 10)


# Only generate the experiment files when run as a script.
if __name__ == "__main__":
    main()