-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpre-avazu.py
executable file
·53 lines (42 loc) · 1.61 KB
/
pre-avazu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python
import os, sys, hashlib
def hashstr(str, nr_bins):
    """Map a string to a bucket number derived from its MD5 digest.

    The text is encoded as UTF-8, hashed with MD5, and the hexadecimal digest
    is read as one large integer. That integer is reduced modulo
    ``nr_bins - 1`` and shifted up by one, so bucket 0 is never produced.

    Args:
        str: Text to hash. The name shadows the builtin; it is kept because
            it is part of the function's keyword interface.
        nr_bins: Bucket-count parameter. With an integer the result is an
            integer in ``1 .. nr_bins - 1``. With a float (for example
            ``1e+6``) the modulo is carried out in floating point and the
            result is a float.

    Returns:
        The bucket number, with the same numeric type the modulo produces.
    """
    encoded = str.encode('utf8')
    digest_as_int = int(hashlib.md5(encoded).hexdigest(), 16)
    bucket = digest_as_int % (nr_bins - 1)
    return bucket + 1
def process_tr(src_path, dst_path):
    """Convert a labelled comma-separated file to ``label idx:value`` lines.

    For every input line, column 0 is dropped, column 1 is written first as
    the label, and each later column ``i`` is written as ``<i-1>:<value>``.
    Columns 5 through 13 have their value replaced by
    ``int(hashstr(value, 1e+6))``; all other columns are copied verbatim.
    Every field, including the last, is followed by one space and each record
    ends with a newline. No header detection is performed: every line of the
    input is converted the same way.

    Args:
        src_path: Path of the comma-separated input file.
        dst_path: Path of the output file; it is created or truncated.
    """
    # Both handles are owned by the ``with`` statement, so the input file is
    # closed deterministically too. The source is opened first so a missing
    # input does not leave a freshly truncated output file behind.
    with open(src_path, 'r') as src, open(dst_path, 'w') as dst:
        for line in src:
            label = ""
            features = []
            for i, token in enumerate(line.rstrip().split(',')):
                if i == 0:
                    # Column 0 is not written to the output.
                    continue
                if i == 1:
                    label = token + " "
                elif 5 <= i <= 13:
                    # These columns are bucketed instead of copied as-is.
                    hashed = str(int(hashstr(token, 1e+6)))
                    features.append(str(i - 1) + ":" + hashed + " ")
                else:
                    features.append(str(i - 1) + ":" + token + " ")
            # Join once per record instead of growing a string with +=.
            dst.write(label + "".join(features) + "\n")
def process_te(src_path, dst_path):
    """Convert an unlabelled comma-separated file to ``0 idx:value`` lines.

    Every output record starts with the placeholder label ``0``. For every
    input line, column 0 is dropped and each later column ``i`` is written as
    ``<i>:<value>``. Columns 4 through 12 have their value replaced by
    ``int(hashstr(value, 1e+6))``; all other columns are copied verbatim.
    Every field, including the last, is followed by one space and each record
    ends with a newline. No header detection is performed: every line of the
    input is converted the same way.

    Args:
        src_path: Path of the comma-separated input file.
        dst_path: Path of the output file; it is created or truncated.
    """
    # Both handles are owned by the ``with`` statement, so the input file is
    # closed deterministically too. The source is opened first so a missing
    # input does not leave a freshly truncated output file behind.
    with open(src_path, 'r') as src, open(dst_path, 'w') as dst:
        for line in src:
            # Placeholder label written in front of every record.
            fields = ["0 "]
            for i, token in enumerate(line.rstrip().split(',')):
                if i == 0:
                    # Column 0 is not written to the output.
                    continue
                if 4 <= i <= 12:
                    # These columns are bucketed instead of copied as-is.
                    hashed = str(int(hashstr(token, 1e+6)))
                    fields.append(str(i) + ":" + hashed + " ")
                else:
                    fields.append(str(i) + ":" + token + " ")
            # Join once per record instead of growing a string with +=.
            dst.write("".join(fields) + "\n")
if __name__ == '__main__':
    # Command-line entry point: pre-avazu.py {tr|te} input output
    usage_string = 'usage: pre-avazu.py {tr|te} input output'
    # Map each documented mode to its converter so that an unknown mode is
    # rejected with the usage message instead of being silently run as "te".
    converters = {"tr": process_tr, "te": process_te}
    if len(sys.argv) != 4 or sys.argv[1] not in converters:
        print(usage_string)
        # sys.exit is always available; the bare exit() helper is installed
        # by the site module and may be absent (e.g. ``python -S``).
        sys.exit(1)
    dtype, src_path, dst_path = sys.argv[1:]
    converters[dtype](src_path, dst_path)