forked from keqianli/conceptRelevance
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathretain_alphanumeric.py
40 lines (29 loc) · 1.02 KB
/
retain_alphanumeric.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import os
import re
import logging
import sys
# Input path: first command-line argument, or the bundled default corpus.
file = (
    sys.argv[1]
    if len(sys.argv) > 1
    else '../data/data_oneFilePerLine/jmlr_vldb/texts.txt'
)
# Output path: second command-line argument, or the input path plus a suffix.
file_out = sys.argv[2] if len(sys.argv) > 2 else file + '_alphanumeric'
def processSingleLine(string, outputQueue):
    """Normalise one line of text to a lowercase, whitelisted character set.

    Every character other than ASCII letters, digits, ``_``, ``-``, ``<``,
    ``>`` and ``/`` is replaced by a space, runs of two or more whitespace
    characters are collapsed to a single space, and the result is lowercased.
    Leading/trailing spaces are not stripped here.

    ``outputQueue`` is accepted but not used by this function.
    """
    # Blank out everything outside the whitelist.
    whitelisted = re.sub(r"[^A-Za-z0-9_\-<>/]", " ", string)
    # Squeeze the runs of blanks the previous step may have produced.
    squeezed = re.sub(r"\s{2,}", " ", whitelisted)
    return squeezed.lower()
def processByLineSameOutput(inputFile, processSingleLine, outputFile=None):
    """Apply a per-line transform to a text file and write the results.

    Each input line is stripped, passed to ``processSingleLine(line, None)``,
    and the stringified, stripped result is written followed by a newline.

    Args:
        inputFile: path of the text file to read.
        processSingleLine: callable taking ``(line, outputQueue)``; it is
            always invoked with ``None`` as the second argument.
        outputFile: path to write; defaults to ``inputFile + '_processed'``.

    A line whose processing or writing raises is logged and skipped, so the
    output can contain fewer lines than the input.
    """
    if not outputFile:
        outputFile = inputFile + '_processed'
    with open(outputFile, 'w') as f_out:
        with open(inputFile) as f:
            for lineno, line in enumerate(f, 1):
                try:
                    result = processSingleLine(line.strip(), None)
                    f_out.write(str(result).strip() + '\n')
                except Exception as e:
                    # Best effort: keep going, but log at WARNING so the
                    # dropped line is visible under default logging config.
                    logging.warning(
                        'skipping line %d of %s: %s', lineno, inputFile, e)
# Script entry point: clean the configured input file into the output file.
if __name__ == "__main__":
    processByLineSameOutput(file, processSingleLine, outputFile=file_out)