-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsegmented2phrase_as_word.py
67 lines (50 loc) · 1.82 KB
/
segmented2phrase_as_word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import sys
import logging
import re
import random
import re
# Log to a file named after this script with ".log" appended.
logging.basicConfig(filename=__file__+'.log', level=logging.DEBUG)

# Command-line arguments:
#   argv[1]  input path
#   argv[2]  optional output path; defaults to "<input>_phraseAsWord"
# Both names default to None so that importing this module, or running it
# without arguments, never trips over an undefined `outfile`.
file = None
outfile = None
if len(sys.argv) > 1:
    file = destinedir = sys.argv[1]
    outfile = file + '_phraseAsWord'
    # '../data/data_oneFilePerLine/jmlr_vldb/segmented_text.txt_phraseAsWord_alnumRetained_removedJournalHeader_strong_concepts'
if len(sys.argv) > 2:
    # An explicit output path overrides the derived default.
    outfile = sys.argv[2]
# Matches one <phrase>...</phrase> span. The named group captures the inner
# text, which may not contain '<' (so spans never nest or overlap).
square_brackets_enclosed = re.compile(
    r"<phrase>(?P<phrase>[^<]*)</phrase>"
)
# A single whitespace character. Raw string: '\s' in a normal string literal
# is an invalid escape sequence and warns on modern Python.
_phrase_whitespace_regex = re.compile(r"\s")


def _underscore_phrase(match):
    """Rebuild a matched phrase span with every whitespace char turned into '_'."""
    joined = _phrase_whitespace_regex.sub('_', match.group('phrase'))
    return "<phrase>%s</phrase>" % joined


def brackets2UnderScoreNotation(l):
    """Join the words inside each ``<phrase>...</phrase>`` span with underscores.

    Every whitespace character inside a span is replaced by one underscore
    (two spaces become two underscores). The surrounding ``<phrase>`` tags are
    kept, and text outside the spans is returned unchanged.

    Args:
        l: The line of text to transform.

    Returns:
        The transformed line.
    """
    return square_brackets_enclosed.sub(_underscore_phrase, l)
# Separator character that joins the words of a phrase.
underScore = '_'
# Runs of five or more separators; shorter runs are deliberately not matched.
consecutive_underScore_regex = re.compile('%s{5,}' % underScore)


def condenseConsecutiveunderScoreToOne(l):
    """Collapse each run of five or more underscores in *l* into a single one.

    Runs of one to four underscores are returned untouched.
    """
    condensed, _replacements = consecutive_underScore_regex.subn(underScore, l)
    return condensed
def singleFileClean(file, file_output):
    """Rewrite *file* into *file_output* with phrase words joined by underscores.

    Each input line is passed through ``brackets2UnderScoreNotation`` and then
    ``condenseConsecutiveunderScoreToOne``. The result is stripped of leading
    and trailing whitespace and written as one line of the output file.

    Args:
        file: Path of the text file to read.
        file_output: Path of the file to write; it is created or truncated.

    A line that cannot be written is logged (with its 0-based line number and
    traceback) and skipped, so a single bad line does not abort the run.
    """
    # Open the input first so a missing or unreadable input does not truncate
    # an existing output file.
    with open(file) as f_in, open(file_output, "w") as f_out:
        for lineno, line in enumerate(f_in):
            line = brackets2UnderScoreNotation(line)
            line = condenseConsecutiveunderScoreToOne(line)
            try:
                f_out.write(line.strip() + '\n')
            except Exception:
                # Best-effort: record the failure and carry on with the next
                # line instead of dropping into a debugger.
                logging.exception(
                    "could not write line %d of %s", lineno, file)
if __name__ == '__main__':
    # `file` is still None when no input path was given on the command line;
    # fail with a usage message rather than an obscure error further down.
    if file is None:
        sys.exit('usage: %s INPUT_FILE [OUTPUT_FILE]' % sys.argv[0])
    singleFileClean(file, outfile)