buildIndex.py

import logging
import random
import re
import sys

import gensim
import numpy as np
from gensim import corpora, models
from gensim.models import word2vec
from nltk.corpus import stopwords

# Input corpus: one document per line, with mined phrases wrapped in
# <phrase>...</phrase> tags. All three paths can be overridden positionally
# on the command line.
file = '../data/data_oneFilePerLineBySection/nips/segmented_text.txt_phraseAsWord'
if len(sys.argv) > 1:
    file = sys.argv[1]
file_wordvec = file + '.wordvec'
if len(sys.argv) > 2:
    file_wordvec = sys.argv[2]
file_tfidf = file + '.tfidf'
if len(sys.argv) > 3:
    file_tfidf = sys.argv[3]

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger('log')
logger.addHandler(logging.FileHandler(__file__ + '.log'))
logger.addHandler(logging.StreamHandler())
logger.propagate = False  # avoid double console output via the root handler from basicConfig
logger.setLevel(logging.DEBUG)
logger.debug('==================================')
logger.debug('for file %s', file)

short_word = re.compile(r"^\w{,1}$")  # tokens of at most one word character
doesnt_contain_vowel = re.compile(r"^[^aeiou]*$")  # defined but currently unused
stopwordsSet = set(stopwords.words('english'))  # requires nltk's 'stopwords' data; currently unused


def notMeaningfulWord(word):
    # A token is considered not meaningful if it is at most one character long.
    return short_word.match(word)


# Despite the name, this matches tokens wrapped in <phrase>...</phrase> tags,
# as produced by the upstream phrase-segmentation step.
square_brackets_enclosed = re.compile(r"<phrase>(?P<phrase>[^<]*)</phrase>")


def trim_rule(word, count, min_count):
    # Vocabulary pruning rule for Word2Vec: always keep mined phrases,
    # always discard one-character tokens, and defer to gensim's default
    # min_count handling otherwise.
    if square_brackets_enclosed.match(word):
        return gensim.utils.RULE_KEEP
    if notMeaningfulWord(word):
        return gensim.utils.RULE_DISCARD
    return gensim.utils.RULE_DEFAULT


def displayString(w):
    # Strip <phrase> tags for human-readable output.
    return re.sub(r'</?phrase>', '', w)


valid_size = 2  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
dictionary = {}  # word -> index, filled lazily from the trained model's vocabulary


def compute_wordvec():
    # The single-element lists are leftover grid-search scaffolding; widen them
    # (e.g. size in [50, 128, 200], sg in [0, 1], max_vocab_size in [60000, None])
    # to sweep hyperparameters.
    for size in [200]:
        for sg in [1]:  # 1 = skip-gram, 0 = CBOW
            for max_vocab_size in [None]:
                model_concepts_file = file_wordvec
                try:
                    # Reuse a previously trained model if one was saved.
                    model = word2vec.Word2Vec.load(model_concepts_file)
                except Exception:
                    logger.debug('no cached model at %s, training a new one', model_concepts_file)
                    with open(file) as f:
                        model = word2vec.Word2Vec(word2vec.LineSentence(f), size=size, workers=120,
                                                  max_vocab_size=max_vocab_size, trim_rule=trim_rule, sg=sg)
                    model.save(model_concepts_file)
                print(model.wv.index2word[:100])
                validate_word2vec(model)
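

# Hedged sketch, not called anywhere in this script: gensim >= 4.0 renamed
# Word2Vec's `size` parameter to `vector_size` and `wv.index2word` to
# `wv.index_to_key`, so under the newer API the training step above would
# read roughly like this.
def compute_wordvec_gensim4_sketch(size=200, sg=1, max_vocab_size=None):
    with open(file) as f:
        model = word2vec.Word2Vec(word2vec.LineSentence(f), vector_size=size, workers=120,
                                  max_vocab_size=max_vocab_size, trim_rule=trim_rule, sg=sg)
    print(model.wv.index_to_key[:100])
    return model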


def validate_word2vec(model):
    # Spot-check the embeddings: print nearest neighbours for one frequent
    # word and one mined phrase.
    if dictionary == {}:
        for word in model.wv.index2word:
            dictionary[word] = len(dictionary)
        dictionary['UNK'] = len(dictionary)
    valid_examples_frequent = random.sample(range(min(valid_window, len(dictionary))), valid_size // 2)
    valid_examples_phrase = random.sample([index for word, index in dictionary.items() if '_' in word],
                                          valid_size // 2)
    try:
        # Prefer fixed, interpretable probe words when they are in the vocabulary.
        valid_examples_frequent[0] = dictionary['analysis']
        valid_examples_phrase[0] = dictionary['machine_learning']
    except KeyError:
        pass
    valid_examples = np.array(valid_examples_frequent + valid_examples_phrase)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 10  # number of nearest neighbours
        print('Nearest to %s: %s' % (displayString(valid_word), ', '.join(
            displayString(word) for word, score in model.wv.most_similar(positive=[valid_word], topn=top_k))))


def compute_tfidf():
    def readIntoListsOfWords(path):
        with open(path) as f:
            return [document.lower().split() for document in f]

    wordsLists = readIntoListsOfWords(file)
    dictionary = corpora.Dictionary(wordsLists)  # local; shadows the module-level dictionary
    corpus = [dictionary.doc2bow(text) for text in wordsLists]
    modelTfidf = models.TfidfModel(corpus)
    corpora.MmCorpus.serialize(file_tfidf + '.corpus', corpus)  # store to disk, for later use
    dictionary.save(file_tfidf + '.dict')  # store the dictionary, for future reference
    modelTfidf.save(file_tfidf + '.modelTfidf')
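

# Hedged sketch of reading the serialised tf-idf artifacts back, using only
# gensim classes already imported above; defined for illustration and never
# called by this script.
def loadTfidfSketch():
    loadedCorpus = corpora.MmCorpus(file_tfidf + '.corpus')
    loadedDictionary = corpora.Dictionary.load(file_tfidf + '.dict')
    loadedModel = models.TfidfModel.load(file_tfidf + '.modelTfidf')
    # Re-weight the first document and map term ids back to tokens.
    return [(loadedDictionary[termId], weight) for termId, weight in loadedModel[loadedCorpus[0]]]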


def main():
    # Only the word-vector step runs by default; add a compute_tfidf() call
    # here if the tf-idf artifacts are needed as well.
    compute_wordvec()


if __name__ == '__main__':
    main()
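

# Command-line usage implied by the sys.argv handling at the top
# (every argument is optional and falls back to the defaults above):
#   python buildIndex.py [corpus_file] [wordvec_output] [tfidf_output]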