# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ I M P O R T S ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
import math
import sys
import re
# from inverted_index_gcp import MultiFileReader, InvertedIndex # change this line
import json
import string
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby, chain
import pandas as pd
import os
from operator import itemgetter
from time import time
from pathlib import Path
import pickle
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
from google.cloud import storage
nltk.download('stopwords')
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ T A B L E O F C O N T E N T S ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~ 1. Globals and Constants ~~~~
#~~~~ 2. Create Inverted Index ~~~~
#~~~~ 2.1 GCP ~~~~
#~~~~ 2.1.1 inverted_body_with_stemming ~~~~
#~~~~ 2.1.2 inverted_title_with_stemming ~~~~
#~~~~ 2.1.3 DL ~~~~
#~~~~ 2.1.4 NF ~~~~
#~~~~ 3. Helper Functions ~~~~
#~~~~ 3.1 tokenizationQuery ~~~~
#~~~~ 3.2 returnTopNdocWithTitles ~~~~
#~~~~ 3.3 normalizeQuery ~~~~
#~~~~ 4. Ranking Methods ~~~~
#~~~~ 4.1 Boolean ~~~~
#~~~~ 4.1.1 booleanRanking ~~~~
#~~~~ 4.2 BM25 ~~~~
#~~~~ 4.2.1 calc_idf ~~~~
#~~~~ 4.2.2 BM25_and_binary_search ~~~~
#~~~~ 4.3 Cosine Similarity ~~~~
#~~~~ 4.3.1 search_body_implement ~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ G L O B A L S ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# create stopwords set
english_stopwords = frozenset(stopwords.words('english'))
corpus_stopwords = ["category", "references", "also", "external", "links",
"may", "first", "see", "history", "people", "one", "two",
"part", "thumb", "including", "second", "following",
"many", "however", "would", "became"]
all_stopwords = english_stopwords.union(corpus_stopwords)
RE_WORD = re.compile(r"""[\#\@\w](['\-]?\w){2,24}""", re.UNICODE)
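# A quick illustration of what RE_WORD matches: tokens of 3-25 characters that
# may start with '#' or '@' and may contain internal hyphens or apostrophes:
#   [t.group() for t in RE_WORD.finditer("data-driven ai search")]
#   -> ['data-driven', 'search']   # 'ai' is dropped: shorter than 3 characters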
########################################################################################################################
##################################### C R E A T E I N V E R T E D I N D E X ######################################
########################################################################################################################
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ G C P ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
os.environ["GCLOUD_PROJECT"] = "ir-ass3-84763"
bucket_name = 'project_ir_os' # project_ir_os
client = storage.Client()
bucket = client.bucket(bucket_name)
# This will be usefully for the calculation of AVGDL (utilized in BM25)
# DL from bucket
dl_src = "DL.pkl"
blob_DL = bucket.blob(f"DL/{dl_src}")
pickel_in = blob_DL.download_as_string()
DL = pickle.loads(pickel_in)
# This will be usefully for the calculation
# NF from bucket
nf_src = "nf.pkl"
blob_nf = bucket.blob(f"NF/{nf_src}")
pickel_in = blob_nf.download_as_string()
NF = pickle.loads(pickel_in)
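# Illustrative sanity check after loading (AVGDL is recomputed inside the BM25 code below):
#   AVGDL = sum(DL.values()) / len(DL)   # average document length over the corpus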
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ H E L P E R F U N C T I O N S ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def tokenizationQuery(query, doStemming=False):
    """
    Tokenize the query, remove stopwords, and return a list of tokens.
    If doStemming is True, apply Porter stemming to each token.
    """
    tokens = [token.group() for token in RE_WORD.finditer(query.lower())]
    tokens_without_stopwords = [tok for tok in tokens if tok not in all_stopwords]  # query without stop words
    if doStemming:
        stemmer = PorterStemmer()
        tokens_without_stopwords = [stemmer.stem(tok) for tok in tokens_without_stopwords]
    return tokens_without_stopwords
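# Illustrative examples (exact stems depend on the NLTK PorterStemmer version):
#   tokenizationQuery("The history of Python programming")
#   -> ['python', 'programming']   # 'history' is a corpus stopword; 'the'/'of' drop out
#   tokenizationQuery("The history of Python programming", doStemming=True)
#   -> ['python', 'program']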
def returnTopNdocWithTitles(inverted_title, searchDictVal, N=None):
    """
    Sort documents by similarity score (descending) and return the top N
    together with their titles.
    Parameters:
    -----------
    inverted_title: an index exposing TitlesOfDocs, a mapping doc_id -> title.
    searchDictVal: a dictionary of similarity scores as follows:
        key: document id (e.g., doc_id)
        value: similarity score (e.g., round(score, 5))
    N: Integer (how many documents to retrieve). By default, N = None,
       which returns all documents.
    Returns:
    -----------
    a ranked list of up to N pairs (doc_id, title).
    """
    res = []
    ranked = sorted(searchDictVal.items(), key=lambda x: x[1], reverse=True)  # sort by score
    if N is not None:
        ranked = ranked[:N]
    for doc_id, _score in ranked:
        try:
            res.append((doc_id, inverted_title.TitlesOfDocs[doc_id]))
        except KeyError:  # skip documents with no stored title
            pass
    return res
def normalizeQuery(word_counter):
    """
    Compute the query normalization factor 1/|q|, where |q| is the Euclidean
    norm of the query's term-frequency vector.
    """
    sumNorm = sum(tf ** 2 for tf in word_counter.values())
    return 1 / math.sqrt(sumNorm)
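# Worked example: for the query tokens ['python', 'program'] each term appears
# once, so |q| = sqrt(1^2 + 1^2) and
#   normalizeQuery(Counter(['python', 'program'])) -> 1 / sqrt(2) ~= 0.7071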
########################################################################################################################
############################################ R A N K I N G M E T H O D S #############################################
########################################################################################################################
"""
1. Boolean (Search title and anchor)
2. MB25 combine with binary (Search)
3. Cosine Similarity (Search body)
"""
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~ B O O L E A N ~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def booleanRanking(inverted_index, query):
    """
    Binary ranking: a document's score is the number of distinct query terms
    that appear in it.
    """
    query_tokens = np.unique(tokenizationQuery(query, doStemming=False))
    ID_SCORE = {}
    try:
        for term in query_tokens:
            if term in inverted_index.df:
                pls = inverted_index.read_posting_list(term, bucket_name)
                for doc_id, _tf in pls:
                    ID_SCORE[doc_id] = ID_SCORE.get(doc_id, 0) + 1
    except Exception:  # a failed posting-list read returns the scores gathered so far
        pass
    return ID_SCORE
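# Usage sketch (assumes an inverted index built over titles or anchor text;
# the variable name is illustrative):
#   booleanRanking(inverted_title, "machine learning")
#   -> {doc_id: 2, ...}   # a document containing both query terms scores 2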
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~ B M 2 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
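# The BM25 score computed in this section (standard formulation, for reference):
#   score(q, d) = sum over t in q of
#       idf(t) * tf(t, d) * (k1 + 1) / (tf(t, d) + k1 * (1 - b + b * |d| / AVGDL))
# where |d| is the document length (from DL) and AVGDL is the average length.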
def calc_idf(list_of_tokens, index):
    """
    Calculate the idf value for each query term according to the BM25 idf formula:
        idf(t) = ln(1 + (N - n_t + 0.5) / (n_t + 0.5))
    where N is the corpus size and n_t is the term's document frequency.
    Parameters:
    -----------
    list_of_tokens: list of tokens representing the query. For example: ['look', 'blue', 'sky']
    index: inverted index whose df mapping supplies n_t.
    Returns:
    -----------
    idf: dictionary of idf scores. As follows:
        key: term
        value: bm25 idf score
    """
    idf = {}
    N = len(DL)  # number of documents in the corpus
    for term in list_of_tokens:
        if term in index.df:
            n_ti = index.df[term]
            idf[term] = math.log(1 + (N - n_ti + 0.5) / (n_ti + 0.5))
    return idf
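# Worked example with the corpus size noted in search_body_implement below
# (N = 6,348,910): a term that appears in 1,000 documents gets
#   idf = ln(1 + (6348910 - 1000 + 0.5) / (1000 + 0.5)) ~= 8.76
# so rarer terms contribute more to the BM25 sum.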
def BM25_and_binary_search(inverted_body, inverted_title, query, k1=1.5, b=0.75, N=None):
    """
    Score documents with BM25 over the body index and binary ranking over the
    title index, then merge the two result lists with a weighted sum.
    N is accepted for API symmetry; the merged result is a fixed top 20.
    """
    TITLE_WEIGHT = 0.8
    BODY_WEIGHT = 0.2
    score = {}
    tokens_without_stopwords = tokenizationQuery(query, doStemming=False)
    corpus_size = len(DL)
    AVGDL = sum(DL.values()) / corpus_size
    idf = calc_idf(tokens_without_stopwords, inverted_body)
    for token in tokens_without_stopwords:
        try:
            # slice the posting list to its first 3000 entries, then build a dict
            # (a dict itself cannot be sliced)
            pls = dict(inverted_body.read_posting_list(token, bucket_name)[:3000])
        except Exception:
            continue
        for doc_id, tf in pls.items():
            if doc_id not in score:
                score[doc_id] = 0
            numerator = idf.get(token, 0) * tf * (k1 + 1)
            # fall back to AVGDL when a document's length is missing from DL
            denominator = tf + k1 * (1 - b + b * DL.get(doc_id, AVGDL) / AVGDL)
            try:
                score[doc_id] += numerator / denominator  # sum up the score for each term in the query
            except ZeroDivisionError:
                pass
    bodyVal = sorted(score.items(), key=lambda x: x[1], reverse=True)
    TITLE_SCORE = booleanRanking(inverted_title, query)
    titleVal = sorted(TITLE_SCORE.items(), key=lambda x: x[1], reverse=True)
    merge_res = {}
    # merge the body and title results: normalize each score, then combine with
    # a weighted sum when a document appears in both lists
    for docBody, scoreBody in bodyVal[:100]:
        scoreBody = scoreBody / bodyVal[0][1]  # normalize by the best body score
        if merge_res.get(docBody, 0) == 0:
            merge_res[docBody] = scoreBody
        else:
            merge_res[docBody] = merge_res.get(docBody) * TITLE_WEIGHT + BODY_WEIGHT * scoreBody
    for docTitle, scoreTitle in titleVal[:100]:
        scoreTitle = scoreTitle / len(tokens_without_stopwords)  # fraction of query terms in the title
        if merge_res.get(docTitle, 0) == 0:
            merge_res[docTitle] = scoreTitle
        else:
            merge_res[docTitle] = merge_res.get(docTitle) * BODY_WEIGHT + TITLE_WEIGHT * scoreTitle
    return returnTopNdocWithTitles(inverted_title=inverted_title, searchDictVal=merge_res, N=20)
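# Usage sketch (assumes both indexes were already loaded from the bucket; the
# variable names are illustrative):
#   results = BM25_and_binary_search(inverted_body, inverted_title, "machine learning")
#   # -> up to 20 (wiki_id, title) pairs, best first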
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~ C O S I N E S I M I L A R I T Y ~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def search_body_implement(inverted_body, query):
    """
    Returns up to 100 search results for the query using tf-idf and cosine
    similarity over the body of articles only. No stemming is applied; the
    staff-provided tokenizer from Assignment 3 (GCP part) handles the
    tokenization and stopword removal.
    Args:
        inverted_body: inverted index over article bodies.
        query: string of query words separated by spaces.
    Returns:
        list of up to 100 search results, ordered from best to worst, where
        each element is a tuple (wiki_id, title).
    """
    query_tokens = tokenizationQuery(query, doStemming=False)
    word_counter = Counter(query_tokens)
    docRank = {}  # dictionary of {wiki_id: accumulated tf-idf}
    NUMBER_OF_DOCS = inverted_body.numberOfDocs  # number of docs in the corpus is 6348910
    cosineSimilarity = {}
    try:
        for term in query_tokens:
            pls = inverted_body.read_posting_list(term, bucket_name)
            idf = math.log(NUMBER_OF_DOCS / inverted_body.df[term], 10)  # calculate idf (base 10)
            # accumulate the tf-idf dot product per document
            for doc_id, tf in pls:
                docRank[doc_id] = docRank.get(doc_id, 0) + tf * idf
        normalize_query = normalizeQuery(word_counter)  # calculate (1/|q|)
        for doc, dot_product in docRank.items():
            nfi = inverted_body.nf.get(doc, 0)  # precomputed document norm factor (1/|d|)
            cosineSimilarity[doc] = dot_product * normalize_query * nfi
    except Exception:  # a failed posting-list read yields whatever was scored so far
        pass
    # inverted_body is assumed to expose TitlesOfDocs too, so it doubles as the title source
    return returnTopNdocWithTitles(inverted_title=inverted_body, searchDictVal=cosineSimilarity, N=100)
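# Usage sketch (assumes a body index exposing read_posting_list, df, nf,
# numberOfDocs, and TitlesOfDocs; the variable name is illustrative):
#   search_body_implement(inverted_body, "best marvel movie")
#   # -> up to 100 (wiki_id, title) pairs, best first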