-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_recommendations.py
88 lines (62 loc) · 2.71 KB
/
generate_recommendations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
import sys
import pickle
from utils import connect_db, query_db, update_newsboat_records
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sentence_transformers import SentenceTransformer
from torch.nn import Embedding, Linear
from torch.quantization import quantize_dynamic
# --- Script configuration (executes at import time) ---
# First CLI argument: path to the newsboat sqlite cache (e.g. cache.db).
db_path = sys.argv[1]
# Database filename without its extension; namespaces the model files below.
prefix = os.path.basename(db_path).split('.')[0]
# NOTE(review): there is no separator between prefix and suffix, so the files
# come out as e.g. "models/cachetfidf.p" -- presumably intentional; verify
# against the training script that writes these pickles.
tfidf_path = os.path.join('models', prefix + 'tfidf.p')
meta_path = os.path.join('models', prefix + 'tfidf_meta.p')
model_path = os.path.join('models', prefix + 'model.p')
# Caps: training-set size, TF-IDF vocabulary size, and emitted recommendations.
max_train = 5000
max_features = 5000
max_recommendations = 30
# CPU sentence-embedding model, dynamically quantized (int8 Linear/Embedding
# layers) for faster inference. The model download/load happens here, at
# import time, before any function runs.
cool_nlp_model = quantize_dynamic(SentenceTransformer('paraphrase-xlm-r-multilingual-v1', device='cpu'), {Linear, Embedding})
def generate_recs_from_model(meta_path, tfidf_path, model_path):
    """Score the newest feed items and return the ids of recommended ones.

    Loads a fitted TF-IDF vectorizer and two linear classifiers from pickle
    files, scores the 200 most recent ``rss_item`` rows (TF-IDF over the full
    item text weighted 0.35, sentence-embedding over the title weighted 0.65),
    and returns the ids of the best-scoring items whose stored training label
    is 0, capped at ``max_recommendations``.

    Args:
        meta_path: path to the TF-IDF metadata pickle. Accepted for interface
            compatibility; the scoring itself does not read it.
        tfidf_path: path to the pickle holding ``{'v': vectorizer, 'y': labels}``.
        model_path: path to the pickle holding ``{'clf': tfidf_svm, 'beclf': embedding_svm}``.

    Returns:
        List of ``rss_item`` ids, best-scoring first (possibly empty).
    """
    print("Generating Recommendations...")
    # Reads the module-level db_path (taken from sys.argv[1] at import time).
    sqldb = connect_db(db_path)
    records = query_db(sqldb, '''select feedurl, author, id, title, content, flags from rss_item order by pubDate DESC LIMIT 200;''')

    content_list = []
    id_list = []
    title_list = []
    for record in records:
        # We should not judge the book by its cover: the TF-IDF model sees the
        # full item (feed url + author + title + body). Guard against NULL
        # columns so the concatenation cannot raise TypeError.
        content_list.append(
            '||' + (record['feedurl'] or '') + '|| \n ||' + (record['author'] or '')
            + '|| \n ||' + (record['title'] or '') + '|| \n' + (record['content'] or ''))
        id_list.append(record['id'])
        # Yes, we are judging the book by its cover here -- but the sentence
        # embedding model only gets the title, so that is the point.
        title_list.append(record['title'] or '')

    print(f"Total {len(content_list)} feed items found")
    if not content_list:
        # Empty/new database: nothing to score, avoid crashing downstream.
        return []
    print(content_list[0])

    # SECURITY NOTE: pickle.load executes arbitrary code on load -- only ever
    # point these paths at model files you produced yourself.
    # (The meta pickle was previously loaded here too, but was never used.)
    with open(tfidf_path, 'rb') as f:
        out = pickle.load(f)
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    v = out['v']
    print("Projecting them to a mathematical space..")
    X_tfidf = v.transform(content_list)
    X_smart = cool_nlp_model.encode(title_list)
    clf = model['clf']
    beclf = model['beclf']
    y = np.array(out['y']).astype(np.float32)
    # toarray() yields a plain ndarray; todense() would return the legacy
    # np.matrix type, which sklearn deprecates as input.
    X_tfidf = X_tfidf.toarray().astype(np.float32)

    print("Recommending...")
    s_tfidf = clf.decision_function(X_tfidf)
    s_smart = beclf.decision_function(X_smart)
    # Blend the two scores; the embedding model carries the larger weight.
    s = s_smart * 0.65 + s_tfidf * 0.35
    sortix = np.argsort(-s)  # indices sorted best-scoring first
    # Keep only items whose training label is 0 (not already marked liked).
    # NOTE(review): this assumes out['y'] is aligned index-for-index with this
    # query's result set -- verify against the training script.
    recs = sortix[y[sortix] == 0]
    recs = recs[:max_recommendations]
    print(recs)
    rec_ids = [id_list[x] for x in recs]
    print(rec_ids)
    return rec_ids
if __name__ == "__main__":
    # Rank the latest feed items, then write the picks back into newsboat.
    recommended_ids = generate_recs_from_model(meta_path, tfidf_path, model_path)
    update_newsboat_records(meta_path, db_path, recommended_ids)