SimilarEntities.py
import ast
import csv
import logging

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import faiss  # Facebook AI Similarity Search

# Setup logging to print to console only
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# GPU Configuration
torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")


def get_embeddings(texts, tokenizer, model, batch_size=256):
    """
    Generate GPU-accelerated embeddings for a list of texts.
    Processes texts in batches to optimize memory usage.
    """
    model.eval()
    embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(
                batch_texts,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512
            ).to(device)
            outputs = model(**inputs, output_hidden_states=True, return_dict=True)
            # Mean-pool the last hidden layer over the token dimension
            batch_embeddings = outputs.hidden_states[-1].mean(dim=1)  # Shape: (batch_size, hidden_size)
            embeddings.append(batch_embeddings.cpu().numpy())
            logger.debug(f"Processed batch {(i // batch_size) + 1}")
    embeddings = np.vstack(embeddings)  # Shape: (num_texts, hidden_size)
    return embeddings
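
# Minimal usage sketch (illustrative only; the hidden size depends on the loaded model):
#   embs = get_embeddings(["first object", "second object"], tokenizer, model, batch_size=2)
#   embs.shape  # -> (2, hidden_size), dtype float32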


def build_faiss_index(embeddings):
    """
    Build a FAISS index for the given embeddings.
    """
    dimension = embeddings.shape[1]
    logger.info("Building FAISS index on CPU...")
    # Inner-product index over L2-normalized vectors is equivalent to cosine similarity
    index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(embeddings)  # normalizes in place
    index.add(embeddings)
    logger.info(f"FAISS index built with {index.ntotal} vectors.")
    return index
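
# Optional sketch for moving the index to GPU (assumes the faiss-gpu build is
# installed; not required for the pipeline below):
#   res = faiss.StandardGpuResources()
#   gpu_index = faiss.index_cpu_to_gpu(res, 0, index)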


def find_similar_objects(embeddings, index, objects, similarity_threshold=0.8, top_k=100):
    """
    For each embedding, find similar objects with similarity above the threshold.
    """
    logger.info("Searching for similar objects using FAISS...")
    faiss.normalize_L2(embeddings)  # no-op if already normalized; keeps the call self-contained
    similarities, indices = index.search(embeddings, top_k)
    similar_objects_list = []
    for idx, (sim, ind) in enumerate(zip(similarities, indices)):
        # Keep neighbors above the threshold, excluding the query object itself
        similar = [
            objects[index_]
            for score, index_ in zip(sim, ind)
            if score >= similarity_threshold and index_ != idx
        ]
        similar_objects_list.append(similar)
    logger.info("Similarity search completed.")
    return similar_objects_list
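
# Hypothetical example: with objects = ["acme corp", "acme corporation", "foo ltd"]
# and a 0.8 threshold, the result might look like [["acme corporation"], ["acme corp"], []].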


def serialize_list_with_double_quotes(lst):
    """Serialize a list of strings ensuring all elements are wrapped in double quotes."""
    return '[' + ', '.join(['"{}"'.format(s.replace('"', '\\"')) for s in lst]) + ']'
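
# Example: serialize_list_with_double_quotes(['plain', 'with "quotes"'])
# returns the string '["plain", "with \"quotes\""]'.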


def main():
    try:
        # Configuration
        model_name = "NovaSearch/stella_en_1.5B_v5"
        csv_path = "Object_Value_Counts.csv"
        embeddings_output_path = "object_embeddings.npy"
        objects_output_path = "objects_list.npy"
        similarity_threshold = 0.8
        top_k = 100
        filtered_output_path = "filtered_results_with_similars.csv"
        cleaned_output_path = "cleaned_file.csv"
        merged_file_path = "merged.csv"
        final_output_path = "output_file.csv"

        # Load tokenizer and model
        logger.info("Loading tokenizer and model")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device)
        logger.info("Tokenizer and model loaded successfully")

        # Read CSV and generate embeddings
        logger.info(f"Reading CSV from {csv_path}")
        data = pd.read_csv(csv_path)
        objects = data["Object"].astype(str).tolist()
        logger.info(f"Loaded {len(objects)} objects from CSV")
        logger.info("Generating embeddings for all objects")
        embeddings = get_embeddings(objects, tokenizer, model)
        np.save(embeddings_output_path, embeddings)
        np.save(objects_output_path, np.array(objects))
        logger.info("Embeddings and objects saved successfully")

        # Build FAISS index and find similar objects
        index = build_faiss_index(embeddings)
        similar_objects_list = find_similar_objects(embeddings, index, objects, similarity_threshold, top_k)

        # Serialize similar objects and save results
        data["Similar_Objects"] = [serialize_list_with_double_quotes(lst) for lst in similar_objects_list]
        data.to_csv(filtered_output_path, index=False, quoting=csv.QUOTE_MINIMAL)
        logger.info(f"Filtered results saved to {filtered_output_path}")

        # Remove rows whose Object already appears in another object's similar list
        logger.info("Removing duplicate objects")
        data = pd.read_csv(filtered_output_path)
        data["Similar_Objects"] = data["Similar_Objects"].str.lower()
        unique_data = data[~data.apply(
            lambda row: any(str(row["Object"]).lower() in obj for obj in ast.literal_eval(row["Similar_Objects"])),
            axis=1
        )]
        unique_data.to_csv(cleaned_output_path, index=False)
        logger.info(f"Cleaned data saved to {cleaned_output_path}")

        # Update merged file: map each similar variant back to its canonical Object
        logger.info("Updating merged file")
        cleaned_data = pd.read_csv(cleaned_output_path)
        merged_data = pd.read_csv(merged_file_path)
        cleaned_data["Similar_Objects"] = cleaned_data["Similar_Objects"].apply(ast.literal_eval)
        mapping = {
            similar.lower(): row["Object"]
            for _, row in cleaned_data.iterrows()
            for similar in row["Similar_Objects"]
        }
        merged_data["Object"] = merged_data["Object"].fillna("None").apply(
            lambda obj: mapping.get(str(obj).lower(), obj)
        )
        merged_data.to_csv(final_output_path, index=False)
        logger.info(f"Final output saved to {final_output_path}")
    except Exception:
        logger.exception("An error occurred")


if __name__ == "__main__":
    main()
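
# Expected inputs (inferred from the code above, not verified against the data):
#   Object_Value_Counts.csv - must contain an "Object" column (one entity name per row)
#   merged.csv              - must contain an "Object" column whose values get remapped
# Run with:
#   python SimilarEntities.py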