diff --git a/Package.swift b/Package.swift index 31fcb64..91d4f75 100644 --- a/Package.swift +++ b/Package.swift @@ -18,6 +18,7 @@ let package = Package( name: "SimilaritySearchKitDistilbert", targets: ["SimilaritySearchKitDistilbert"] ), + .library( name: "SimilaritySearchKitMiniLMAll", targets: ["SimilaritySearchKitMiniLMAll"] diff --git a/Sources/SimilaritySearchKit/AddOns/Embeddings/BGE/BGEEmbeddings.swift b/Sources/SimilaritySearchKit/AddOns/Embeddings/BGE/BGEEmbeddings.swift new file mode 100644 index 0000000..076550e --- /dev/null +++ b/Sources/SimilaritySearchKit/AddOns/Embeddings/BGE/BGEEmbeddings.swift @@ -0,0 +1,66 @@ +// +// MiniLMAllEmbeddings.swift +// +// +// Created by Zach Nagengast on 4/20/23. +// + +import CoreML +import Foundation +import SimilaritySearchKit + +@available(macOS 12.0, iOS 15.0, *) +public class BGEEmbeddings: EmbeddingsProtocol { + public let model: BGE_small + public let tokenizer: BertTokenizer + public let inputDimention: Int = 512 + public let outputDimention: Int = 384 + + public init(tokenizer: BertTokenizer? = nil) { + let modelConfig = MLModelConfiguration() + modelConfig.computeUnits = .all + print("INIT BGE", tokenizer == nil ? "NO TOKENIZER" : "tokenizer provided") + do { + self.model = try BGE_small(configuration: modelConfig) + print("INIT BGE MODEL LOADED") + + } catch { + fatalError("Failed to load the Core ML model. Error: \(error.localizedDescription)") + } + print("INIT BGE Tokenizer adding") + + self.tokenizer = tokenizer ?? BertTokenizer() + } + + // MARK: - Dense Embeddings + + public func encode(sentence: String) async -> [Float]? { + // Encode input text as bert tokens + let inputTokens = Array(tokenizer.buildModelTokens(sentence: sentence)) + let (inputIds, attentionMask) = tokenizer.buildModelInputs(from: inputTokens) + + // Send tokens through the MLModel + // print(inputIds.count, inputIds[0]) + // print(inputIds, attentionMask) + let embeddings = generateEmbeddings(inputIds: inputIds, attentionMask: attentionMask) + + return embeddings + } + + public func generateEmbeddings(inputIds: MLMultiArray, attentionMask: MLMultiArray) -> [Float]? { + let inputFeatures = BGE_smallInput(input_ids: inputIds, attention_mask_1: attentionMask) + // print(inputFeatures) + let output = try? model.prediction(input: inputFeatures) + + guard let embeddings = output?.var_1059 else { + print("failed") + print(output) + return nil + } + // print("EMBEDDINGS") + // print(embeddings) + let embeddingsArray: [Float] = (0.. [Float]? { + let embeddings: [Float]? = try? await withCheckedContinuation { continuation in + getEmbedding(url: url, options: ["texts": sentence]) { (result) in + if let error = result["error"] as? Never { + print("Error encoding:", error.localizedDescription) + continuation.resume(throwing: error) + return + } + // if (sentence != "Test sentence"){ + // assumes the web embeddings can embed multiple sentences and thus returns an array of embeddings + let arrayOfEmbeddings = result["data"] as? NSArray + // assuming you've embedded one sentence and just once that, so gets the first array + let firstEmbedding = arrayOfEmbeddings?[0] as? NSArray + let floatArray: [Float]? = (firstEmbedding)?.compactMap { + // Try to convert each element to Float + if let number = $0 as? NSNumber { + return number.floatValue + } else if let string = $0 as? String, let floatValue = Float(string) { + return floatValue + } + return nil + } + print(floatArray?.count ?? "No count", result) + + // } + continuation.resume(returning: floatArray) + // continuation.success(embeddings) + } + + } + return embeddings + } +} + +func getEmbedding( + url: String, options: [String: Any]?, completion: @escaping (([String: Any]) -> Void) +) { + + let headers = [ + "Content-Type": "application/json", + "Authorization": "Bearer token", // Replace with the appropriate value + ] + + let model = options?["model"] as? String + func generateOptionString(options: [String: Any]?) -> String { + guard let options = options else { return "" } + return options.map { (key, value) in + "&\(key)=\("\(value ?? "")".addingPercentEncoding(withAllowedCharacters: .urlHostAllowed) ?? "")" + }.joined() + } + do { + let baseURL = url + let url = baseURL + generateOptionString(options: options) + print(url) + var request = URLRequest(url: URL(string: url)!) + var requestBody: Data? = nil + if let requestBody = requestBody, + let data = try? JSONSerialization.data(withJSONObject: requestBody, options: []) + { + + request.httpBody = data + request.httpMethod = "POST" + + } else { + print("No request body", requestBody) + request.httpMethod = "GET" + } + + request.allHTTPHeaderFields = headers + + let task = URLSession.shared.dataTask(with: request) { (data, response, error) in + guard let data = data, error == nil else { + print("Error: \(error?.localizedDescription ?? "Unknown error")") + return + } + // parse the result as JSON, since that's what the API provides + do { + + if let json = try JSONSerialization.jsonObject(with: data, options: []) + as? [String: Any] + { + if json["data"] == nil { print(json) } + + completion(json) + + } else { + print("Unable to parse JSON, \(error?.localizedDescription ?? "Unknown error")") + + } + } catch let error { + let responseString = String(data: data, encoding: .utf8) + print("Error parsing JSON: \(error)\nJSON:\n\(responseString)") + print( + "URLSession response: \(String(data: data, encoding: .utf8) ?? "Unable to decode response")" + ) + + } + } + task.resume() + } + +} diff --git a/Sources/SimilaritySearchKit/Core/Embeddings/Tokenizers/BertTokenizer.swift b/Sources/SimilaritySearchKit/Core/Embeddings/Tokenizers/BertTokenizer.swift index 96d15cc..2072db3 100644 --- a/Sources/SimilaritySearchKit/Core/Embeddings/Tokenizers/BertTokenizer.swift +++ b/Sources/SimilaritySearchKit/Core/Embeddings/Tokenizers/BertTokenizer.swift @@ -16,9 +16,12 @@ public class BertTokenizer: TokenizerProtocol { private let vocab: [String: Int] private let ids_to_tokens: [Int: String] - public init() { - let url = Bundle.module.url(forResource: "bert_tokenizer_vocab", withExtension: "txt")! - let vocabTxt = try! String(contentsOf: url) + public init(vocabText: String? = nil) { + var vocabTxt = vocabText ?? ""; + if vocabTxt.isEmpty { + let url = Bundle.module.url(forResource: "bert_tokenizer_vocab", withExtension: "txt")! + vocabTxt = try! String(contentsOf: url) + } let tokens = vocabTxt.split(separator: "\n").map { String($0) } var vocab: [String: Int] = [:] var ids_to_tokens: [Int: String] = [:] diff --git a/Sources/SimilaritySearchKit/Core/Index/SimilarityIndex.swift b/Sources/SimilaritySearchKit/Core/Index/SimilarityIndex.swift index 6e85b4d..b66b9ca 100644 --- a/Sources/SimilaritySearchKit/Core/Index/SimilarityIndex.swift +++ b/Sources/SimilaritySearchKit/Core/Index/SimilarityIndex.swift @@ -18,312 +18,351 @@ public typealias VectorStoreType = SimilarityIndex.VectorStoreType @available(macOS 11.0, iOS 15.0, *) public class SimilarityIndex { - // MARK: - Properties - - /// The items stored in the index. - public var indexItems: [IndexItem] = [] - - /// The dimension of the embeddings in the index. - /// Used to validate emebdding updates - public private(set) var dimension: Int = 0 - - /// The name of the index. - public var indexName: String - - public let indexModel: any EmbeddingsProtocol - public var indexMetric: any DistanceMetricProtocol - public let vectorStore: any VectorStoreProtocol - - /// An object representing an item in the index. - public struct IndexItem: Codable { - /// The unique identifier of the item. - public let id: String - - /// The text associated with the item. - public var text: String - - /// The embedding vector of the item. - public var embedding: [Float] - - /// A dictionary containing metadata for the item. - public var metadata: [String: String] + // MARK: - Properties + + /// The items stored in the index. + public var indexItems: [IndexItem] = [] + + /// The dimension of the embeddings in the index. + /// Used to validate emebdding updates + public private(set) var dimension: Int = 0 + + /// The name of the index. + public var indexName: String + + public let indexModel: any EmbeddingsProtocol + public var indexMetric: any DistanceMetricProtocol + public let vectorStore: any VectorStoreProtocol + + /// An object representing an item in the index. + public struct IndexItem: Codable { + /// The unique identifier of the item. + public let id: String + + /// The text associated with the item. + public var text: String + + /// The embedding vector of the item. + public var embedding: [Float] + + /// A dictionary containing metadata for the item. + public var metadata: [String: String] + } + + /// An Identifiable object containing information about a search result. + public struct SearchResult: Identifiable { + /// The unique identifier of the associated index item + public let id: String + + /// The similarity score between the query and the result. + public let score: Float + + /// The text associated with the result. + public let text: String + + /// A dictionary containing metadata for the result. + public let metadata: [String: String] + } + + /// An enumeration of available embedding models. + public enum EmbeddingModelType { + /// DistilBERT, a small version of BERT model fine tuned for questing-answering. + case distilbert + + /// MiniLM All, a smaller but faster model. + case minilmAll + + /// Multi-QA MiniLM, a fast model fine-tuned for question-answering tasks. + case minilmMultiQA + + /// A native model provided by Apple's NaturalLanguage library. + case native + } + + public enum SimilarityMetricType { + case dotproduct + case cosine + case euclidian + } + + public enum TextSplitterType { + case token + case character + case recursive + } + + public enum VectorStoreType { + case json + // TODO: + // case mlmodel + // case protobuf + // case sqlite + } + + // MARK: - Initializers + + public init( + name: String? = nil, model: (any EmbeddingsProtocol)? = nil, + metric: (any DistanceMetricProtocol)? = nil, vectorStore: (any VectorStoreProtocol)? = nil + ) async { + // Setup index with defaults + self.indexName = name ?? "SimilaritySearchKitIndex" + self.indexModel = model ?? NativeEmbeddings() + self.indexMetric = metric ?? CosineSimilarity() + self.vectorStore = vectorStore ?? JsonStore() + + // Run the model once to discover dimention size + await setupDimension() + } + + private func setupDimension() async { + if let testVector = await indexModel.encode(sentence: "Test sentence") { + dimension = testVector.count + } else { + print("Failed to generate a test input vector.") } - - /// An Identifiable object containing information about a search result. - public struct SearchResult: Identifiable { - /// The unique identifier of the associated index item - public let id: String - - /// The similarity score between the query and the result. - public let score: Float - - /// The text associated with the result. - public let text: String - - /// A dictionary containing metadata for the result. - public let metadata: [String: String] + } + + // MARK: - Encoding + + public func getEmbedding(for text: String, embedding: [Float]? = nil) async -> [Float] { + if let embedding = embedding, embedding.count == dimension { + // Valid embedding, no encoding needed + return embedding + } else { + // Encoding needed before adding to index + guard let encoded = await indexModel.encode(sentence: text) else { + print("Failed to encode text. \(text)") + return Array(repeating: Float(0), count: dimension) + } + return encoded } + } - /// An enumeration of available embedding models. - public enum EmbeddingModelType { - /// DistilBERT, a small version of BERT model fine tuned for questing-answering. - case distilbert - - /// MiniLM All, a smaller but faster model. - case minilmAll + // MARK: - Search - /// Multi-QA MiniLM, a fast model fine-tuned for question-answering tasks. - case minilmMultiQA - - /// A native model provided by Apple's NaturalLanguage library. - case native + public func search( + _ query: String, top resultCount: Int? = nil, metric: DistanceMetricProtocol? = nil + ) async -> [SearchResult] { + let resultCount = resultCount ?? 5 + guard let queryEmbedding = await indexModel.encode(sentence: query) else { + print("Failed to generate query embedding for '\(query)'.") + return [] } - public enum SimilarityMetricType { - case dotproduct - case cosine - case euclidian - } - - public enum TextSplitterType { - case token - case character - case recursive - } - - public enum VectorStoreType { - case json - // TODO: - // case mlmodel - // case protobuf - // case sqlite - } - - // MARK: - Initializers - - public init(name: String? = nil, model: (any EmbeddingsProtocol)? = nil, metric: (any DistanceMetricProtocol)? = nil, vectorStore: (any VectorStoreProtocol)? = nil) async { - // Setup index with defaults - self.indexName = name ?? "SimilaritySearchKitIndex" - self.indexModel = model ?? NativeEmbeddings() - self.indexMetric = metric ?? CosineSimilarity() - self.vectorStore = vectorStore ?? JsonStore() - - // Run the model once to discover dimention size - await setupDimension() - } - - private func setupDimension() async { - if let testVector = await indexModel.encode(sentence: "Test sentence") { - dimension = testVector.count - } else { - print("Failed to generate a test input vector.") - } - } - - // MARK: - Encoding - - public func getEmbedding(for text: String, embedding: [Float]? = nil) async -> [Float] { - if let embedding = embedding, embedding.count == dimension { - // Valid embedding, no encoding needed - return embedding - } else { - // Encoding needed before adding to index - guard let encoded = await indexModel.encode(sentence: text) else { - print("Failed to encode text. \(text)") - return Array(repeating: Float(0), count: dimension) - } - return encoded - } - } - - // MARK: - Search - - public func search(_ query: String, top resultCount: Int? = nil, metric: DistanceMetricProtocol? = nil) async -> [SearchResult] { - let resultCount = resultCount ?? 5 - guard let queryEmbedding = await indexModel.encode(sentence: query) else { - print("Failed to generate query embedding for '\(query)'.") - return [] - } - - var indexIds: [String] = [] - var indexEmbeddings: [[Float]] = [] - - indexItems.forEach { item in - indexIds.append(item.id) - indexEmbeddings.append(item.embedding) - } + var indexIds: [String] = [] + var indexEmbeddings: [[Float]] = [] - // Calculate distances and find nearest neighbors - if let customMetric = metric { - // Allow custom metrics at time of query - indexMetric = customMetric - } - let searchResults = indexMetric.findNearest(for: queryEmbedding, in: indexEmbeddings, resultsCount: resultCount) - - // Map results to index ids - return searchResults.compactMap { result in - let (score, index) = result - let id = indexIds[index] - - if let item = getItem(id: id) { - return SearchResult(id: item.id, score: score, text: item.text, metadata: item.metadata) - } else { - print("Failed to find item with id '\(id)' in indexItems.") - return SearchResult(id: "000000", score: 0.0, text: "fail", metadata: [:]) - } - } + indexItems.forEach { item in + indexIds.append(item.id) + indexEmbeddings.append(item.embedding) } - public class func combinedResultsString(_ results: [SearchResult]) -> String { - let combinedResults = results.map { result -> String in - let metadataString = result.metadata.map { key, value in - "\(key.uppercased()): \(value)" - }.joined(separator: "\n") - - return "\(result.text)\n\(metadataString)" - }.joined(separator: "\n\n") - - return combinedResults + // Calculate distances and find nearest neighbors + if let customMetric = metric { + // Allow custom metrics at time of query + indexMetric = customMetric } - - public class func exportLLMPrompt(query: String, results: [SearchResult]) -> String { - let sourcesText = combinedResultsString(results) - let prompt = - """ - Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). - If you don't know the answer, just say that you don't know. Don't try to make up an answer. - ALWAYS return a "SOURCES" part in your answer. - - QUESTION: \(query) - ========= - \(sourcesText) - ========= - FINAL ANSWER: - """ - return prompt + let searchResults = indexMetric.findNearest( + for: queryEmbedding, in: indexEmbeddings, resultsCount: resultCount) + + // Map results to index ids + return searchResults.compactMap { result in + let (score, index) = result + let id = indexIds[index] + + if let item = getItem(id: id) { + return SearchResult(id: item.id, score: score, text: item.text, metadata: item.metadata) + } else { + print("Failed to find item with id '\(id)' in indexItems.") + return SearchResult(id: "000000", score: 0.0, text: "fail", metadata: [:]) + } } + } + + public class func combinedResultsString(_ results: [SearchResult]) -> String { + let combinedResults = results.map { result -> String in + let metadataString = result.metadata.map { key, value in + "\(key.uppercased()): \(value)" + }.joined(separator: "\n") + + return "\(result.text)\n\(metadataString)" + }.joined(separator: "\n\n") + + return combinedResults + } + + public class func exportLLMPrompt(query: String, results: [SearchResult]) -> String { + let sourcesText = combinedResultsString(results) + let prompt = + """ + Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). + If you don't know the answer, just say that you don't know. Don't try to make up an answer. + ALWAYS return a "SOURCES" part in your answer. + + QUESTION: \(query) + ========= + \(sourcesText) + ========= + FINAL ANSWER: + """ + return prompt + } } // MARK: - CRUD @available(macOS 11.0, iOS 15.0, *) extension SimilarityIndex { - // MARK: Create - - // Add an item with optional pre-computed embedding - public func addItem(id: String, text: String, metadata: [String: String], embedding: [Float]? = nil) async { - let embeddingResult = await getEmbedding(for: text, embedding: embedding) - - let item = IndexItem(id: id, text: text, embedding: embeddingResult, metadata: metadata) - indexItems.append(item) + // MARK: Create + + // Add an item with optional pre-computed embedding + public func addItem( + id: String, text: String, metadata: [String: String], embedding: [Float]? = nil + ) async { + let embeddingResult = await getEmbedding(for: text, embedding: embedding) + + let item = IndexItem(id: id, text: text, embedding: embeddingResult, metadata: metadata) + indexItems.append(item) + } + + public func addItems( + ids: [String], texts: [String], metadata: [[String: String]], embeddings: [[Float]?]? = nil, + onProgress: ((String) -> Void)? = nil + ) async { + // Check if all input arrays have the same length + guard ids.count == texts.count, texts.count == metadata.count else { + fatalError("Input arrays must have the same length.") } - public func addItems(ids: [String], texts: [String], metadata: [[String: String]], embeddings: [[Float]?]? = nil, onProgress: ((String) -> Void)? = nil) async { - // Check if all input arrays have the same length - guard ids.count == texts.count, texts.count == metadata.count else { - fatalError("Input arrays must have the same length.") - } - - if let embeddings = embeddings, embeddings.count != ids.count { - print("Embeddings array length must be the same as ids array length. \(embeddings.count) vs \(ids.count)") - } - - await withTaskGroup(of: Void.self) { taskGroup in - for i in 0.. IndexItem? { - return indexItems.first { $0.id == id } - } + public func getItem(id: String) -> IndexItem? { + return indexItems.first { $0.id == id } + } - public func sample(_ count: Int) -> [IndexItem]? { - return Array(indexItems.prefix(upTo: count)) - } + public func sample(_ count: Int) -> [IndexItem]? { + return Array(indexItems.prefix(upTo: count)) + } - // MARK: Update + // MARK: Update - public func updateItem(id: String, text: String? = nil, embedding: [Float]? = nil, metadata: [String: String]? = nil) { - // Check if the provided embedding has the correct dimension - if let embedding = embedding, embedding.count != dimension { - print("Dimension mismatch, expected \(dimension), saw \(embedding.count)") - } + public func updateItem( + id: String, text: String? = nil, embedding: [Float]? = nil, metadata: [String: String]? = nil + ) { + // Check if the provided embedding has the correct dimension + if let embedding = embedding, embedding.count != dimension { + print("Dimension mismatch, expected \(dimension), saw \(embedding.count)") + } - // Find the item with the specified id - if let index = indexItems.firstIndex(where: { $0.id == id }) { - // Update the text if provided - if let text = text { - indexItems[index].text = text - } - - // Update the embedding if provided - if let embedding = embedding { - indexItems[index].embedding = embedding - } - - // Update the metadata if provided - if let metadata = metadata { - indexItems[index].metadata = metadata - } - } + // Find the item with the specified id + if let index = indexItems.firstIndex(where: { $0.id == id }) { + // Update the text if provided + if let text = text { + indexItems[index].text = text + } + + // Update the embedding if provided + if let embedding = embedding { + indexItems[index].embedding = embedding + } + + // Update the metadata if provided + if let metadata = metadata { + indexItems[index].metadata = metadata + } } + } - // MARK: Delete + // MARK: Delete - public func removeItem(id: String) { - indexItems.removeAll { $0.id == id } - } + public func removeItem(id: String) { + indexItems.removeAll { $0.id == id } + } - public func removeAll() { - indexItems.removeAll() - } + public func removeAll() { + indexItems.removeAll() + } } // MARK: - Persistence -@available(macOS 13.0, iOS 16.0, *) +@available(macOS 12.0, iOS 16.0, *) extension SimilarityIndex { - public func saveIndex(toDirectory path: URL? = nil, name: String? = nil) throws -> URL { - let indexName = name ?? self.indexName - let basePath: URL - - if let specifiedPath = path { - basePath = specifiedPath - } else { - // Default local path - basePath = try getDefaultStoragePath() - } - - let savedVectorStore = try vectorStore.saveIndex(items: indexItems, to: basePath, as: indexName) - - print("Saved \(indexItems.count) index items to \(savedVectorStore.absoluteString)") - - return savedVectorStore + public func saveIndex(toDirectory path: URL? = nil, name: String? = nil) throws -> URL { + let indexName = name ?? self.indexName + let basePath: URL + + if let specifiedPath = path { + basePath = specifiedPath + } else { + // Default local path + basePath = try getDefaultStoragePath() } - public func loadIndex(fromDirectory path: URL? = nil, name: String? = nil) throws -> [IndexItem]? { + let savedVectorStore = try vectorStore.saveIndex(items: indexItems, to: basePath, as: indexName) + + print("Saved \(indexItems.count) index items to \(savedVectorStore.absoluteString)") + + return savedVectorStore + } + + // public func loadIndex(fromDirectory path: URL? = nil, name: String? = nil) throws -> [IndexItem]? + // { + // let indexName = name ?? self.indexName + // let basePath: URL + + // if let specifiedPath = path { + // basePath = specifiedPath + // } else { + // // Default local path + // basePath = try getDefaultStoragePath() + // } + + // if let vectorStorePath = vectorStore.listIndexes(at: basePath).first(where: { + // $0.lastPathComponent.contains(indexName) + // }) { + // let loadedIndexItems = try vectorStore.loadIndex(from: vectorStorePath) + // addItems(loadedIndexItems) + // print("Loaded \(indexItems.count) index items from \(vectorStorePath.absoluteString)") + // return loadedIndexItems + // } + + // return nil + // } + public func loadIndex(fromDirectory path: URL? = nil, name: String? = nil) throws -> [IndexItem]? { if let indexPath = try getIndexPath(fromDirectory: path, name: name) { let loadedIndexItems = try vectorStore.loadIndex(from: indexPath) addItems(loadedIndexItems) @@ -354,44 +393,46 @@ extension SimilarityIndex { return vectorStore.listIndexes(at: basePath).first(where: { $0.lastPathComponent.contains(indexName) }) } - private func getDefaultStoragePath() throws -> URL { - let appName = Bundle.main.bundleIdentifier ?? "SimilaritySearchKit" - let fileManager = FileManager.default - let appSupportDirectory = try fileManager.url(for: .applicationSupportDirectory, in: .userDomainMask, appropriateFor: nil, create: true) + private func getDefaultStoragePath() throws -> URL { + let appName = Bundle.main.bundleIdentifier ?? "SimilaritySearchKit" + let fileManager = FileManager.default + let appSupportDirectory = try fileManager.url( + for: .applicationSupportDirectory, in: .userDomainMask, appropriateFor: nil, create: true) - let appSpecificDirectory = appSupportDirectory.appendingPathComponent(appName) + let appSpecificDirectory = appSupportDirectory.appendingPathComponent(appName) - if !fileManager.fileExists(atPath: appSpecificDirectory.path) { - try fileManager.createDirectory(at: appSpecificDirectory, withIntermediateDirectories: true, attributes: nil) - } - - return appSpecificDirectory + if !fileManager.fileExists(atPath: appSpecificDirectory.path) { + try fileManager.createDirectory( + at: appSpecificDirectory, withIntermediateDirectories: true, attributes: nil) } - public func estimatedSizeInBytes() -> Int { - var totalSize = 0 + return appSpecificDirectory + } - for item in indexItems { - // Calculate the size of 'id' property - let idSize = item.id.utf8.count + public func estimatedSizeInBytes() -> Int { + var totalSize = 0 - // Calculate the size of 'text' property - let textSize = item.text.utf8.count + for item in indexItems { + // Calculate the size of 'id' property + let idSize = item.id.utf8.count - // Calculate the size of 'embedding' property - let floatSize = MemoryLayout.size - let embeddingSize = item.embedding.count * floatSize + // Calculate the size of 'text' property + let textSize = item.text.utf8.count - // Calculate the size of 'metadata' property - let metadataSize = item.metadata.reduce(0) { (size, keyValue) -> Int in - let keySize = keyValue.key.utf8.count - let valueSize = keyValue.value.utf8.count - return size + keySize + valueSize - } + // Calculate the size of 'embedding' property + let floatSize = MemoryLayout.size + let embeddingSize = item.embedding.count * floatSize - totalSize += idSize + textSize + embeddingSize + metadataSize - } + // Calculate the size of 'metadata' property + let metadataSize = item.metadata.reduce(0) { (size, keyValue) -> Int in + let keySize = keyValue.key.utf8.count + let valueSize = keyValue.value.utf8.count + return size + keySize + valueSize + } - return totalSize + totalSize += idSize + textSize + embeddingSize + metadataSize } + + return totalSize + } } diff --git a/Sources/SimilaritySearchKit/Core/Persistence/BinaryStore/BinaryStore.swift b/Sources/SimilaritySearchKit/Core/Persistence/BinaryStore/BinaryStore.swift index a05d69a..7cdfea0 100644 --- a/Sources/SimilaritySearchKit/Core/Persistence/BinaryStore/BinaryStore.swift +++ b/Sources/SimilaritySearchKit/Core/Persistence/BinaryStore/BinaryStore.swift @@ -39,8 +39,9 @@ public class BinaryStore: VectorStoreProtocol { public func loadIndex(from url: URL) throws -> [IndexItem] { let compressedData = try Data(contentsOf: url) let decompressedData = compressedData.withUnsafeBytes { ptr -> Data in - let buffer = UnsafeMutablePointer.allocate(capacity: compressedData.count * 5) // assuming the compressed data is at most 5 times smaller than the original data - let decompressedSize = compression_decode_buffer(buffer, compressedData.count * 5, ptr.baseAddress!.assumingMemoryBound(to: UInt8.self), compressedData.count, nil, COMPRESSION_LZMA) + let buffer = UnsafeMutablePointer.allocate(capacity: compressedData.count * 10) // assuming the compressed data is at most 10 times smaller than the original data + let decompressedSize = compression_decode_buffer(buffer, compressedData.count * 10, ptr.baseAddress!.assumingMemoryBound(to: UInt8.self), compressedData.count, nil, COMPRESSION_LZMA) +// print("Decompressed data size: \(decompressedSize), buffer size: \(compressedData.count * 10)") return Data(bytes: buffer, count: decompressedSize) } diff --git a/Tests/SimilaritySearchKitTests/BenchmarkTests.swift b/Tests/SimilaritySearchKitTests/BenchmarkTests.swift index c3aea86..2e493aa 100644 --- a/Tests/SimilaritySearchKitTests/BenchmarkTests.swift +++ b/Tests/SimilaritySearchKitTests/BenchmarkTests.swift @@ -33,6 +33,56 @@ class BenchmarkTests: XCTestCase { XCTAssertNotNil(embeddings) } + + func testBGEEmbeddings() async { + let text = MSMarco.testPassage.text + + let model = BGEEmbeddings() + + let embeddings = await model.encode(sentence: text) + + XCTAssertNotNil(embeddings) + } + func testBGESearch() { + let searchResults = ["nappy","a moose runs amok through the park, chased by a lion", "a toddler runs giggling through the park, chased by dad", ] + let testQuery = "a laughing baby gurgles happily in the arms of its father, gazing up at his eyes" + + + + let expectation = XCTestExpectation(description: "Encoding passage texts") + + Task { + let similarityIndex = await SimilarityIndex(model: BGEEmbeddings()) +// let webIndex = await SimilarityIndex(model: WebEmbeddings()) + for searchResult in searchResults { + let item = SimilarityIndex.SearchResult(id: UUID().uuidString, score: 0, text: searchResult, metadata: [:]) + await similarityIndex.addItem( + id: item.id, + text: item.text, + metadata: item.metadata + ) +// await webIndex.addItem( +// id: item.id, +// text: item.text, +// metadata: item.metadata +// ) + } + + let top_k = await similarityIndex.search(testQuery) +// let top_k_web = await webIndex.search(testQuery) + let searchResult: SimilarityIndex.SearchResult = top_k.first! +// let webResult = top_k_web.first! + print("similarity: ", testQuery, "\n vs \n", (top_k.map { "\($0.score) - \($0.text)" }).joined(separator: "\n") ) +// print("web similarity: ", testQuery, "\n vs \n", (top_k_web.map { "\($0.score) - \($0.text)" }).joined(separator: "\n")) +// XCTAssertEqual(searchResult.text, searchPassage.text) +// XCTAssertEqual(searchResult.metadata, searchPassage.metadata) + XCTAssertNotEqual(searchResult.score, 0) + + expectation.fulfill() + } + + wait(for: [expectation], timeout: 60) + } func testDistilbertSearch() { let searchPassage = MSMarco.testPassage diff --git a/Tests/SimilaritySearchKitTests/SimilaritySearchKitTests.swift b/Tests/SimilaritySearchKitTests/SimilaritySearchKitTests.swift index c908040..ec056d2 100644 --- a/Tests/SimilaritySearchKitTests/SimilaritySearchKitTests.swift +++ b/Tests/SimilaritySearchKitTests/SimilaritySearchKitTests.swift @@ -5,8 +5,9 @@ // Created by Zach Nagengast on 4/13/23. // -import XCTest import CoreML +import XCTest + @testable import SimilaritySearchKit @testable import SimilaritySearchKitDistilbert @testable import SimilaritySearchKitMiniLMAll @@ -15,60 +16,72 @@ import CoreML @available(macOS 13.0, iOS 16.0, *) class SimilaritySearchKitTests: XCTestCase { - override func setUp() { - executionTimeAllowance = 60 - continueAfterFailure = true - } + override func setUp() { + executionTimeAllowance = 60 + continueAfterFailure = true + } + + func testSavingJsonIndex() async { + let similarityIndex = await SimilarityIndex( + model: DistilbertEmbeddings(), vectorStore: JsonStore()) - func testSavingJsonIndex() async { - let similarityIndex = await SimilarityIndex(model: DistilbertEmbeddings(), vectorStore: JsonStore()) + await similarityIndex.addItem( + id: "1", text: "Example text", metadata: ["source": "test source"], + embedding: [0.1, 0.2, 0.3]) - await similarityIndex.addItem(id: "1", text: "Example text", metadata: ["source": "test source"], embedding: [0.1, 0.2, 0.3]) + let successPath = try! similarityIndex.saveIndex(name: "TestIndexForSaving") - let successPath = try! similarityIndex.saveIndex(name: "TestIndexForSaving") + XCTAssertNotNil(successPath) + } - XCTAssertNotNil(successPath) - } - - func testLoadingJsonIndex() async { - let similarityIndex = await SimilarityIndex(model: DistilbertEmbeddings(), vectorStore: JsonStore()) + func testLoadingJsonIndex() async { + let similarityIndex = await SimilarityIndex( + model: DistilbertEmbeddings(), vectorStore: JsonStore()) - await similarityIndex.addItem(id: "1", text: "Example text", metadata: ["source": "test source"]) + await similarityIndex.addItem( + id: "1", text: "Example text", metadata: ["source": "test source"]) - let successPath = try! similarityIndex.saveIndex(name: "TestIndexForLoading") + let successPath = try! similarityIndex.saveIndex(name: "TestIndexForLoading") - XCTAssertNotNil(successPath) + XCTAssertNotNil(successPath) - let similarityIndex2 = await SimilarityIndex(model: DistilbertEmbeddings(), vectorStore: JsonStore()) + let similarityIndex2 = await SimilarityIndex( + model: DistilbertEmbeddings(), vectorStore: JsonStore()) - let loadedItems = try! similarityIndex2.loadIndex(name: "TestIndexForLoading") + let loadedItems = try! similarityIndex2.loadIndex(name: "TestIndexForLoading") - XCTAssertNotNil(loadedItems) - } + XCTAssertNotNil(loadedItems) + } - func testSavingBinaryIndex() async { - let similarityIndex = await SimilarityIndex(model: DistilbertEmbeddings(), vectorStore: BinaryStore()) + func testSavingBinaryIndex() async { + let similarityIndex = await SimilarityIndex( + model: DistilbertEmbeddings(), vectorStore: BinaryStore()) - await similarityIndex.addItem(id: "1", text: "Example text", metadata: ["source": "test source"], embedding: [0.1, 0.2, 0.3]) + await similarityIndex.addItem( + id: "1", text: "Example text", metadata: ["source": "test source"], + embedding: [0.1, 0.2, 0.3]) - let successPath = try! similarityIndex.saveIndex(name: "TestIndexForSaving") + let successPath = try! similarityIndex.saveIndex(name: "TestIndexForSaving") - XCTAssertNotNil(successPath) - } + XCTAssertNotNil(successPath) + } - func testLoadingBinaryIndex() async { - let similarityIndex = await SimilarityIndex(model: DistilbertEmbeddings(), vectorStore: BinaryStore()) + func testLoadingBinaryIndex() async { + let similarityIndex = await SimilarityIndex( + model: DistilbertEmbeddings(), vectorStore: BinaryStore()) - await similarityIndex.addItem(id: "1", text: "Example text", metadata: ["source": "test source"]) + await similarityIndex.addItem( + id: "1", text: "Example text", metadata: ["source": "test source"]) - let successPath = try! similarityIndex.saveIndex(name: "TestIndexForLoading") + let successPath = try! similarityIndex.saveIndex(name: "TestIndexForLoading") - XCTAssertNotNil(successPath) + XCTAssertNotNil(successPath) - let similarityIndex2 = await SimilarityIndex(model: DistilbertEmbeddings(), vectorStore: BinaryStore()) + let similarityIndex2 = await SimilarityIndex( + model: DistilbertEmbeddings(), vectorStore: BinaryStore()) - let loadedItems = try! similarityIndex2.loadIndex(name: "TestIndexForLoading") + let loadedItems = try! similarityIndex2.loadIndex(name: "TestIndexForLoading") - XCTAssertNotNil(loadedItems) - } + XCTAssertNotNil(loadedItems) + } }