Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added BGE embeddings model and Web Embeddings options for similarity search #30

Open
wants to merge 35 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
31083f0
Felt like fatal errors were overkill here, and added more observability
michaeljelly Jul 10, 2023
9dc8b8d
Update SimilarityIndex.swift
michaeljelly Jul 10, 2023
3b24fe4
Update SimilarityIndex.swift
michaeljelly Jul 10, 2023
4b48d7d
Update SimilarityIndex.swift
michaeljelly Jul 10, 2023
2ec895a
Create BinaryStore.swift
michaeljelly Jul 14, 2023
84f9073
public init for BinaryStore
michaeljelly Jul 14, 2023
5fd55d2
Public save and loadIndex for similarityindex
michaeljelly Jul 14, 2023
104752d
Possible fix to EXC_BAD_ACCESS on appending to indexItems
michaeljelly Jul 14, 2023
f714fa6
Add tests
ZachNagengast Jul 15, 2023
3898491
Testing if can be made available for 12.0
michaeljelly Jul 18, 2023
d38f0e3
Update BertTokenizer.swift
michaeljelly Aug 14, 2023
38ca39c
Update BertTokenizer.swift
michaeljelly Aug 14, 2023
db77d65
Update BertTokenizer.swift
michaeljelly Aug 14, 2023
63c52d3
Update BertTokenizer.swift
michaeljelly Aug 14, 2023
8abdded
Update BertTokenizer.swift
michaeljelly Aug 14, 2023
31d5c24
Update BertTokenizer.swift
michaeljelly Aug 14, 2023
8b40214
commit
michaeljelly Nov 9, 2023
8bbf636
commit
michaeljelly Nov 27, 2023
f51314b
commit
michaeljelly Nov 27, 2023
8918418
commit
michaeljelly Nov 28, 2023
052bf21
commit
michaeljelly Nov 28, 2023
41b8bbf
merged with upstream
michaeljelly Nov 28, 2023
c9e496c
commit
michaeljelly Nov 28, 2023
dc9419a
commit
michaeljelly Nov 28, 2023
96a98d8
commit
michaeljelly Nov 30, 2023
dad0977
commit
michaeljelly Dec 2, 2023
10cda23
commit
michaeljelly Dec 2, 2023
edd6d7c
commit
michaeljelly Dec 2, 2023
1b800ce
commit
michaeljelly Dec 2, 2023
b34c103
commit
michaeljelly Dec 2, 2023
724b261
commit
michaeljelly Dec 2, 2023
dcdd96c
commit
michaeljelly Dec 2, 2023
de1db24
commit
michaeljelly Dec 2, 2023
7738aad
commit
michaeljelly Dec 2, 2023
b2685c0
Update BinaryStore.swift
michaeljelly Jan 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ let package = Package(
name: "SimilaritySearchKitDistilbert",
targets: ["SimilaritySearchKitDistilbert"]
),

.library(
name: "SimilaritySearchKitMiniLMAll",
targets: ["SimilaritySearchKitMiniLMAll"]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
//
// BGEEmbeddings.swift
//
//
// Created by Zach Nagengast on 4/20/23.
//

import CoreML
import Foundation
import SimilaritySearchKit

/// Embeddings provider backed by the `BGE_small` Core ML model.
///
/// A sentence is tokenized with a `BertTokenizer`, the resulting ids and attention
/// mask are fed to the model, and the model's `var_1059` output is returned as a
/// flat `[Float]`.
@available(macOS 12.0, iOS 15.0, *)
public class BGEEmbeddings: EmbeddingsProtocol {
    public let model: BGE_small
    public let tokenizer: BertTokenizer
    public let inputDimention: Int = 512
    public let outputDimention: Int = 384

    /// Loads the Core ML model and stores the tokenizer.
    ///
    /// - Parameter tokenizer: Tokenizer to use. When `nil`, a default `BertTokenizer()`
    ///   is created.
    ///
    /// Traps with `fatalError` if the model cannot be loaded; the initializer is
    /// non-failable, so there is no other way to report that to the caller.
    public init(tokenizer: BertTokenizer? = nil) {
        let modelConfig = MLModelConfiguration()
        modelConfig.computeUnits = .all

        do {
            self.model = try BGE_small(configuration: modelConfig)
        } catch {
            fatalError("Failed to load the Core ML model. Error: \(error.localizedDescription)")
        }

        self.tokenizer = tokenizer ?? BertTokenizer()
    }

    // MARK: - Dense Embeddings

    /// Tokenizes `sentence` and runs it through the model.
    ///
    /// - Parameter sentence: Text to embed.
    /// - Returns: The embedding vector, or `nil` if the model prediction failed.
    public func encode(sentence: String) async -> [Float]? {
        // Encode the input text as BERT tokens, then build the model input arrays.
        let inputTokens = Array(tokenizer.buildModelTokens(sentence: sentence))
        let (inputIds, attentionMask) = tokenizer.buildModelInputs(from: inputTokens)

        return generateEmbeddings(inputIds: inputIds, attentionMask: attentionMask)
    }

    /// Runs the Core ML model on already-tokenized input.
    ///
    /// - Parameters:
    ///   - inputIds: Token ids, passed to the model as `input_ids`.
    ///   - attentionMask: Attention mask, passed to the model as `attention_mask_1`.
    /// - Returns: Every element of the model's `var_1059` output converted to `Float`,
    ///   or `nil` if the prediction threw.
    public func generateEmbeddings(inputIds: MLMultiArray, attentionMask: MLMultiArray) -> [Float]? {
        let inputFeatures = BGE_smallInput(input_ids: inputIds, attention_mask_1: attentionMask)

        do {
            let output = try model.prediction(input: inputFeatures)
            let embeddings = output.var_1059
            return (0..<embeddings.count).map { embeddings[$0].floatValue }
        } catch {
            // Surface the underlying Core ML error instead of discarding it with `try?`.
            print("BGE embedding prediction failed: \(error)")
            return nil
        }
    }
}
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"fileFormatVersion": "1.0.0",
"itemInfoEntries": {
"A41C6F89-E177-41A5-9FAE-DE663D24D4CB": {
"author": "com.apple.CoreML",
"description": "CoreML Model Weights",
"name": "weights",
"path": "com.apple.CoreML/weights"
},
"FAEAF560-C419-4C5D-9E85-4EFF4005B14E": {
"author": "com.apple.CoreML",
"description": "CoreML Model Specification",
"name": "model.mlmodel",
"path": "com.apple.CoreML/model.mlmodel"
}
},
"rootModelIdentifier": "FAEAF560-C419-4C5D-9E85-4EFF4005B14E"
}
137 changes: 137 additions & 0 deletions Sources/SimilaritySearchKit/Core/Embeddings/Models/WebEmbeddings.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
////
//// WebEmbeddings.swift
////
////
//// Created by Michael Jelly on 28/11/23.
////
//

import Foundation
import SimilaritySearchKit


//
//@available(macOS 11.0, iOS 15.0, *)
//public
/// Embeddings provider that obtains vectors from a remote HTTP endpoint via `getEmbedding`.
class WebEmbeddings: EmbeddingsProtocol {
    // `tokenizer` and `model` exist only to satisfy `EmbeddingsProtocol`;
    // this class never reads them.
    var tokenizer: SimilaritySearchKit.NativeEmbeddings.TokenizerType

    var model: SimilaritySearchKit.NativeEmbeddings.ModelType

    typealias TokenizerType = NativeEmbeddings.TokenizerType

    typealias ModelType = NativeEmbeddings.ModelType

    /// Endpoint that `encode(sentence:)` passes to `getEmbedding`.
    let url: String

    /// - Parameter url: Endpoint of the embeddings service.
    public init(url: String) {
        // Initialized for convenience so the protocol itself does not have to change.
        self.tokenizer = NativeTokenizer()
        self.model = NativeEmbeddings().model
        self.url = url
    }

    // MARK: - Dense Embeddings

    /// Requests an embedding for `sentence` from the web endpoint.
    ///
    /// The response is expected to carry an array of embeddings under `"data"`
    /// (one per input text); since a single sentence is sent, the first entry is used.
    /// Elements that are neither numbers nor numeric strings are dropped.
    ///
    /// - Parameter sentence: Text to embed, sent as the `texts` option.
    /// - Returns: The embedding vector, or `nil` when the response has no usable
    ///   `"data"` entry.
    ///
    /// - Note: The call returns only once `getEmbedding` invokes its completion handler.
    func encode(sentence: String) async -> [Float]? {
        let endpoint = url

        return await withCheckedContinuation { (continuation: CheckedContinuation<[Float]?, Never>) in
            getEmbedding(url: endpoint, options: ["texts": sentence]) { result in
                guard
                    let embeddings = result["data"] as? NSArray,
                    // `firstObject` is nil for an empty array, whereas `[0]` would raise.
                    let firstEmbedding = embeddings.firstObject as? NSArray
                else {
                    if let error = result["error"] {
                        print("Error encoding:", error)
                    }
                    continuation.resume(returning: nil)
                    return
                }

                let vector: [Float] = firstEmbedding.compactMap { element -> Float? in
                    if let number = element as? NSNumber {
                        return number.floatValue
                    }
                    if let text = element as? String {
                        return Float(text)
                    }
                    return nil
                }

                continuation.resume(returning: vector)
            }
        }
    }
}

/// Sends a GET request to `url` with `options` appended as query parameters and hands
/// the decoded JSON object to `completion`.
///
/// `completion` is invoked exactly once. On success it receives the top-level JSON
/// object of the response. On any failure (malformed URL, transport error, response
/// that is not a JSON object) it receives `["error": <description>]`, so that callers
/// waiting on the result are never left hanging.
///
/// - Parameters:
///   - url: Endpoint of the embeddings service. It may already contain a query string.
///   - options: Key/value pairs appended to the query; values are string-interpolated
///     and percent-encoded.
///   - completion: Receives the decoded response or an error dictionary. It is called
///     from the `URLSession` callback, except for a malformed URL, where it is called
///     synchronously.
func getEmbedding(
    url: String, options: [String: Any]?, completion: @escaping (([String: Any]) -> Void)
) {
    let headers = [
        "Content-Type": "application/json",
        "Authorization": "Bearer token",  // Replace with the appropriate value
    ]

    // RFC 3986 "unreserved" characters. Everything else, including `&`, `=` and `+`,
    // must be escaped inside a query value or it would be read as query syntax.
    let queryValueAllowed = CharacterSet(
        charactersIn: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
    )

    // Renders the options as `&key=value&key=value…`.
    func generateOptionString(options: [String: Any]?) -> String {
        guard let options = options else { return "" }
        return options.map { key, value in
            let encoded = "\(value)".addingPercentEncoding(withAllowedCharacters: queryValueAllowed) ?? ""
            return "&\(key)=\(encoded)"
        }.joined()
    }

    var query = generateOptionString(options: options)
    // Start the query with `?` when the base URL does not have one yet.
    if !query.isEmpty, !url.contains("?") {
        query = "?" + String(query.dropFirst())
    }

    let fullURL = url + query
    guard let requestURL = URL(string: fullURL) else {
        print("Error: invalid URL \(fullURL)")
        completion(["error": "Invalid URL: \(fullURL)"])
        return
    }

    var request = URLRequest(url: requestURL)
    request.httpMethod = "GET"
    request.allHTTPHeaderFields = headers

    let task = URLSession.shared.dataTask(with: request) { data, _, error in
        guard let data = data, error == nil else {
            let message = error?.localizedDescription ?? "Unknown error"
            print("Error: \(message)")
            completion(["error": message])
            return
        }

        // Parse the result as JSON, since that is what the API provides.
        do {
            guard
                let json = try JSONSerialization.jsonObject(with: data, options: [])
                    as? [String: Any]
            else {
                let message = "Unable to parse JSON: top-level value is not an object"
                print(message)
                completion(["error": message])
                return
            }

            // Log responses that carry no embeddings to make server-side errors visible.
            if json["data"] == nil { print(json) }

            completion(json)
        } catch {
            let responseString = String(data: data, encoding: .utf8) ?? "Unable to decode response"
            print("Error parsing JSON: \(error)\nJSON:\n\(responseString)")
            completion(["error": "Error parsing JSON: \(error.localizedDescription)"])
        }
    }
    task.resume()
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@ public class BertTokenizer: TokenizerProtocol {
private let vocab: [String: Int]
private let ids_to_tokens: [Int: String]

public init() {
let url = Bundle.module.url(forResource: "bert_tokenizer_vocab", withExtension: "txt")!
let vocabTxt = try! String(contentsOf: url)
public init(vocabText: String? = nil) {
var vocabTxt = vocabText ?? "";
if vocabTxt.isEmpty {
let url = Bundle.module.url(forResource: "bert_tokenizer_vocab", withExtension: "txt")!
vocabTxt = try! String(contentsOf: url)
}
let tokens = vocabTxt.split(separator: "\n").map { String($0) }
var vocab: [String: Int] = [:]
var ids_to_tokens: [Int: String] = [:]
Expand Down
Loading