diff --git a/.changeset/five-queens-tan.md b/.changeset/five-queens-tan.md new file mode 100644 index 00000000..5be52afe --- /dev/null +++ b/.changeset/five-queens-tan.md @@ -0,0 +1,7 @@ +--- +"tiktoken": patch +"js-tiktoken": patch +"@dqbd/tiktoken": patch +--- + +Add support for gpt-4o, gpt-4o-mini, text-embedding-3-small and text-embedding-3-large diff --git a/js/src/core.ts b/js/src/core.ts index 556eab19..7aee83fc 100644 --- a/js/src/core.ts +++ b/js/src/core.ts @@ -271,11 +271,16 @@ export function getEncodingNameForModel(model: TiktokenModel) { case "gpt-4-turbo-2024-04-09": case "gpt-4-turbo-preview": case "gpt-4-0125-preview": - case "text-embedding-ada-002": { + case "text-embedding-ada-002": + case "text-embedding-3-small": + case "text-embedding-3-large": { return "cl100k_base"; } case "gpt-4o": - case "gpt-4o-2024-05-13": { + case "gpt-4o-2024-05-13": + case "gpt-4o-2024-08-06": + case "gpt-4o-mini-2024-07-18": + case "gpt-4o-mini": { return "o200k_base"; } default: diff --git a/tiktoken/model_to_encoding.json b/tiktoken/model_to_encoding.json index e9bfb424..e7890e2a 100644 --- a/tiktoken/model_to_encoding.json +++ b/tiktoken/model_to_encoding.json @@ -20,6 +20,8 @@ "text-davinci-edit-001": "p50k_edit", "code-davinci-edit-001": "p50k_edit", "text-embedding-ada-002": "cl100k_base", + "text-embedding-3-small": "cl100k_base", + "text-embedding-3-large": "cl100k_base", "text-similarity-davinci-001": "r50k_base", "text-similarity-curie-001": "r50k_base", "text-similarity-babbage-001": "r50k_base", @@ -54,5 +56,8 @@ "gpt-4-0125-preview": "cl100k_base", "gpt-4-vision-preview": "cl100k_base", "gpt-4o": "o200k_base", - "gpt-4o-2024-05-13": "o200k_base" + "gpt-4o-2024-05-13": "o200k_base", + "gpt-4o-2024-08-06":"o200k_base", + "gpt-4o-mini-2024-07-18": "o200k_base", + "gpt-4o-mini": "o200k_base" } diff --git a/wasm/src/lib.rs b/wasm/src/lib.rs index d0c1b4e0..72cb06b1 100644 --- a/wasm/src/lib.rs +++ b/wasm/src/lib.rs @@ -420,6 +420,9 @@ export type TiktokenModel = | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" + | "gpt-4o-2024-08-06" + | "gpt-4o-mini-2024-07-18" + | "gpt-4o-mini" /** * @param {TiktokenModel} encoding @@ -436,7 +439,6 @@ pub fn encoding_for_model( extend_special_tokens: JsValue, ) -> Result { let encoding = match model { - "davinci" => Ok("p50k_base"), "text-davinci-003" => Ok("p50k_base"), "text-davinci-002" => Ok("p50k_base"), "text-davinci-001" => Ok("r50k_base"), @@ -457,6 +459,8 @@ pub fn encoding_for_model( "text-davinci-edit-001" => Ok("p50k_edit"), "code-davinci-edit-001" => Ok("p50k_edit"), "text-embedding-ada-002" => Ok("cl100k_base"), + "text-embedding-3-small" => Ok("cl100k_base"), + "text-embedding-3-large" => Ok("cl100k_base"), "text-similarity-davinci-001" => Ok("r50k_base"), "text-similarity-curie-001" => Ok("r50k_base"), "text-similarity-babbage-001" => Ok("r50k_base"), @@ -492,6 +496,9 @@ pub fn encoding_for_model( "gpt-4-0125-preview" => Ok("cl100k_base"), "gpt-4o" => Ok("o200k_base"), "gpt-4o-2024-05-13" => Ok("o200k_base"), + "gpt-4o-2024-08-06" => Ok("o200k_base"), + "gpt-4o-mini-2024-07-18" => Ok("o200k_base"), + "gpt-4o-mini" => Ok("o200k_base"), model => Err(JsError::new( format!("Invalid model: {}", model.to_string()).as_str(), )),