Skip to content

Commit

Permalink
feat: Add tokenize_tool with build support
Browse files Browse the repository at this point in the history
Branch: HFTokenizers

Signed-off-by: Gabe Goodhart <[email protected]>
  • Loading branch information
gabe-l-hart committed Dec 11, 2024
1 parent e286ba0 commit c1776a3
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 0 deletions.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ set(CMAKE_CXX_STANDARD 17)
project(Tokenizers)

option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)

# Ignore weak attribute warning
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")
Expand Down Expand Up @@ -83,3 +84,8 @@ if(TOKENIZERS_BUILD_TEST)
add_test(${test_name} "${test_name}")
endforeach()
endif()

# Build tools
if(TOKENIZERS_BUILD_TOOLS)
add_subdirectory(tools/tokenize_tool)
endif()
9 changes: 9 additions & 0 deletions tools/tokenize_tool/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
#
# This source code is licensed under the BSD-style license found in the LICENSE
# file in the root directory of this source tree.

file(GLOB source_files ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
get_filename_component(tool_name ${CMAKE_CURRENT_SOURCE_DIR} NAME)
add_executable(${tool_name} ${source_files})
target_link_libraries(${tool_name} PRIVATE tokenizers)
103 changes: 103 additions & 0 deletions tools/tokenize_tool/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

/**
* This is a simple tool to instantiate a tokenizer and run it over some text.
* It can be used to evaluate the tokenization done by a given tokenizer model
* relative to its native python library.
*/

// Standard
#include <iostream>
#include <memory>
#include <sstream>

// Local
#include "hf_tokenizer.h"
#include "sentencepiece.h"
#include "tiktoken.h"

using namespace tokenizers;

std::string help(char* argv[]) {
std::stringstream ss;
ss << "Usage: " << argv[0] << " <type> <model> <input to tokenize...>" << std::endl << std::endl;
ss << "Types:\n" << std::endl;
ss << "* sentencepiece: SPTokenizer" << std::endl;
ss << "* tiktoken: Tiktoken" << std::endl;
ss << "* hf_tokenizers: HFTokenizer" << std::endl;
return ss.str();
}

int main(int argc, char* argv[]) {

// Check for the right number of CLI args
if (argc < 4) {
std::cerr << help(argv) << std::endl;
return 1;
}

// Parse CLI args
const std::string tokenizer_type(argv[1]);
const std::string model_path(argv[2]);
std::stringstream prompt_ss;
for (auto i = 3; i < argc; ++i) {
if (i > 3) {
prompt_ss << " ";
}
prompt_ss << argv[i];
}
const std::string prompt = prompt_ss.str();

// Instantiate the tokenizer
std::unique_ptr<Tokenizer> tok_ptr;
if (tokenizer_type == "sentencepiece") {
tok_ptr.reset(new SPTokenizer());
} else if (tokenizer_type == "tiktoken") {
tok_ptr.reset(new Tiktoken());
} else if (tokenizer_type == "hf_tokenizer") {
tok_ptr.reset(new HFTokenizer());
} else {
std::stringstream ss;
ss << "ERROR: Invalid tokenizer type: " << tokenizer_type << std::endl << std::endl;
ss << help(argv);
std::cerr << ss.str() << std::endl;
return 1;
}

// Load from the path
tok_ptr->load(model_path);

// Log out the IDs for the BOS/EOS tokens
std::cout << "Vocab Size: " << tok_ptr->vocab_size() << std::endl;
std::cout << "BOS: " << tok_ptr->bos_tok() << std::endl;
std::cout << "EOS: " << tok_ptr->eos_tok() << std::endl << std::endl;

// Encode
std::cout << "PROMPT:" << std::endl << prompt << std::endl << std::endl;
std::cout << "Encoding..." << std::endl;
const auto encoded_result = tok_ptr->encode(prompt, 0, 0);
const auto encoded = encoded_result.get();
std::cout << "[";
for (const auto tok_id : encoded) {
std::cout << " " << tok_id;
}
std::cout << " ]" << std::endl << std::endl;

// Decode
std::cout << "Decoding..." << std::endl;
uint64_t prev = tok_ptr->bos_tok();
for (const auto& current : encoded) {
const auto decoded_result = tok_ptr->decode(prev, current);
std::cout << decoded_result.get();
prev = current;
}
std::cout << std::endl;

return 0;
}

0 comments on commit c1776a3

Please sign in to comment.