From e761dcbca25047dc5ccc2e83442b763bf964a273 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 6 Aug 2024 18:56:48 -0400 Subject: [PATCH] backend: move more stuff into LlamaCppBackend Signed-off-by: Jared Van Bortel --- gpt4all-backend/CMakeLists.txt | 2 +- ...lmodel_shared.cpp => llamacpp_backend.cpp} | 409 ++++++++++++++++-- gpt4all-backend/llamacpp_backend.h | 187 ++++++++ gpt4all-backend/llamacpp_backend_impl.cpp | 10 +- gpt4all-backend/llamacpp_backend_impl.h | 10 +- gpt4all-backend/llmodel.cpp | 350 --------------- gpt4all-backend/llmodel.h | 212 +-------- gpt4all-backend/llmodel_c.cpp | 13 +- gpt4all-chat/chatapi.cpp | 23 +- gpt4all-chat/chatapi.h | 65 --- gpt4all-chat/chatllm.cpp | 71 +-- gpt4all-chat/chatllm.h | 11 +- gpt4all-chat/embllm.cpp | 12 +- gpt4all-chat/embllm.h | 4 +- gpt4all-chat/llm.cpp | 4 +- gpt4all-chat/main.cpp | 4 +- gpt4all-chat/modellist.cpp | 8 +- gpt4all-chat/mysettings.cpp | 8 +- gpt4all-chat/network.cpp | 4 +- 19 files changed, 660 insertions(+), 747 deletions(-) rename gpt4all-backend/{llmodel_shared.cpp => llamacpp_backend.cpp} (52%) create mode 100644 gpt4all-backend/llamacpp_backend.h delete mode 100644 gpt4all-backend/llmodel.cpp diff --git a/gpt4all-backend/CMakeLists.txt b/gpt4all-backend/CMakeLists.txt index 9a9fd5783f556..62c326bea9fa9 100644 --- a/gpt4all-backend/CMakeLists.txt +++ b/gpt4all-backend/CMakeLists.txt @@ -142,7 +142,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) endforeach() add_library(llmodel - llmodel.h llmodel.cpp llmodel_shared.cpp + llmodel.h llamacpp_backend.cpp llmodel_c.h llmodel_c.cpp dlhandle.cpp ) diff --git a/gpt4all-backend/llmodel_shared.cpp b/gpt4all-backend/llamacpp_backend.cpp similarity index 52% rename from gpt4all-backend/llmodel_shared.cpp rename to gpt4all-backend/llamacpp_backend.cpp index 7477254a74ff8..3d3ee1a230aa1 100644 --- a/gpt4all-backend/llmodel_shared.cpp +++ b/gpt4all-backend/llamacpp_backend.cpp @@ -1,20 +1,46 @@ -#include "llmodel.h" +#include "llamacpp_backend.h" + +#include "dlhandle.h" #include #include #include #include +#include +#include +#include #include #include +#include +#include #include #include #include #include #include +#include #include +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#endif + +#ifdef _MSC_VER +# include +#endif + +#if defined(__APPLE__) && defined(__aarch64__) +# include "sysinfo.h" // for getSystemTotalRAMInBytes +#endif + +namespace fs = std::filesystem; namespace ranges = std::ranges; + static bool parsePromptTemplate(const std::string &tmpl, std::vector &placeholders, std::string &err) { static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))"); @@ -38,15 +64,16 @@ static bool parsePromptTemplate(const std::string &tmpl, std::vector promptCallback, - std::function responseCallback, - bool allowContextShift, - PromptContext &promptCtx, - bool special, - std::string *fakeReply) -{ +void LlamaCppBackend::prompt( + const std::string &prompt, + const std::string &promptTemplate, + std::function promptCallback, + std::function responseCallback, + bool allowContextShift, + PromptContext &promptCtx, + bool special, + std::string *fakeReply +) { if (!isModelLoaded()) { std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n"; return; @@ -153,11 +180,13 @@ void LLModel::prompt(const std::string &prompt, } // returns false on error -bool LLModel::decodePrompt(std::function promptCallback, - std::function responseCallback, - bool allowContextShift, - 
PromptContext &promptCtx, - std::vector embd_inp) { +bool LlamaCppBackend::decodePrompt( + std::function promptCallback, + std::function responseCallback, + bool allowContextShift, + PromptContext &promptCtx, + std::vector embd_inp +) { if ((int) embd_inp.size() > promptCtx.n_ctx - 4) { responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed."); std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() << @@ -224,9 +253,11 @@ static std::string::size_type stringsOverlap(const std::string &s, const std::st return std::string::npos; } -void LLModel::generateResponse(std::function responseCallback, - bool allowContextShift, - PromptContext &promptCtx) { +void LlamaCppBackend::generateResponse( + std::function responseCallback, + bool allowContextShift, + PromptContext &promptCtx +) { static const char *stopSequences[] { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context", }; @@ -371,31 +402,327 @@ void LLModel::generateResponse(std::function promptCtx.n_past -= cachedTokens.size(); } -void LLModel::embed( - const std::vector &texts, float *embeddings, std::optional prefix, int dimensionality, - size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb +/* ********************************* + * Backend implementation management + * ********************************* */ + +#ifndef __APPLE__ +static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"}; +#elif defined(__aarch64__) +static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"}; +#else +static const std::string DEFAULT_BACKENDS[] = {"cpu"}; +#endif + +std::string s_implementations_search_path = "."; + +#if !(defined(__x86_64__) || defined(_M_X64)) + // irrelevant on non-x86_64 + #define cpu_supports_avx() -1 + #define cpu_supports_avx2() -1 +#elif defined(_MSC_VER) + // MSVC + static int get_cpu_info(int func_id, int reg_id) { + int info[4]; + __cpuid(info, func_id); + return info[reg_id]; + } + + // AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX + #define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28)) + // AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX + #define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5)) +#else + // gcc/clang + #define cpu_supports_avx() !!__builtin_cpu_supports("avx") + #define cpu_supports_avx2() !!__builtin_cpu_supports("avx2") +#endif + +LlamaCppBackend::Implementation::Implementation(Dlhandle &&dlhandle_) + : m_dlhandle(new Dlhandle(std::move(dlhandle_))) { + auto get_model_type = m_dlhandle->get("get_model_type"); + assert(get_model_type); + m_modelType = get_model_type(); + auto get_build_variant = m_dlhandle->get("get_build_variant"); + assert(get_build_variant); + m_buildVariant = get_build_variant(); + m_getFileArch = m_dlhandle->get("get_file_arch"); + assert(m_getFileArch); + m_isArchSupported = m_dlhandle->get("is_arch_supported"); + assert(m_isArchSupported); + m_construct = m_dlhandle->get("construct"); + assert(m_construct); +} + +LlamaCppBackend::Implementation::Implementation(Implementation &&o) + : m_getFileArch(o.m_getFileArch) + , m_isArchSupported(o.m_isArchSupported) + , m_construct(o.m_construct) + , m_modelType(o.m_modelType) + , m_buildVariant(o.m_buildVariant) + , m_dlhandle(o.m_dlhandle) { + o.m_dlhandle = nullptr; +} + +LlamaCppBackend::Implementation::~Implementation() +{ + delete m_dlhandle; +} + +static bool isImplementation(const Dlhandle &dl) +{ + return dl.get("is_g4a_backend_model_implementation"); +} + 
+// Add the CUDA Toolkit to the DLL search path on Windows. +// This is necessary for chat.exe to find CUDA when started from Qt Creator. +static void addCudaSearchPath() +{ +#ifdef _WIN32 + if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) { + auto libDir = std::wstring(cudaPath) + L"\\bin"; + if (!AddDllDirectory(libDir.c_str())) { + auto err = GetLastError(); + std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n"; + } + } +#endif +} + +const std::vector &LlamaCppBackend::Implementation::implementationList() +{ + if (cpu_supports_avx() == 0) { + throw std::runtime_error("CPU does not support AVX"); + } + + // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the + // individual models without the cleanup of the static list interfering + static auto* libs = new std::vector([] () { + std::vector fres; + + addCudaSearchPath(); + + std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)"; + if (cpu_supports_avx2() == 0) { + impl_name_re += "-avxonly"; + } + std::regex re(impl_name_re); + auto search_in_directory = [&](const std::string& paths) { + std::stringstream ss(paths); + std::string path; + // Split the paths string by the delimiter and process each path. + while (std::getline(ss, path, ';')) { + std::u8string u8_path(path.begin(), path.end()); + // Iterate over all libraries + for (const auto &f : fs::directory_iterator(u8_path)) { + const fs::path &p = f.path(); + + if (p.extension() != LIB_FILE_EXT) continue; + if (!std::regex_search(p.stem().string(), re)) { + std::cerr << "did not match regex: " << p.stem().string() << "\n"; + continue; + } + + // Add to list if model implementation + Dlhandle dl; + try { + dl = Dlhandle(p); + } catch (const Dlhandle::Exception &e) { + std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n"; + continue; + } + if (!isImplementation(dl)) { + std::cerr << "Not an implementation: " << p.filename().string() << "\n"; + continue; + } + fres.emplace_back(Implementation(std::move(dl))); + } + } + }; + + search_in_directory(s_implementations_search_path); + + return fres; + }()); + // Return static result + return *libs; +} + +static std::string applyCPUVariant(const std::string &buildVariant) +{ + if (buildVariant != "metal" && cpu_supports_avx2() == 0) { + return buildVariant + "-avxonly"; + } + return buildVariant; +} + +const LlamaCppBackend::Implementation* LlamaCppBackend::Implementation::implementation( + const char *fname, + const std::string& buildVariant ) { - (void)texts; - (void)embeddings; - (void)prefix; - (void)dimensionality; - (void)tokenCount; - (void)doMean; - (void)atlas; - (void)cancelCb; - throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings"); + bool buildVariantMatched = false; + std::optional archName; + for (const auto& i : implementationList()) { + if (buildVariant != i.m_buildVariant) continue; + buildVariantMatched = true; + + char *arch = i.m_getFileArch(fname); + if (!arch) continue; + archName = arch; + + bool archSupported = i.m_isArchSupported(arch); + free(arch); + if (archSupported) return &i; + } + + if (!buildVariantMatched) + return nullptr; + if (!archName) + throw UnsupportedModelError("Unsupported file format"); + + throw BadArchError(std::move(*archName)); } -void LLModel::embed( - const std::vector &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount, - bool doMean, bool atlas +LlamaCppBackend 
*LlamaCppBackend::Implementation::construct( + const std::string &modelPath, + const std::string &backend, + int n_ctx ) { - (void)texts; - (void)embeddings; - (void)isRetrieval; - (void)dimensionality; - (void)tokenCount; - (void)doMean; - (void)atlas; - throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings"); + std::vector desiredBackends; + if (backend != "auto") { + desiredBackends.push_back(backend); + } else { + desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); + } + + for (const auto &desiredBackend: desiredBackends) { + const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend)); + + if (impl) { + // Construct llmodel implementation + auto *fres = impl->m_construct(); + fres->m_implementation = impl; + +#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs + /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at + * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in + * most (all?) places where this is called, causing underestimation of required + * memory. */ + if (backend == "auto" && desiredBackend == "metal") { + // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not + size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100); + if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) { + delete fres; + continue; + } + } +#else + (void)n_ctx; +#endif + + return fres; + } + } + + throw MissingImplementationError("Could not find any implementations for backend: " + backend); +} + +LlamaCppBackend *LlamaCppBackend::Implementation::constructGlobalLlama(const std::optional &backend) +{ + static std::unordered_map> implCache; + + const std::vector *impls; + try { + impls = &implementationList(); + } catch (const std::runtime_error &e) { + std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n"; + return nullptr; + } + + std::vector desiredBackends; + if (backend) { + desiredBackends.push_back(backend.value()); + } else { + desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); + } + + const Implementation *impl = nullptr; + + for (const auto &desiredBackend: desiredBackends) { + auto cacheIt = implCache.find(desiredBackend); + if (cacheIt != implCache.end()) + return cacheIt->second.get(); // cached + + for (const auto &i: *impls) { + if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) { + impl = &i; + break; + } + } + + if (impl) { + auto *fres = impl->m_construct(); + fres->m_implementation = impl; + implCache[desiredBackend] = std::unique_ptr(fres); + return fres; + } + } + + std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") + << "\n"; + return nullptr; +} + +std::vector LlamaCppBackend::Implementation::availableGPUDevices(size_t memoryRequired) +{ + std::vector devices; +#ifndef __APPLE__ + static const std::string backends[] = {"kompute", "cuda"}; + for (const auto &backend: backends) { + auto *llama = constructGlobalLlama(backend); + if (llama) { + auto backendDevs = llama->availableGPUDevices(memoryRequired); + devices.insert(devices.end(), backendDevs.begin(), backendDevs.end()); + } + } +#endif + return devices; +} + +int32_t LlamaCppBackend::Implementation::maxContextLength(const std::string &modelPath) +{ + auto *llama = constructGlobalLlama(); + return llama ? 
llama->maxContextLength(modelPath) : -1; +} + +int32_t LlamaCppBackend::Implementation::layerCount(const std::string &modelPath) +{ + auto *llama = constructGlobalLlama(); + return llama ? llama->layerCount(modelPath) : -1; +} + +bool LlamaCppBackend::Implementation::isEmbeddingModel(const std::string &modelPath) +{ + auto *llama = constructGlobalLlama(); + return llama && llama->isEmbeddingModel(modelPath); +} + +void LlamaCppBackend::Implementation::setImplementationsSearchPath(const std::string& path) +{ + s_implementations_search_path = path; +} + +const std::string& LlamaCppBackend::Implementation::implementationsSearchPath() +{ + return s_implementations_search_path; +} + +bool LlamaCppBackend::Implementation::hasSupportedCPU() +{ + return cpu_supports_avx() != 0; +} + +int LlamaCppBackend::Implementation::cpuSupportsAVX2() +{ + return cpu_supports_avx2(); } diff --git a/gpt4all-backend/llamacpp_backend.h b/gpt4all-backend/llamacpp_backend.h new file mode 100644 index 0000000000000..d04ec7d39d998 --- /dev/null +++ b/gpt4all-backend/llamacpp_backend.h @@ -0,0 +1,187 @@ +#pragma once + +#include "llmodel.h" + +class LlamaCppBackend : public EmbLLModel { +public: + class BadArchError: public std::runtime_error { + public: + BadArchError(std::string arch) + : runtime_error("Unsupported model architecture: " + arch) + , m_arch(std::move(arch)) + {} + + const std::string &arch() const noexcept { return m_arch; } + + private: + std::string m_arch; + }; + + class MissingImplementationError: public std::runtime_error { + public: + using std::runtime_error::runtime_error; + }; + + class UnsupportedModelError: public std::runtime_error { + public: + using std::runtime_error::runtime_error; + }; + + struct GPUDevice { + const char *backend; + int index; + int type; + size_t heapSize; + std::string name; + std::string vendor; + + GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor): + backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)), + vendor(std::move(vendor)) {} + + std::string selectionName() const + { + assert(backend == "cuda"s || backend == "kompute"s); + return backendName() + ": " + name; + } + + std::string backendName() const { return backendIdToName(backend); } + + static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); } + + static std::string updateSelectionName(const std::string &name) { + if (name == "Auto" || name == "CPU" || name == "Metal") + return name; + auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) { + return name.starts_with(entry.second + ": "); + }); + if (it != s_backendNames.end()) + return name; + return "Vulkan: " + name; // previously, there were only Vulkan devices + } + + private: + static inline const std::unordered_map s_backendNames { + {"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"}, + }; + }; + + class Implementation { + public: + Implementation(const Implementation &) = delete; + Implementation(Implementation &&); + ~Implementation(); + + std::string_view modelType() const { return m_modelType; } + std::string_view buildVariant() const { return m_buildVariant; } + + static LlamaCppBackend *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048); + static std::vector availableGPUDevices(size_t memoryRequired = 0); + static int32_t maxContextLength(const std::string &modelPath); + static int32_t layerCount(const 
std::string &modelPath); + static bool isEmbeddingModel(const std::string &modelPath); + static void setImplementationsSearchPath(const std::string &path); + static const std::string &implementationsSearchPath(); + static bool hasSupportedCPU(); + // 0 for no, 1 for yes, -1 for non-x86_64 + static int cpuSupportsAVX2(); + + private: + Implementation(Dlhandle &&); + + static const std::vector &implementationList(); + static const Implementation *implementation(const char *fname, const std::string &buildVariant); + static LlamaCppBackend *constructGlobalLlama(const std::optional &backend = std::nullopt); + + char *(*m_getFileArch)(const char *fname); + bool (*m_isArchSupported)(const char *arch); + LlamaCppBackend *(*m_construct)(); + + std::string_view m_modelType; + std::string_view m_buildVariant; + Dlhandle *m_dlhandle; + }; + + using ProgressCallback = std::function; + + virtual bool isModelBlacklisted(const std::string &modelPath) const = 0; + virtual bool isEmbeddingModel(const std::string &modelPath) const = 0; + virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0; + + void prompt(const std::string &prompt, + const std::string &promptTemplate, + std::function promptCallback, + std::function responseCallback, + bool allowContextShift, + PromptContext &ctx, + bool special = false, + std::string *fakeReply = nullptr) override; + + virtual void setThreadCount(int32_t n_threads) { (void)n_threads; } + virtual int32_t threadCount() const { return 1; } + + const Implementation &implementation() const { return *m_implementation; } + + virtual std::vector availableGPUDevices(size_t memoryRequired) const + { + (void)memoryRequired; + return {}; + } + + virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const + { + (void)memoryRequired; + (void)name; + return false; + } + + virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const + { + (void)device; + if (unavail_reason) { + *unavail_reason = "model has no GPU support"; + } + return false; + } + + virtual bool usingGPUDevice() const { return false; } + virtual const char *backendName() const { return "cpu"; } + virtual const char *gpuDeviceName() const { return nullptr; } + + void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; } + +protected: + virtual std::vector tokenize(PromptContext &ctx, const std::string &str, bool special = false) = 0; + virtual bool isSpecialToken(Token id) const = 0; + virtual std::string tokenToString(Token id) const = 0; + virtual Token sampleToken(PromptContext &ctx) const = 0; + virtual bool evalTokens(PromptContext &ctx, const std::vector &tokens) const = 0; + virtual void shiftContext(PromptContext &promptCtx) = 0; + virtual int32_t contextLength() const = 0; + virtual const std::vector &endTokens() const = 0; + virtual bool shouldAddBOS() const = 0; + + virtual int32_t maxContextLength(std::string const &modelPath) const = 0; + virtual int32_t layerCount(std::string const &modelPath) const = 0; + + static bool staticProgressCallback(float progress, void* ctx) + { + LlamaCppBackend *model = static_cast(ctx); + if (model && model->m_progressCallback) + return model->m_progressCallback(progress); + return true; + } + + bool decodePrompt(std::function promptCallback, + std::function responseCallback, + bool allowContextShift, + PromptContext &promptCtx, + std::vector embd_inp); + void generateResponse(std::function responseCallback, + bool allowContextShift, + PromptContext &promptCtx); + + 
const Implementation *m_implementation = nullptr; + ProgressCallback m_progressCallback; + Token m_tokenize_last_token = -1; +}; diff --git a/gpt4all-backend/llamacpp_backend_impl.cpp b/gpt4all-backend/llamacpp_backend_impl.cpp index aece51c6bb788..0ace53bb50ac7 100644 --- a/gpt4all-backend/llamacpp_backend_impl.cpp +++ b/gpt4all-backend/llamacpp_backend_impl.cpp @@ -378,7 +378,7 @@ bool LlamaCppBackendImpl::loadModel(const std::string &modelPath, int n_ctx, int d_ptr->model_params.use_mlock = params.use_mlock; #endif - d_ptr->model_params.progress_callback = &LLModel::staticProgressCallback; + d_ptr->model_params.progress_callback = &LlamaCppBackend::staticProgressCallback; d_ptr->model_params.progress_callback_user_data = this; d_ptr->backend_name = "cpu"; // default @@ -659,7 +659,7 @@ static const char *getVulkanVendorName(uint32_t vendorID) } #endif -std::vector LlamaCppBackendImpl::availableGPUDevices(size_t memoryRequired) const +std::vector LlamaCppBackendImpl::availableGPUDevices(size_t memoryRequired) const { #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA) size_t count = 0; @@ -675,7 +675,7 @@ std::vector LlamaCppBackendImpl::availableGPUDevices(size_t #endif if (lcppDevices) { - std::vector devices; + std::vector devices; devices.reserve(count); for (size_t i = 0; i < count; ++i) { @@ -909,7 +909,7 @@ void LlamaCppBackendImpl::embed( void LlamaCppBackendImpl::embed( const std::vector &texts, float *embeddings, std::optional prefix, int dimensionality, - size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb + size_t *tokenCount, bool doMean, bool atlas, EmbLLModel::EmbedCancelCallback *cancelCb ) { if (!d_ptr->model) throw std::logic_error("no model is loaded"); @@ -967,7 +967,7 @@ double getL2NormScale(T *start, T *end) void LlamaCppBackendImpl::embedInternal( const std::vector &texts, float *embeddings, std::string prefix, int dimensionality, - size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb, const EmbModelSpec *spec + size_t *tokenCount, bool doMean, bool atlas, EmbLLModel::EmbedCancelCallback *cancelCb, const EmbModelSpec *spec ) { typedef std::vector TokenString; static constexpr int32_t atlasMaxLength = 8192; diff --git a/gpt4all-backend/llamacpp_backend_impl.h b/gpt4all-backend/llamacpp_backend_impl.h index 5923572f961e7..7ed73c579d42b 100644 --- a/gpt4all-backend/llamacpp_backend_impl.h +++ b/gpt4all-backend/llamacpp_backend_impl.h @@ -1,10 +1,10 @@ +#pragma once + #ifndef LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE #error This file is NOT meant to be included outside of llamacpp_backend_impl.cpp. Doing so is DANGEROUS. 
Be sure to know what you are doing before proceeding to #define LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE #endif -#ifndef LLAMACPP_BACKEND_IMPL_H -#define LLAMACPP_BACKEND_IMPL_H -#include "llmodel.h" +#include "llamacpp_backend.h" #include #include @@ -13,7 +13,7 @@ struct LlamaPrivate; struct EmbModelSpec; -class LlamaCppBackendImpl : public LLModel { +class LlamaCppBackendImpl : public LlamaCppBackend { public: LlamaCppBackendImpl(); ~LlamaCppBackendImpl(); @@ -68,5 +68,3 @@ class LlamaCppBackendImpl : public LLModel { size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb, const EmbModelSpec *spec); }; - -#endif // LLAMACPP_BACKEND_IMPL_H diff --git a/gpt4all-backend/llmodel.cpp b/gpt4all-backend/llmodel.cpp deleted file mode 100644 index 7b18004aa10b4..0000000000000 --- a/gpt4all-backend/llmodel.cpp +++ /dev/null @@ -1,350 +0,0 @@ -#include "llmodel.h" - -#include "dlhandle.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WIN32 -# define WIN32_LEAN_AND_MEAN -# ifndef NOMINMAX -# define NOMINMAX -# endif -# include -#endif - -#ifdef _MSC_VER -# include -#endif - -#if defined(__APPLE__) && defined(__aarch64__) -# include "sysinfo.h" // for getSystemTotalRAMInBytes -#endif - -namespace fs = std::filesystem; - -#ifndef __APPLE__ -static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"}; -#elif defined(__aarch64__) -static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"}; -#else -static const std::string DEFAULT_BACKENDS[] = {"cpu"}; -#endif - -std::string s_implementations_search_path = "."; - -#if !(defined(__x86_64__) || defined(_M_X64)) - // irrelevant on non-x86_64 - #define cpu_supports_avx() -1 - #define cpu_supports_avx2() -1 -#elif defined(_MSC_VER) - // MSVC - static int get_cpu_info(int func_id, int reg_id) { - int info[4]; - __cpuid(info, func_id); - return info[reg_id]; - } - - // AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX - #define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28)) - // AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX - #define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5)) -#else - // gcc/clang - #define cpu_supports_avx() !!__builtin_cpu_supports("avx") - #define cpu_supports_avx2() !!__builtin_cpu_supports("avx2") -#endif - -LLModel::Implementation::Implementation(Dlhandle &&dlhandle_) - : m_dlhandle(new Dlhandle(std::move(dlhandle_))) { - auto get_model_type = m_dlhandle->get("get_model_type"); - assert(get_model_type); - m_modelType = get_model_type(); - auto get_build_variant = m_dlhandle->get("get_build_variant"); - assert(get_build_variant); - m_buildVariant = get_build_variant(); - m_getFileArch = m_dlhandle->get("get_file_arch"); - assert(m_getFileArch); - m_isArchSupported = m_dlhandle->get("is_arch_supported"); - assert(m_isArchSupported); - m_construct = m_dlhandle->get("construct"); - assert(m_construct); -} - -LLModel::Implementation::Implementation(Implementation &&o) - : m_getFileArch(o.m_getFileArch) - , m_isArchSupported(o.m_isArchSupported) - , m_construct(o.m_construct) - , m_modelType(o.m_modelType) - , m_buildVariant(o.m_buildVariant) - , m_dlhandle(o.m_dlhandle) { - o.m_dlhandle = nullptr; -} - -LLModel::Implementation::~Implementation() -{ - delete m_dlhandle; -} - -static bool isImplementation(const Dlhandle &dl) -{ - return dl.get("is_g4a_backend_model_implementation"); -} - -// Add the CUDA Toolkit to the DLL search path on Windows. 
-// This is necessary for chat.exe to find CUDA when started from Qt Creator. -static void addCudaSearchPath() -{ -#ifdef _WIN32 - if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) { - auto libDir = std::wstring(cudaPath) + L"\\bin"; - if (!AddDllDirectory(libDir.c_str())) { - auto err = GetLastError(); - std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n"; - } - } -#endif -} - -const std::vector &LLModel::Implementation::implementationList() -{ - if (cpu_supports_avx() == 0) { - throw std::runtime_error("CPU does not support AVX"); - } - - // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the - // individual models without the cleanup of the static list interfering - static auto* libs = new std::vector([] () { - std::vector fres; - - addCudaSearchPath(); - - std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)"; - if (cpu_supports_avx2() == 0) { - impl_name_re += "-avxonly"; - } - std::regex re(impl_name_re); - auto search_in_directory = [&](const std::string& paths) { - std::stringstream ss(paths); - std::string path; - // Split the paths string by the delimiter and process each path. - while (std::getline(ss, path, ';')) { - std::u8string u8_path(path.begin(), path.end()); - // Iterate over all libraries - for (const auto &f : fs::directory_iterator(u8_path)) { - const fs::path &p = f.path(); - - if (p.extension() != LIB_FILE_EXT) continue; - if (!std::regex_search(p.stem().string(), re)) { - std::cerr << "did not match regex: " << p.stem().string() << "\n"; - continue; - } - - // Add to list if model implementation - Dlhandle dl; - try { - dl = Dlhandle(p); - } catch (const Dlhandle::Exception &e) { - std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n"; - continue; - } - if (!isImplementation(dl)) { - std::cerr << "Not an implementation: " << p.filename().string() << "\n"; - continue; - } - fres.emplace_back(Implementation(std::move(dl))); - } - } - }; - - search_in_directory(s_implementations_search_path); - - return fres; - }()); - // Return static result - return *libs; -} - -static std::string applyCPUVariant(const std::string &buildVariant) -{ - if (buildVariant != "metal" && cpu_supports_avx2() == 0) { - return buildVariant + "-avxonly"; - } - return buildVariant; -} - -const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) -{ - bool buildVariantMatched = false; - std::optional archName; - for (const auto& i : implementationList()) { - if (buildVariant != i.m_buildVariant) continue; - buildVariantMatched = true; - - char *arch = i.m_getFileArch(fname); - if (!arch) continue; - archName = arch; - - bool archSupported = i.m_isArchSupported(arch); - free(arch); - if (archSupported) return &i; - } - - if (!buildVariantMatched) - return nullptr; - if (!archName) - throw UnsupportedModelError("Unsupported file format"); - - throw BadArchError(std::move(*archName)); -} - -LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx) -{ - std::vector desiredBackends; - if (backend != "auto") { - desiredBackends.push_back(backend); - } else { - desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); - } - - for (const auto &desiredBackend: desiredBackends) { - const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend)); - - if (impl) { - // Construct llmodel 
implementation - auto *fres = impl->m_construct(); - fres->m_implementation = impl; - -#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs - /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at - * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in - * most (all?) places where this is called, causing underestimation of required - * memory. */ - if (backend == "auto" && desiredBackend == "metal") { - // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not - size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100); - if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) { - delete fres; - continue; - } - } -#else - (void)n_ctx; -#endif - - return fres; - } - } - - throw MissingImplementationError("Could not find any implementations for backend: " + backend); -} - -LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional &backend) -{ - static std::unordered_map> implCache; - - const std::vector *impls; - try { - impls = &implementationList(); - } catch (const std::runtime_error &e) { - std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n"; - return nullptr; - } - - std::vector desiredBackends; - if (backend) { - desiredBackends.push_back(backend.value()); - } else { - desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); - } - - const Implementation *impl = nullptr; - - for (const auto &desiredBackend: desiredBackends) { - auto cacheIt = implCache.find(desiredBackend); - if (cacheIt != implCache.end()) - return cacheIt->second.get(); // cached - - for (const auto &i: *impls) { - if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) { - impl = &i; - break; - } - } - - if (impl) { - auto *fres = impl->m_construct(); - fres->m_implementation = impl; - implCache[desiredBackend] = std::unique_ptr(fres); - return fres; - } - } - - std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n"; - return nullptr; -} - -std::vector LLModel::Implementation::availableGPUDevices(size_t memoryRequired) -{ - std::vector devices; -#ifndef __APPLE__ - static const std::string backends[] = {"kompute", "cuda"}; - for (const auto &backend: backends) { - auto *llama = constructGlobalLlama(backend); - if (llama) { - auto backendDevs = llama->availableGPUDevices(memoryRequired); - devices.insert(devices.end(), backendDevs.begin(), backendDevs.end()); - } - } -#endif - return devices; -} - -int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) -{ - auto *llama = constructGlobalLlama(); - return llama ? llama->maxContextLength(modelPath) : -1; -} - -int32_t LLModel::Implementation::layerCount(const std::string &modelPath) -{ - auto *llama = constructGlobalLlama(); - return llama ? 
llama->layerCount(modelPath) : -1; -} - -bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) -{ - auto *llama = constructGlobalLlama(); - return llama && llama->isEmbeddingModel(modelPath); -} - -void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) -{ - s_implementations_search_path = path; -} - -const std::string& LLModel::Implementation::implementationsSearchPath() -{ - return s_implementations_search_path; -} - -bool LLModel::Implementation::hasSupportedCPU() -{ - return cpu_supports_avx() != 0; -} - -int LLModel::Implementation::cpuSupportsAVX2() -{ - return cpu_supports_avx2(); -} diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h index 04a510dc740f2..83c559ff779cd 100644 --- a/gpt4all-backend/llmodel.h +++ b/gpt4all-backend/llmodel.h @@ -1,5 +1,4 @@ -#ifndef LLMODEL_H -#define LLMODEL_H +#pragma once #include #include @@ -24,104 +23,6 @@ class LLModel { public: using Token = int32_t; - class BadArchError: public std::runtime_error { - public: - BadArchError(std::string arch) - : runtime_error("Unsupported model architecture: " + arch) - , m_arch(std::move(arch)) - {} - - const std::string &arch() const noexcept { return m_arch; } - - private: - std::string m_arch; - }; - - class MissingImplementationError: public std::runtime_error { - public: - using std::runtime_error::runtime_error; - }; - - class UnsupportedModelError: public std::runtime_error { - public: - using std::runtime_error::runtime_error; - }; - - struct GPUDevice { - const char *backend; - int index; - int type; - size_t heapSize; - std::string name; - std::string vendor; - - GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor): - backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)), - vendor(std::move(vendor)) {} - - std::string selectionName() const - { - assert(backend == "cuda"s || backend == "kompute"s); - return backendName() + ": " + name; - } - - std::string backendName() const { return backendIdToName(backend); } - - static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); } - - static std::string updateSelectionName(const std::string &name) { - if (name == "Auto" || name == "CPU" || name == "Metal") - return name; - auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) { - return name.starts_with(entry.second + ": "); - }); - if (it != s_backendNames.end()) - return name; - return "Vulkan: " + name; // previously, there were only Vulkan devices - } - - private: - static inline const std::unordered_map s_backendNames { - {"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"}, - }; - }; - - class Implementation { - public: - Implementation(const Implementation &) = delete; - Implementation(Implementation &&); - ~Implementation(); - - std::string_view modelType() const { return m_modelType; } - std::string_view buildVariant() const { return m_buildVariant; } - - static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048); - static std::vector availableGPUDevices(size_t memoryRequired = 0); - static int32_t maxContextLength(const std::string &modelPath); - static int32_t layerCount(const std::string &modelPath); - static bool isEmbeddingModel(const std::string &modelPath); - static void setImplementationsSearchPath(const std::string &path); - static const std::string &implementationsSearchPath(); - 
static bool hasSupportedCPU(); - // 0 for no, 1 for yes, -1 for non-x86_64 - static int cpuSupportsAVX2(); - - private: - Implementation(Dlhandle &&); - - static const std::vector &implementationList(); - static const Implementation *implementation(const char *fname, const std::string &buildVariant); - static LLModel *constructGlobalLlama(const std::optional &backend = std::nullopt); - - char *(*m_getFileArch)(const char *fname); - bool (*m_isArchSupported)(const char *arch); - LLModel *(*m_construct)(); - - std::string_view m_modelType; - std::string_view m_buildVariant; - Dlhandle *m_dlhandle; - }; - struct PromptContext { std::vector tokens; // current tokens in the context window int32_t n_past = 0; // number of tokens in past conversation @@ -137,18 +38,11 @@ class LLModel { float contextErase = 0.5f; // percent of context to erase if we exceed the context window }; - using ProgressCallback = std::function; - - explicit LLModel() {} virtual ~LLModel() {} - virtual bool supportsEmbedding() const = 0; - virtual bool supportsCompletion() const = 0; + virtual bool supportsCompletion() const { return true; } virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0; - virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; }; - virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; } virtual bool isModelLoaded() const = 0; - virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0; virtual size_t stateSize() const { return 0; } virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; } virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; } @@ -162,101 +56,25 @@ class LLModel { bool allowContextShift, PromptContext &ctx, bool special = false, - std::string *fakeReply = nullptr); + std::string *fakeReply = nullptr) = 0; +protected: + explicit LLModel() {} +}; + +class EmbLLModel: virtual public LLModel { +public: using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend); - virtual size_t embeddingSize() const { - throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings"); - } + virtual bool supportsCompletion() const = 0; + virtual bool supportsEmbedding() const = 0; + virtual size_t embeddingSize() const = 0; + // user-specified prefix virtual void embed(const std::vector &texts, float *embeddings, std::optional prefix, int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false, - EmbedCancelCallback *cancelCb = nullptr); + EmbedCancelCallback *cancelCb = nullptr) = 0; // automatic prefix virtual void embed(const std::vector &texts, float *embeddings, bool isRetrieval, - int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false); - - virtual void setThreadCount(int32_t n_threads) { (void)n_threads; } - virtual int32_t threadCount() const { return 1; } - - const Implementation &implementation() const { - return *m_implementation; - } - - virtual std::vector availableGPUDevices(size_t memoryRequired) const { - (void)memoryRequired; - return {}; - } - - virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const { - (void)memoryRequired; - (void)name; - return false; - } - - virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const { - (void)device; - if (unavail_reason) { - *unavail_reason = "model has no GPU support"; - } - return 
false; - } - - virtual bool usingGPUDevice() const { return false; } - virtual const char *backendName() const { return "cpu"; } - virtual const char *gpuDeviceName() const { return nullptr; } - - void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; } - -protected: - // These are pure virtual because subclasses need to implement as the default implementation of - // 'prompt' above calls these functions - virtual std::vector tokenize(PromptContext &ctx, const std::string &str, bool special = false) = 0; - virtual bool isSpecialToken(Token id) const = 0; - virtual std::string tokenToString(Token id) const = 0; - virtual Token sampleToken(PromptContext &ctx) const = 0; - virtual bool evalTokens(PromptContext &ctx, const std::vector &tokens) const = 0; - virtual void shiftContext(PromptContext &promptCtx) = 0; - virtual int32_t contextLength() const = 0; - virtual const std::vector &endTokens() const = 0; - virtual bool shouldAddBOS() const = 0; - - virtual int32_t maxContextLength(std::string const &modelPath) const - { - (void)modelPath; - return -1; - } - - virtual int32_t layerCount(std::string const &modelPath) const - { - (void)modelPath; - return -1; - } - - const Implementation *m_implementation = nullptr; - - ProgressCallback m_progressCallback; - static bool staticProgressCallback(float progress, void* ctx) - { - LLModel* model = static_cast(ctx); - if (model && model->m_progressCallback) - return model->m_progressCallback(progress); - return true; - } - - bool decodePrompt(std::function promptCallback, - std::function responseCallback, - bool allowContextShift, - PromptContext &promptCtx, - std::vector embd_inp); - void generateResponse(std::function responseCallback, - bool allowContextShift, - PromptContext &promptCtx); - - Token m_tokenize_last_token = -1; // not serialized - - friend class LLMImplementation; + int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) = 0; }; - -#endif // LLMODEL_H diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp index f3fd68ffa69c4..f4adf1c3fa876 100644 --- a/gpt4all-backend/llmodel_c.cpp +++ b/gpt4all-backend/llmodel_c.cpp @@ -1,5 +1,6 @@ #include "llmodel_c.h" +#include "llamacpp_backend.h" #include "llmodel.h" #include @@ -15,7 +16,7 @@ #include struct LLModelWrapper { - LLModel *llModel = nullptr; + LlamaCppBackend *llModel = nullptr; LLModel::PromptContext promptContext; ~LLModelWrapper() { delete llModel; } }; @@ -41,9 +42,9 @@ static void llmodel_set_error(const char **errptr, const char *message) llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error) { - LLModel *llModel; + LlamaCppBackend *llModel; try { - llModel = LLModel::Implementation::construct(model_path, backend); + llModel = LlamaCppBackend::Implementation::construct(model_path, backend); } catch (const std::exception& e) { llmodel_set_error(error, e.what()); return nullptr; @@ -214,12 +215,12 @@ int32_t llmodel_threadCount(llmodel_model model) void llmodel_set_implementation_search_path(const char *path) { - LLModel::Implementation::setImplementationsSearchPath(path); + LlamaCppBackend::Implementation::setImplementationsSearchPath(path); } const char *llmodel_get_implementation_search_path() { - return LLModel::Implementation::implementationsSearchPath().c_str(); + return LlamaCppBackend::Implementation::implementationsSearchPath().c_str(); } // RAII wrapper around a C-style struct @@ -244,7 +245,7 @@ struct llmodel_gpu_device 
*llmodel_available_gpu_devices(size_t memoryRequired, { static thread_local std::unique_ptr c_devices; - auto devices = LLModel::Implementation::availableGPUDevices(memoryRequired); + auto devices = LlamaCppBackend::Implementation::availableGPUDevices(memoryRequired); *num_devices = devices.size(); if (devices.empty()) { return nullptr; /* no devices */ } diff --git a/gpt4all-chat/chatapi.cpp b/gpt4all-chat/chatapi.cpp index b443f24c3ab7e..ada3332518d19 100644 --- a/gpt4all-chat/chatapi.cpp +++ b/gpt4all-chat/chatapi.cpp @@ -32,14 +32,6 @@ ChatAPI::ChatAPI() { } -size_t ChatAPI::requiredMem(const std::string &modelPath, int n_ctx, int ngl) -{ - Q_UNUSED(modelPath); - Q_UNUSED(n_ctx); - Q_UNUSED(ngl); - return 0; -} - bool ChatAPI::loadModel(const std::string &modelPath, int n_ctx, int ngl) { Q_UNUSED(modelPath); @@ -48,20 +40,7 @@ bool ChatAPI::loadModel(const std::string &modelPath, int n_ctx, int ngl) return true; } -void ChatAPI::setThreadCount(int32_t n_threads) -{ - Q_UNUSED(n_threads); - qt_noop(); -} - -int32_t ChatAPI::threadCount() const -{ - return 1; -} - -ChatAPI::~ChatAPI() -{ -} +ChatAPI::~ChatAPI() {} bool ChatAPI::isModelLoaded() const { diff --git a/gpt4all-chat/chatapi.h b/gpt4all-chat/chatapi.h index 59b68f5821081..0decb6421a6a7 100644 --- a/gpt4all-chat/chatapi.h +++ b/gpt4all-chat/chatapi.h @@ -57,11 +57,8 @@ class ChatAPI : public QObject, public LLModel { ChatAPI(); virtual ~ChatAPI(); - bool supportsEmbedding() const override { return false; } - bool supportsCompletion() const override { return true; } bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override; bool isModelLoaded() const override; - size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override; size_t stateSize() const override; size_t saveState(uint8_t *dest) const override; size_t restoreState(const uint8_t *src) override; @@ -74,9 +71,6 @@ class ChatAPI : public QObject, public LLModel { bool special, std::string *fakeReply) override; - void setThreadCount(int32_t n_threads) override; - int32_t threadCount() const override; - void setModelName(const QString &modelName) { m_modelName = modelName; } void setAPIKey(const QString &apiKey) { m_apiKey = apiKey; } void setRequestURL(const QString &requestURL) { m_requestURL = requestURL; } @@ -92,65 +86,6 @@ class ChatAPI : public QObject, public LLModel { LLModel::PromptContext *ctx, const QByteArray &array); -protected: - // We have to implement these as they are pure virtual in base class, but we don't actually use - // them as they are only called from the default implementation of 'prompt' which we override and - // completely replace - - std::vector tokenize(PromptContext &ctx, const std::string &str, bool special) override - { - (void)ctx; - (void)str; - (void)special; - throw std::logic_error("not implemented"); - } - - bool isSpecialToken(Token id) const override - { - (void)id; - throw std::logic_error("not implemented"); - } - - std::string tokenToString(Token id) const override - { - (void)id; - throw std::logic_error("not implemented"); - } - - Token sampleToken(PromptContext &ctx) const override - { - (void)ctx; - throw std::logic_error("not implemented"); - } - - bool evalTokens(PromptContext &ctx, const std::vector &tokens) const override - { - (void)ctx; - (void)tokens; - throw std::logic_error("not implemented"); - } - - void shiftContext(PromptContext &promptCtx) override - { - (void)promptCtx; - throw std::logic_error("not implemented"); - } - - int32_t contextLength() const override - { - throw 
std::logic_error("not implemented"); - } - - const std::vector &endTokens() const override - { - throw std::logic_error("not implemented"); - } - - bool shouldAddBOS() const override - { - throw std::logic_error("not implemented"); - } - private: std::function m_responseCallback; QString m_modelName; diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp index e9fb7f3132f95..b386d0ce684a3 100644 --- a/gpt4all-chat/chatllm.cpp +++ b/gpt4all-chat/chatllm.cpp @@ -412,19 +412,20 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro QString filePath = modelInfo.dirpath + modelInfo.filename(); - auto construct = [this, &filePath, &modelInfo, &modelLoadProps, n_ctx](std::string const &backend) { + auto construct = [this, &filePath, &modelInfo, &modelLoadProps, n_ctx](std::string const &backend) -> LlamaCppBackend * { + LlamaCppBackend *lcppmodel; QString constructError; m_llModelInfo.resetModel(this); try { - auto *model = LLModel::Implementation::construct(filePath.toStdString(), backend, n_ctx); - m_llModelInfo.resetModel(this, model); - } catch (const LLModel::MissingImplementationError &e) { + lcppmodel = LlamaCppBackend::Implementation::construct(filePath.toStdString(), backend, n_ctx); + m_llModelInfo.resetModel(this, lcppmodel); + } catch (const LlamaCppBackend::MissingImplementationError &e) { modelLoadProps.insert("error", "missing_model_impl"); constructError = e.what(); - } catch (const LLModel::UnsupportedModelError &e) { + } catch (const LlamaCppBackend::UnsupportedModelError &e) { modelLoadProps.insert("error", "unsupported_model_file"); constructError = e.what(); - } catch (const LLModel::BadArchError &e) { + } catch (const LlamaCppBackend::BadArchError &e) { constructError = e.what(); modelLoadProps.insert("error", "unsupported_model_arch"); modelLoadProps.insert("model_arch", QString::fromStdString(e.arch())); @@ -435,21 +436,22 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo)); resetModel(); emit modelLoadingError(u"Error loading %1: %2"_s.arg(modelInfo.filename(), constructError)); - return false; + return nullptr; } - m_llModelInfo.model->setProgressCallback([this](float progress) -> bool { + lcppmodel->setProgressCallback([this](float progress) -> bool { progress = std::max(progress, std::numeric_limits::min()); // keep progress above zero emit modelLoadingPercentageChanged(progress); return m_shouldBeLoaded; }); - return true; + return lcppmodel; }; - if (!construct(backend)) + auto *lcppmodel = construct(backend); + if (!lcppmodel) return true; - if (m_llModelInfo.model->isModelBlacklisted(filePath.toStdString())) { + if (lcppmodel->isModelBlacklisted(filePath.toStdString())) { static QSet warned; auto fname = modelInfo.filename(); if (!warned.contains(fname)) { @@ -460,16 +462,16 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro } } - auto approxDeviceMemGB = [](const LLModel::GPUDevice *dev) { + auto approxDeviceMemGB = [](const LlamaCppBackend::GPUDevice *dev) { float memGB = dev->heapSize / float(1024 * 1024 * 1024); return std::floor(memGB * 10.f) / 10.f; // truncate to 1 decimal place }; - std::vector availableDevices; - const LLModel::GPUDevice *defaultDevice = nullptr; + std::vector availableDevices; + const LlamaCppBackend::GPUDevice *defaultDevice = nullptr; { - const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx, ngl); - availableDevices = 
m_llModelInfo.model->availableGPUDevices(requiredMemory); + const size_t requiredMemory = lcppmodel->requiredMem(filePath.toStdString(), n_ctx, ngl); + availableDevices = lcppmodel->availableGPUDevices(requiredMemory); // Pick the best device // NB: relies on the fact that Kompute devices are listed first if (!availableDevices.empty() && availableDevices.front().type == 2 /*a discrete gpu*/) { @@ -485,14 +487,14 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro bool actualDeviceIsCPU = true; #if defined(Q_OS_MAC) && defined(__aarch64__) - if (m_llModelInfo.model->implementation().buildVariant() == "metal") + if (lcppmodel->implementation().buildVariant() == "metal") actualDeviceIsCPU = false; #else if (requestedDevice != "CPU") { const auto *device = defaultDevice; if (requestedDevice != "Auto") { // Use the selected device - for (const LLModel::GPUDevice &d : availableDevices) { + for (const auto &d : availableDevices) { if (QString::fromStdString(d.selectionName()) == requestedDevice) { device = &d; break; @@ -503,7 +505,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro std::string unavail_reason; if (!device) { // GPU not available - } else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) { + } else if (!lcppmodel->initializeGPUDevice(device->index, &unavail_reason)) { m_llModelInfo.fallbackReason = QString::fromStdString(unavail_reason); } else { actualDeviceIsCPU = false; @@ -512,7 +514,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro } #endif - bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl); + bool success = lcppmodel->loadModel(filePath.toStdString(), n_ctx, ngl); if (!m_shouldBeLoaded) { m_llModelInfo.resetModel(this); @@ -531,10 +533,13 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro modelLoadProps.insert("cpu_fallback_reason", "gpu_load_failed"); // For CUDA, make sure we don't use the GPU at all - ngl=0 still offloads matmuls - if (backend == "cuda" && !construct("auto")) - return true; + if (backend == "cuda") { + lcppmodel = construct("auto"); + if (!lcppmodel) + return true; + } - success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, 0); + success = lcppmodel->loadModel(filePath.toStdString(), n_ctx, 0); if (!m_shouldBeLoaded) { m_llModelInfo.resetModel(this); @@ -544,7 +549,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro emit modelLoadingPercentageChanged(0.0f); return false; } - } else if (!m_llModelInfo.model->usingGPUDevice()) { + } else if (!lcppmodel->usingGPUDevice()) { // ggml_vk_init was not called in llama.cpp // We might have had to fallback to CPU after load if the model is not possible to accelerate // for instance if the quantization method is not supported on Vulkan yet @@ -562,7 +567,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro return true; } - switch (m_llModelInfo.model->implementation().modelType()[0]) { + switch (lcppmodel->implementation().modelType()[0]) { case 'L': m_llModelType = LLModelType::LLAMA_; break; default: { @@ -774,11 +779,15 @@ bool ChatLLM::promptInternal(const QList &collectionList, const QString m_ctx.n_batch = n_batch; m_ctx.repeat_penalty = repeat_penalty; m_ctx.repeat_last_n = repeat_penalty_tokens; - m_llModelInfo.model->setThreadCount(n_threads); + + if (auto *lcppmodel = dynamic_cast(m_llModelInfo.model.get())) + 
lcppmodel->setThreadCount(n_threads); + #if defined(DEBUG) printf("%s", qPrintable(prompt)); fflush(stdout); #endif + QElapsedTimer totalTime; totalTime.start(); m_timer->start(); @@ -1238,11 +1247,15 @@ void ChatLLM::processSystemPrompt() m_ctx.n_batch = n_batch; m_ctx.repeat_penalty = repeat_penalty; m_ctx.repeat_last_n = repeat_penalty_tokens; - m_llModelInfo.model->setThreadCount(n_threads); + + if (auto *lcppmodel = dynamic_cast(m_llModelInfo.model.get())) + lcppmodel->setThreadCount(n_threads); + #if defined(DEBUG) printf("%s", qPrintable(QString::fromStdString(systemPrompt))); fflush(stdout); #endif + auto old_n_predict = std::exchange(m_ctx.n_predict, 0); // decode system prompt without a response // use "%1%2" and not "%1" to avoid implicit whitespace m_llModelInfo.model->prompt(systemPrompt, "%1%2", promptFunc, nullptr, /*allowContextShift*/ true, m_ctx, true); @@ -1288,7 +1301,9 @@ void ChatLLM::processRestoreStateFromText() m_ctx.n_batch = n_batch; m_ctx.repeat_penalty = repeat_penalty; m_ctx.repeat_last_n = repeat_penalty_tokens; - m_llModelInfo.model->setThreadCount(n_threads); + + if (auto *lcppmodel = dynamic_cast(m_llModelInfo.model.get())) + lcppmodel->setThreadCount(n_threads); auto it = m_stateFromText.begin(); while (it < m_stateFromText.end()) { diff --git a/gpt4all-chat/chatllm.h b/gpt4all-chat/chatllm.h index d123358ad58e0..68f4e0f1d95d6 100644 --- a/gpt4all-chat/chatllm.h +++ b/gpt4all-chat/chatllm.h @@ -4,6 +4,7 @@ #include "database.h" // IWYU pragma: keep #include "modellist.h" +#include "../gpt4all-backend/llamacpp_backend.h" #include "../gpt4all-backend/llmodel.h" #include @@ -128,15 +129,17 @@ class ChatLLM : public QObject QString deviceBackend() const { - if (!isModelLoaded()) return QString(); - std::string name = LLModel::GPUDevice::backendIdToName(m_llModelInfo.model->backendName()); + auto *lcppmodel = dynamic_cast(m_llModelInfo.model.get()); + if (!isModelLoaded() && !lcppmodel) return QString(); + std::string name = LlamaCppBackend::GPUDevice::backendIdToName(lcppmodel->backendName()); return QString::fromStdString(name); } QString device() const { - if (!isModelLoaded()) return QString(); - const char *name = m_llModelInfo.model->gpuDeviceName(); + auto *lcppmodel = dynamic_cast(m_llModelInfo.model.get()); + if (!isModelLoaded() || !lcppmodel) return QString(); + const char *name = lcppmodel->gpuDeviceName(); return name ? 
 return name ? QString(name) : u"CPU"_s;
 }
diff --git a/gpt4all-chat/embllm.cpp b/gpt4all-chat/embllm.cpp
index 615a6ce4d9252..af56d0bc19293 100644
--- a/gpt4all-chat/embllm.cpp
+++ b/gpt4all-chat/embllm.cpp
@@ -3,7 +3,7 @@
 #include "modellist.h"
 #include "mysettings.h"
-#include "../gpt4all-backend/llmodel.h"
+#include "../gpt4all-backend/llamacpp_backend.h"
 #include
 #include
@@ -99,7 +99,7 @@ bool EmbeddingLLMWorker::loadModel()
 #endif
 try {
- m_model = LLModel::Implementation::construct(filePath.toStdString(), backend, n_ctx);
+ m_model = LlamaCppBackend::Implementation::construct(filePath.toStdString(), backend, n_ctx);
 } catch (const std::exception &e) {
 qWarning() << "embllm WARNING: Could not load embedding model:" << e.what();
 return false;
@@ -112,11 +112,11 @@ bool EmbeddingLLMWorker::loadModel()
 actualDeviceIsCPU = false;
 #else
 if (requestedDevice != "CPU") {
- const LLModel::GPUDevice *device = nullptr;
- std::vector<LLModel::GPUDevice> availableDevices = m_model->availableGPUDevices(0);
+ const LlamaCppBackend::GPUDevice *device = nullptr;
+ auto availableDevices = m_model->availableGPUDevices(0);
 if (requestedDevice != "Auto") {
 // Use the selected device
- for (const LLModel::GPUDevice &d : availableDevices) {
+ for (const auto &d : availableDevices) {
 if (QString::fromStdString(d.selectionName()) == requestedDevice) {
 device = &d;
 break;
@@ -145,7 +145,7 @@ bool EmbeddingLLMWorker::loadModel()
 if (backend == "cuda") {
 // For CUDA, make sure we don't use the GPU at all - ngl=0 still offloads matmuls
 try {
- m_model = LLModel::Implementation::construct(filePath.toStdString(), "auto", n_ctx);
+ m_model = LlamaCppBackend::Implementation::construct(filePath.toStdString(), "auto", n_ctx);
 } catch (const std::exception &e) {
 qWarning() << "embllm WARNING: Could not load embedding model:" << e.what();
 return false;
diff --git a/gpt4all-chat/embllm.h b/gpt4all-chat/embllm.h
index 91376650d05e3..fda773e545372 100644
--- a/gpt4all-chat/embllm.h
+++ b/gpt4all-chat/embllm.h
@@ -13,7 +13,7 @@
 #include
 #include
-class LLModel;
+class LlamaCppBackend;
 class QNetworkAccessManager;
 struct EmbeddingChunk {
@@ -67,7 +67,7 @@ private Q_SLOTS:
 QString m_nomicAPIKey;
 QNetworkAccessManager *m_networkManager;
 std::vector<float> m_lastResponse;
- LLModel *m_model = nullptr;
+ LlamaCppBackend *m_model = nullptr;
 std::atomic<bool> m_stopGenerating;
 QThread m_workerThread;
 QMutex m_mutex; // guards m_model and m_nomicAPIKey
diff --git a/gpt4all-chat/llm.cpp b/gpt4all-chat/llm.cpp
index 13820030393e7..c501953c8d053 100644
--- a/gpt4all-chat/llm.cpp
+++ b/gpt4all-chat/llm.cpp
@@ -1,6 +1,6 @@
 #include "llm.h"
-#include "../gpt4all-backend/llmodel.h"
+#include "../gpt4all-backend/llamacpp_backend.h"
 #include "../gpt4all-backend/sysinfo.h"
 #include
@@ -30,7 +30,7 @@ LLM *LLM::globalInstance()
 LLM::LLM()
 : QObject{nullptr}
- , m_compatHardware(LLModel::Implementation::hasSupportedCPU())
+ , m_compatHardware(LlamaCppBackend::Implementation::hasSupportedCPU())
 {
 QNetworkInformation::loadDefaultBackend();
 auto * netinfo = QNetworkInformation::instance();
diff --git a/gpt4all-chat/main.cpp b/gpt4all-chat/main.cpp
index 4546a95bcf327..f4a3df2075067 100644
--- a/gpt4all-chat/main.cpp
+++ b/gpt4all-chat/main.cpp
@@ -8,7 +8,7 @@
 #include "mysettings.h"
 #include "network.h"
-#include "../gpt4all-backend/llmodel.h"
+#include "../gpt4all-backend/llamacpp_backend.h"
 #include
 #include
@@ -46,7 +46,7 @@ int main(int argc, char *argv[])
 if (LLM::directoryExists(frameworksDir))
 llmodelSearchPaths += ";" + frameworksDir;
 #endif
- LLModel::Implementation::setImplementationsSearchPath(llmodelSearchPaths.toStdString());
+ LlamaCppBackend::Implementation::setImplementationsSearchPath(llmodelSearchPaths.toStdString());
 // Set the local and language translation before the qml engine has even been started. This will
 // use the default system locale unless the user has explicitly set it to use a different one.
diff --git a/gpt4all-chat/modellist.cpp b/gpt4all-chat/modellist.cpp
index 580b615ff4e66..4d5a4147e1258 100644
--- a/gpt4all-chat/modellist.cpp
+++ b/gpt4all-chat/modellist.cpp
@@ -4,7 +4,7 @@
 #include "mysettings.h"
 #include "network.h"
-#include "../gpt4all-backend/llmodel.h"
+#include "../gpt4all-backend/llamacpp_backend.h"
 #include
 #include
@@ -258,7 +258,7 @@ int ModelInfo::maxContextLength() const
 if (!installed || isOnline) return -1;
 if (m_maxContextLength != -1) return m_maxContextLength;
 auto path = (dirpath + filename()).toStdString();
- int n_ctx = LLModel::Implementation::maxContextLength(path);
+ int n_ctx = LlamaCppBackend::Implementation::maxContextLength(path);
 if (n_ctx < 0) {
 n_ctx = 4096; // fallback value
 }
@@ -282,7 +282,7 @@ int ModelInfo::maxGpuLayers() const
 if (!installed || isOnline) return -1;
 if (m_maxGpuLayers != -1) return m_maxGpuLayers;
 auto path = (dirpath + filename()).toStdString();
- int layers = LLModel::Implementation::layerCount(path);
+ int layers = LlamaCppBackend::Implementation::layerCount(path);
 if (layers < 0) {
 layers = 100; // fallback value
 }
@@ -997,7 +997,7 @@ void ModelList::updateData(const QString &id, const QVector
 && (info->isDiscovered() || info->description().isEmpty())) {
 // read GGUF and decide based on model architecture
- info->isEmbeddingModel = LLModel::Implementation::isEmbeddingModel(modelPath.toStdString());
+ info->isEmbeddingModel = LlamaCppBackend::Implementation::isEmbeddingModel(modelPath.toStdString());
 info->checkedEmbeddingModel = true;
 }
diff --git a/gpt4all-chat/mysettings.cpp b/gpt4all-chat/mysettings.cpp
index b29ec431f302b..d57b5926c4b5b 100644
--- a/gpt4all-chat/mysettings.cpp
+++ b/gpt4all-chat/mysettings.cpp
@@ -1,6 +1,6 @@
 #include "mysettings.h"
-#include "../gpt4all-backend/llmodel.h"
+#include "../gpt4all-backend/llamacpp_backend.h"
 #include
 #include
@@ -95,8 +95,8 @@ static QStringList getDevices(bool skipKompute = false)
 #if defined(Q_OS_MAC) && defined(__aarch64__)
 deviceList << "Metal";
 #else
- std::vector<LLModel::GPUDevice> devices = LLModel::Implementation::availableGPUDevices();
- for (LLModel::GPUDevice &d : devices) {
+ auto devices = LlamaCppBackend::Implementation::availableGPUDevices();
+ for (auto &d : devices) {
 if (!skipKompute || strcmp(d.backend, "kompute"))
 deviceList << QString::fromStdString(d.selectionName());
 }
@@ -512,7 +512,7 @@ QString MySettings::device()
 auto device = value.toString();
 if (!device.isEmpty()) {
 auto deviceStr = device.toStdString();
- auto newNameStr = LLModel::GPUDevice::updateSelectionName(deviceStr);
+ auto newNameStr = LlamaCppBackend::GPUDevice::updateSelectionName(deviceStr);
 if (newNameStr != deviceStr) {
 auto newName = QString::fromStdString(newNameStr);
 qWarning() << "updating device name:" << device << "->" << newName;
diff --git a/gpt4all-chat/network.cpp b/gpt4all-chat/network.cpp
index e7ee616cd2cad..47595bcad1bf5 100644
--- a/gpt4all-chat/network.cpp
+++ b/gpt4all-chat/network.cpp
@@ -9,7 +9,7 @@
 #include "modellist.h"
 #include "mysettings.h"
-#include "../gpt4all-backend/llmodel.h"
+#include "../gpt4all-backend/llamacpp_backend.h"
 #include
 #include
@@ -290,7 +290,7 @@ void Network::sendStartup()
 {"display", u"%1x%2"_s.arg(display->size().width()).arg(display->size().height())},
 {"ram", LLM::globalInstance()->systemTotalRAMInGB()},
 {"cpu", getCPUModel()},
- {"cpu_supports_avx2", LLModel::Implementation::cpuSupportsAVX2()},
+ {"cpu_supports_avx2", LlamaCppBackend::Implementation::cpuSupportsAVX2()},
 {"datalake_active", mySettings->networkIsActive()},
 });
 sendIpify();