From e761dcbca25047dc5ccc2e83442b763bf964a273 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 6 Aug 2024 18:56:48 -0400 Subject: [PATCH] backend: move more stuff into LlamaCppBackend Signed-off-by: Jared Van Bortel --- gpt4all-backend/CMakeLists.txt | 2 +- ...lmodel_shared.cpp => llamacpp_backend.cpp} | 409 ++++++++++++++++-- gpt4all-backend/llamacpp_backend.h | 187 ++++++++ gpt4all-backend/llamacpp_backend_impl.cpp | 10 +- gpt4all-backend/llamacpp_backend_impl.h | 10 +- gpt4all-backend/llmodel.cpp | 350 --------------- gpt4all-backend/llmodel.h | 212 +-------- gpt4all-backend/llmodel_c.cpp | 13 +- gpt4all-chat/chatapi.cpp | 23 +- gpt4all-chat/chatapi.h | 65 --- gpt4all-chat/chatllm.cpp | 71 +-- gpt4all-chat/chatllm.h | 11 +- gpt4all-chat/embllm.cpp | 12 +- gpt4all-chat/embllm.h | 4 +- gpt4all-chat/llm.cpp | 4 +- gpt4all-chat/main.cpp | 4 +- gpt4all-chat/modellist.cpp | 8 +- gpt4all-chat/mysettings.cpp | 8 +- gpt4all-chat/network.cpp | 4 +- 19 files changed, 660 insertions(+), 747 deletions(-) rename gpt4all-backend/{llmodel_shared.cpp => llamacpp_backend.cpp} (52%) create mode 100644 gpt4all-backend/llamacpp_backend.h delete mode 100644 gpt4all-backend/llmodel.cpp diff --git a/gpt4all-backend/CMakeLists.txt b/gpt4all-backend/CMakeLists.txt index 9a9fd5783f556..62c326bea9fa9 100644 --- a/gpt4all-backend/CMakeLists.txt +++ b/gpt4all-backend/CMakeLists.txt @@ -142,7 +142,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) endforeach() add_library(llmodel - llmodel.h llmodel.cpp llmodel_shared.cpp + llmodel.h llamacpp_backend.cpp llmodel_c.h llmodel_c.cpp dlhandle.cpp ) diff --git a/gpt4all-backend/llmodel_shared.cpp b/gpt4all-backend/llamacpp_backend.cpp similarity index 52% rename from gpt4all-backend/llmodel_shared.cpp rename to gpt4all-backend/llamacpp_backend.cpp index 7477254a74ff8..3d3ee1a230aa1 100644 --- a/gpt4all-backend/llmodel_shared.cpp +++ b/gpt4all-backend/llamacpp_backend.cpp @@ -1,20 +1,46 @@ -#include "llmodel.h" +#include "llamacpp_backend.h" + +#include "dlhandle.h" #include #include #include #include +#include +#include +#include #include #include +#include +#include #include #include #include #include #include +#include #include +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#endif + +#ifdef _MSC_VER +# include +#endif + +#if defined(__APPLE__) && defined(__aarch64__) +# include "sysinfo.h" // for getSystemTotalRAMInBytes +#endif + +namespace fs = std::filesystem; namespace ranges = std::ranges; + static bool parsePromptTemplate(const std::string &tmpl, std::vector &placeholders, std::string &err) { static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))"); @@ -38,15 +64,16 @@ static bool parsePromptTemplate(const std::string &tmpl, std::vector promptCallback, - std::function responseCallback, - bool allowContextShift, - PromptContext &promptCtx, - bool special, - std::string *fakeReply) -{ +void LlamaCppBackend::prompt( + const std::string &prompt, + const std::string &promptTemplate, + std::function promptCallback, + std::function responseCallback, + bool allowContextShift, + PromptContext &promptCtx, + bool special, + std::string *fakeReply +) { if (!isModelLoaded()) { std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n"; return; @@ -153,11 +180,13 @@ void LLModel::prompt(const std::string &prompt, } // returns false on error -bool LLModel::decodePrompt(std::function promptCallback, - std::function responseCallback, - bool allowContextShift, - 
PromptContext &promptCtx, - std::vector embd_inp) { +bool LlamaCppBackend::decodePrompt( + std::function promptCallback, + std::function responseCallback, + bool allowContextShift, + PromptContext &promptCtx, + std::vector embd_inp +) { if ((int) embd_inp.size() > promptCtx.n_ctx - 4) { responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed."); std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() << @@ -224,9 +253,11 @@ static std::string::size_type stringsOverlap(const std::string &s, const std::st return std::string::npos; } -void LLModel::generateResponse(std::function responseCallback, - bool allowContextShift, - PromptContext &promptCtx) { +void LlamaCppBackend::generateResponse( + std::function responseCallback, + bool allowContextShift, + PromptContext &promptCtx +) { static const char *stopSequences[] { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context", }; @@ -371,31 +402,327 @@ void LLModel::generateResponse(std::function promptCtx.n_past -= cachedTokens.size(); } -void LLModel::embed( - const std::vector &texts, float *embeddings, std::optional prefix, int dimensionality, - size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb +/* ********************************* + * Backend implementation management + * ********************************* */ + +#ifndef __APPLE__ +static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"}; +#elif defined(__aarch64__) +static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"}; +#else +static const std::string DEFAULT_BACKENDS[] = {"cpu"}; +#endif + +std::string s_implementations_search_path = "."; + +#if !(defined(__x86_64__) || defined(_M_X64)) + // irrelevant on non-x86_64 + #define cpu_supports_avx() -1 + #define cpu_supports_avx2() -1 +#elif defined(_MSC_VER) + // MSVC + static int get_cpu_info(int func_id, int reg_id) { + int info[4]; + __cpuid(info, func_id); + return info[reg_id]; + } + + // AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX + #define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28)) + // AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX + #define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5)) +#else + // gcc/clang + #define cpu_supports_avx() !!__builtin_cpu_supports("avx") + #define cpu_supports_avx2() !!__builtin_cpu_supports("avx2") +#endif + +LlamaCppBackend::Implementation::Implementation(Dlhandle &&dlhandle_) + : m_dlhandle(new Dlhandle(std::move(dlhandle_))) { + auto get_model_type = m_dlhandle->get("get_model_type"); + assert(get_model_type); + m_modelType = get_model_type(); + auto get_build_variant = m_dlhandle->get("get_build_variant"); + assert(get_build_variant); + m_buildVariant = get_build_variant(); + m_getFileArch = m_dlhandle->get("get_file_arch"); + assert(m_getFileArch); + m_isArchSupported = m_dlhandle->get("is_arch_supported"); + assert(m_isArchSupported); + m_construct = m_dlhandle->get("construct"); + assert(m_construct); +} + +LlamaCppBackend::Implementation::Implementation(Implementation &&o) + : m_getFileArch(o.m_getFileArch) + , m_isArchSupported(o.m_isArchSupported) + , m_construct(o.m_construct) + , m_modelType(o.m_modelType) + , m_buildVariant(o.m_buildVariant) + , m_dlhandle(o.m_dlhandle) { + o.m_dlhandle = nullptr; +} + +LlamaCppBackend::Implementation::~Implementation() +{ + delete m_dlhandle; +} + +static bool isImplementation(const Dlhandle &dl) +{ + return dl.get("is_g4a_backend_model_implementation"); +} + 
+// Add the CUDA Toolkit to the DLL search path on Windows. +// This is necessary for chat.exe to find CUDA when started from Qt Creator. +static void addCudaSearchPath() +{ +#ifdef _WIN32 + if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) { + auto libDir = std::wstring(cudaPath) + L"\\bin"; + if (!AddDllDirectory(libDir.c_str())) { + auto err = GetLastError(); + std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n"; + } + } +#endif +} + +const std::vector &LlamaCppBackend::Implementation::implementationList() +{ + if (cpu_supports_avx() == 0) { + throw std::runtime_error("CPU does not support AVX"); + } + + // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the + // individual models without the cleanup of the static list interfering + static auto* libs = new std::vector([] () { + std::vector fres; + + addCudaSearchPath(); + + std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)"; + if (cpu_supports_avx2() == 0) { + impl_name_re += "-avxonly"; + } + std::regex re(impl_name_re); + auto search_in_directory = [&](const std::string& paths) { + std::stringstream ss(paths); + std::string path; + // Split the paths string by the delimiter and process each path. + while (std::getline(ss, path, ';')) { + std::u8string u8_path(path.begin(), path.end()); + // Iterate over all libraries + for (const auto &f : fs::directory_iterator(u8_path)) { + const fs::path &p = f.path(); + + if (p.extension() != LIB_FILE_EXT) continue; + if (!std::regex_search(p.stem().string(), re)) { + std::cerr << "did not match regex: " << p.stem().string() << "\n"; + continue; + } + + // Add to list if model implementation + Dlhandle dl; + try { + dl = Dlhandle(p); + } catch (const Dlhandle::Exception &e) { + std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n"; + continue; + } + if (!isImplementation(dl)) { + std::cerr << "Not an implementation: " << p.filename().string() << "\n"; + continue; + } + fres.emplace_back(Implementation(std::move(dl))); + } + } + }; + + search_in_directory(s_implementations_search_path); + + return fres; + }()); + // Return static result + return *libs; +} + +static std::string applyCPUVariant(const std::string &buildVariant) +{ + if (buildVariant != "metal" && cpu_supports_avx2() == 0) { + return buildVariant + "-avxonly"; + } + return buildVariant; +} + +const LlamaCppBackend::Implementation* LlamaCppBackend::Implementation::implementation( + const char *fname, + const std::string& buildVariant ) { - (void)texts; - (void)embeddings; - (void)prefix; - (void)dimensionality; - (void)tokenCount; - (void)doMean; - (void)atlas; - (void)cancelCb; - throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings"); + bool buildVariantMatched = false; + std::optional archName; + for (const auto& i : implementationList()) { + if (buildVariant != i.m_buildVariant) continue; + buildVariantMatched = true; + + char *arch = i.m_getFileArch(fname); + if (!arch) continue; + archName = arch; + + bool archSupported = i.m_isArchSupported(arch); + free(arch); + if (archSupported) return &i; + } + + if (!buildVariantMatched) + return nullptr; + if (!archName) + throw UnsupportedModelError("Unsupported file format"); + + throw BadArchError(std::move(*archName)); } -void LLModel::embed( - const std::vector &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount, - bool doMean, bool atlas +LlamaCppBackend 
*LlamaCppBackend::Implementation::construct( + const std::string &modelPath, + const std::string &backend, + int n_ctx ) { - (void)texts; - (void)embeddings; - (void)isRetrieval; - (void)dimensionality; - (void)tokenCount; - (void)doMean; - (void)atlas; - throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings"); + std::vector desiredBackends; + if (backend != "auto") { + desiredBackends.push_back(backend); + } else { + desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); + } + + for (const auto &desiredBackend: desiredBackends) { + const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend)); + + if (impl) { + // Construct llmodel implementation + auto *fres = impl->m_construct(); + fres->m_implementation = impl; + +#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs + /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at + * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in + * most (all?) places where this is called, causing underestimation of required + * memory. */ + if (backend == "auto" && desiredBackend == "metal") { + // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not + size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100); + if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) { + delete fres; + continue; + } + } +#else + (void)n_ctx; +#endif + + return fres; + } + } + + throw MissingImplementationError("Could not find any implementations for backend: " + backend); +} + +LlamaCppBackend *LlamaCppBackend::Implementation::constructGlobalLlama(const std::optional &backend) +{ + static std::unordered_map> implCache; + + const std::vector *impls; + try { + impls = &implementationList(); + } catch (const std::runtime_error &e) { + std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n"; + return nullptr; + } + + std::vector desiredBackends; + if (backend) { + desiredBackends.push_back(backend.value()); + } else { + desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); + } + + const Implementation *impl = nullptr; + + for (const auto &desiredBackend: desiredBackends) { + auto cacheIt = implCache.find(desiredBackend); + if (cacheIt != implCache.end()) + return cacheIt->second.get(); // cached + + for (const auto &i: *impls) { + if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) { + impl = &i; + break; + } + } + + if (impl) { + auto *fres = impl->m_construct(); + fres->m_implementation = impl; + implCache[desiredBackend] = std::unique_ptr(fres); + return fres; + } + } + + std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") + << "\n"; + return nullptr; +} + +std::vector LlamaCppBackend::Implementation::availableGPUDevices(size_t memoryRequired) +{ + std::vector devices; +#ifndef __APPLE__ + static const std::string backends[] = {"kompute", "cuda"}; + for (const auto &backend: backends) { + auto *llama = constructGlobalLlama(backend); + if (llama) { + auto backendDevs = llama->availableGPUDevices(memoryRequired); + devices.insert(devices.end(), backendDevs.begin(), backendDevs.end()); + } + } +#endif + return devices; +} + +int32_t LlamaCppBackend::Implementation::maxContextLength(const std::string &modelPath) +{ + auto *llama = constructGlobalLlama(); + return llama ? 
llama->maxContextLength(modelPath) : -1; +} + +int32_t LlamaCppBackend::Implementation::layerCount(const std::string &modelPath) +{ + auto *llama = constructGlobalLlama(); + return llama ? llama->layerCount(modelPath) : -1; +} + +bool LlamaCppBackend::Implementation::isEmbeddingModel(const std::string &modelPath) +{ + auto *llama = constructGlobalLlama(); + return llama && llama->isEmbeddingModel(modelPath); +} + +void LlamaCppBackend::Implementation::setImplementationsSearchPath(const std::string& path) +{ + s_implementations_search_path = path; +} + +const std::string& LlamaCppBackend::Implementation::implementationsSearchPath() +{ + return s_implementations_search_path; +} + +bool LlamaCppBackend::Implementation::hasSupportedCPU() +{ + return cpu_supports_avx() != 0; +} + +int LlamaCppBackend::Implementation::cpuSupportsAVX2() +{ + return cpu_supports_avx2(); } diff --git a/gpt4all-backend/llamacpp_backend.h b/gpt4all-backend/llamacpp_backend.h new file mode 100644 index 0000000000000..d04ec7d39d998 --- /dev/null +++ b/gpt4all-backend/llamacpp_backend.h @@ -0,0 +1,187 @@ +#pragma once + +#include "llmodel.h" + +class LlamaCppBackend : public EmbLLModel { +public: + class BadArchError: public std::runtime_error { + public: + BadArchError(std::string arch) + : runtime_error("Unsupported model architecture: " + arch) + , m_arch(std::move(arch)) + {} + + const std::string &arch() const noexcept { return m_arch; } + + private: + std::string m_arch; + }; + + class MissingImplementationError: public std::runtime_error { + public: + using std::runtime_error::runtime_error; + }; + + class UnsupportedModelError: public std::runtime_error { + public: + using std::runtime_error::runtime_error; + }; + + struct GPUDevice { + const char *backend; + int index; + int type; + size_t heapSize; + std::string name; + std::string vendor; + + GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor): + backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)), + vendor(std::move(vendor)) {} + + std::string selectionName() const + { + assert(backend == "cuda"s || backend == "kompute"s); + return backendName() + ": " + name; + } + + std::string backendName() const { return backendIdToName(backend); } + + static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); } + + static std::string updateSelectionName(const std::string &name) { + if (name == "Auto" || name == "CPU" || name == "Metal") + return name; + auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) { + return name.starts_with(entry.second + ": "); + }); + if (it != s_backendNames.end()) + return name; + return "Vulkan: " + name; // previously, there were only Vulkan devices + } + + private: + static inline const std::unordered_map s_backendNames { + {"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"}, + }; + }; + + class Implementation { + public: + Implementation(const Implementation &) = delete; + Implementation(Implementation &&); + ~Implementation(); + + std::string_view modelType() const { return m_modelType; } + std::string_view buildVariant() const { return m_buildVariant; } + + static LlamaCppBackend *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048); + static std::vector availableGPUDevices(size_t memoryRequired = 0); + static int32_t maxContextLength(const std::string &modelPath); + static int32_t layerCount(const 
std::string &modelPath); + static bool isEmbeddingModel(const std::string &modelPath); + static void setImplementationsSearchPath(const std::string &path); + static const std::string &implementationsSearchPath(); + static bool hasSupportedCPU(); + // 0 for no, 1 for yes, -1 for non-x86_64 + static int cpuSupportsAVX2(); + + private: + Implementation(Dlhandle &&); + + static const std::vector &implementationList(); + static const Implementation *implementation(const char *fname, const std::string &buildVariant); + static LlamaCppBackend *constructGlobalLlama(const std::optional &backend = std::nullopt); + + char *(*m_getFileArch)(const char *fname); + bool (*m_isArchSupported)(const char *arch); + LlamaCppBackend *(*m_construct)(); + + std::string_view m_modelType; + std::string_view m_buildVariant; + Dlhandle *m_dlhandle; + }; + + using ProgressCallback = std::function; + + virtual bool isModelBlacklisted(const std::string &modelPath) const = 0; + virtual bool isEmbeddingModel(const std::string &modelPath) const = 0; + virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0; + + void prompt(const std::string &prompt, + const std::string &promptTemplate, + std::function promptCallback, + std::function responseCallback, + bool allowContextShift, + PromptContext &ctx, + bool special = false, + std::string *fakeReply = nullptr) override; + + virtual void setThreadCount(int32_t n_threads) { (void)n_threads; } + virtual int32_t threadCount() const { return 1; } + + const Implementation &implementation() const { return *m_implementation; } + + virtual std::vector availableGPUDevices(size_t memoryRequired) const + { + (void)memoryRequired; + return {}; + } + + virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const + { + (void)memoryRequired; + (void)name; + return false; + } + + virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const + { + (void)device; + if (unavail_reason) { + *unavail_reason = "model has no GPU support"; + } + return false; + } + + virtual bool usingGPUDevice() const { return false; } + virtual const char *backendName() const { return "cpu"; } + virtual const char *gpuDeviceName() const { return nullptr; } + + void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; } + +protected: + virtual std::vector tokenize(PromptContext &ctx, const std::string &str, bool special = false) = 0; + virtual bool isSpecialToken(Token id) const = 0; + virtual std::string tokenToString(Token id) const = 0; + virtual Token sampleToken(PromptContext &ctx) const = 0; + virtual bool evalTokens(PromptContext &ctx, const std::vector &tokens) const = 0; + virtual void shiftContext(PromptContext &promptCtx) = 0; + virtual int32_t contextLength() const = 0; + virtual const std::vector &endTokens() const = 0; + virtual bool shouldAddBOS() const = 0; + + virtual int32_t maxContextLength(std::string const &modelPath) const = 0; + virtual int32_t layerCount(std::string const &modelPath) const = 0; + + static bool staticProgressCallback(float progress, void* ctx) + { + LlamaCppBackend *model = static_cast(ctx); + if (model && model->m_progressCallback) + return model->m_progressCallback(progress); + return true; + } + + bool decodePrompt(std::function promptCallback, + std::function responseCallback, + bool allowContextShift, + PromptContext &promptCtx, + std::vector embd_inp); + void generateResponse(std::function responseCallback, + bool allowContextShift, + PromptContext &promptCtx); + + 
const Implementation *m_implementation = nullptr; + ProgressCallback m_progressCallback; + Token m_tokenize_last_token = -1; +}; diff --git a/gpt4all-backend/llamacpp_backend_impl.cpp b/gpt4all-backend/llamacpp_backend_impl.cpp index aece51c6bb788..0ace53bb50ac7 100644 --- a/gpt4all-backend/llamacpp_backend_impl.cpp +++ b/gpt4all-backend/llamacpp_backend_impl.cpp @@ -378,7 +378,7 @@ bool LlamaCppBackendImpl::loadModel(const std::string &modelPath, int n_ctx, int d_ptr->model_params.use_mlock = params.use_mlock; #endif - d_ptr->model_params.progress_callback = &LLModel::staticProgressCallback; + d_ptr->model_params.progress_callback = &LlamaCppBackend::staticProgressCallback; d_ptr->model_params.progress_callback_user_data = this; d_ptr->backend_name = "cpu"; // default @@ -659,7 +659,7 @@ static const char *getVulkanVendorName(uint32_t vendorID) } #endif -std::vector LlamaCppBackendImpl::availableGPUDevices(size_t memoryRequired) const +std::vector LlamaCppBackendImpl::availableGPUDevices(size_t memoryRequired) const { #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA) size_t count = 0; @@ -675,7 +675,7 @@ std::vector LlamaCppBackendImpl::availableGPUDevices(size_t #endif if (lcppDevices) { - std::vector devices; + std::vector devices; devices.reserve(count); for (size_t i = 0; i < count; ++i) { @@ -909,7 +909,7 @@ void LlamaCppBackendImpl::embed( void LlamaCppBackendImpl::embed( const std::vector &texts, float *embeddings, std::optional prefix, int dimensionality, - size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb + size_t *tokenCount, bool doMean, bool atlas, EmbLLModel::EmbedCancelCallback *cancelCb ) { if (!d_ptr->model) throw std::logic_error("no model is loaded"); @@ -967,7 +967,7 @@ double getL2NormScale(T *start, T *end) void LlamaCppBackendImpl::embedInternal( const std::vector &texts, float *embeddings, std::string prefix, int dimensionality, - size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb, const EmbModelSpec *spec + size_t *tokenCount, bool doMean, bool atlas, EmbLLModel::EmbedCancelCallback *cancelCb, const EmbModelSpec *spec ) { typedef std::vector TokenString; static constexpr int32_t atlasMaxLength = 8192; diff --git a/gpt4all-backend/llamacpp_backend_impl.h b/gpt4all-backend/llamacpp_backend_impl.h index 5923572f961e7..7ed73c579d42b 100644 --- a/gpt4all-backend/llamacpp_backend_impl.h +++ b/gpt4all-backend/llamacpp_backend_impl.h @@ -1,10 +1,10 @@ +#pragma once + #ifndef LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE #error This file is NOT meant to be included outside of llamacpp_backend_impl.cpp. Doing so is DANGEROUS. 
Be sure to know what you are doing before proceeding to #define LLAMACPP_BACKEND_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE #endif -#ifndef LLAMACPP_BACKEND_IMPL_H -#define LLAMACPP_BACKEND_IMPL_H -#include "llmodel.h" +#include "llamacpp_backend.h" #include #include @@ -13,7 +13,7 @@ struct LlamaPrivate; struct EmbModelSpec; -class LlamaCppBackendImpl : public LLModel { +class LlamaCppBackendImpl : public LlamaCppBackend { public: LlamaCppBackendImpl(); ~LlamaCppBackendImpl(); @@ -68,5 +68,3 @@ class LlamaCppBackendImpl : public LLModel { size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb, const EmbModelSpec *spec); }; - -#endif // LLAMACPP_BACKEND_IMPL_H diff --git a/gpt4all-backend/llmodel.cpp b/gpt4all-backend/llmodel.cpp deleted file mode 100644 index 7b18004aa10b4..0000000000000 --- a/gpt4all-backend/llmodel.cpp +++ /dev/null @@ -1,350 +0,0 @@ -#include "llmodel.h" - -#include "dlhandle.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WIN32 -# define WIN32_LEAN_AND_MEAN -# ifndef NOMINMAX -# define NOMINMAX -# endif -# include -#endif - -#ifdef _MSC_VER -# include -#endif - -#if defined(__APPLE__) && defined(__aarch64__) -# include "sysinfo.h" // for getSystemTotalRAMInBytes -#endif - -namespace fs = std::filesystem; - -#ifndef __APPLE__ -static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"}; -#elif defined(__aarch64__) -static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"}; -#else -static const std::string DEFAULT_BACKENDS[] = {"cpu"}; -#endif - -std::string s_implementations_search_path = "."; - -#if !(defined(__x86_64__) || defined(_M_X64)) - // irrelevant on non-x86_64 - #define cpu_supports_avx() -1 - #define cpu_supports_avx2() -1 -#elif defined(_MSC_VER) - // MSVC - static int get_cpu_info(int func_id, int reg_id) { - int info[4]; - __cpuid(info, func_id); - return info[reg_id]; - } - - // AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX - #define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28)) - // AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX - #define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5)) -#else - // gcc/clang - #define cpu_supports_avx() !!__builtin_cpu_supports("avx") - #define cpu_supports_avx2() !!__builtin_cpu_supports("avx2") -#endif - -LLModel::Implementation::Implementation(Dlhandle &&dlhandle_) - : m_dlhandle(new Dlhandle(std::move(dlhandle_))) { - auto get_model_type = m_dlhandle->get("get_model_type"); - assert(get_model_type); - m_modelType = get_model_type(); - auto get_build_variant = m_dlhandle->get("get_build_variant"); - assert(get_build_variant); - m_buildVariant = get_build_variant(); - m_getFileArch = m_dlhandle->get("get_file_arch"); - assert(m_getFileArch); - m_isArchSupported = m_dlhandle->get("is_arch_supported"); - assert(m_isArchSupported); - m_construct = m_dlhandle->get("construct"); - assert(m_construct); -} - -LLModel::Implementation::Implementation(Implementation &&o) - : m_getFileArch(o.m_getFileArch) - , m_isArchSupported(o.m_isArchSupported) - , m_construct(o.m_construct) - , m_modelType(o.m_modelType) - , m_buildVariant(o.m_buildVariant) - , m_dlhandle(o.m_dlhandle) { - o.m_dlhandle = nullptr; -} - -LLModel::Implementation::~Implementation() -{ - delete m_dlhandle; -} - -static bool isImplementation(const Dlhandle &dl) -{ - return dl.get("is_g4a_backend_model_implementation"); -} - -// Add the CUDA Toolkit to the DLL search path on Windows. 
-// This is necessary for chat.exe to find CUDA when started from Qt Creator. -static void addCudaSearchPath() -{ -#ifdef _WIN32 - if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) { - auto libDir = std::wstring(cudaPath) + L"\\bin"; - if (!AddDllDirectory(libDir.c_str())) { - auto err = GetLastError(); - std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n"; - } - } -#endif -} - -const std::vector &LLModel::Implementation::implementationList() -{ - if (cpu_supports_avx() == 0) { - throw std::runtime_error("CPU does not support AVX"); - } - - // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the - // individual models without the cleanup of the static list interfering - static auto* libs = new std::vector([] () { - std::vector fres; - - addCudaSearchPath(); - - std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)"; - if (cpu_supports_avx2() == 0) { - impl_name_re += "-avxonly"; - } - std::regex re(impl_name_re); - auto search_in_directory = [&](const std::string& paths) { - std::stringstream ss(paths); - std::string path; - // Split the paths string by the delimiter and process each path. - while (std::getline(ss, path, ';')) { - std::u8string u8_path(path.begin(), path.end()); - // Iterate over all libraries - for (const auto &f : fs::directory_iterator(u8_path)) { - const fs::path &p = f.path(); - - if (p.extension() != LIB_FILE_EXT) continue; - if (!std::regex_search(p.stem().string(), re)) { - std::cerr << "did not match regex: " << p.stem().string() << "\n"; - continue; - } - - // Add to list if model implementation - Dlhandle dl; - try { - dl = Dlhandle(p); - } catch (const Dlhandle::Exception &e) { - std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n"; - continue; - } - if (!isImplementation(dl)) { - std::cerr << "Not an implementation: " << p.filename().string() << "\n"; - continue; - } - fres.emplace_back(Implementation(std::move(dl))); - } - } - }; - - search_in_directory(s_implementations_search_path); - - return fres; - }()); - // Return static result - return *libs; -} - -static std::string applyCPUVariant(const std::string &buildVariant) -{ - if (buildVariant != "metal" && cpu_supports_avx2() == 0) { - return buildVariant + "-avxonly"; - } - return buildVariant; -} - -const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) -{ - bool buildVariantMatched = false; - std::optional archName; - for (const auto& i : implementationList()) { - if (buildVariant != i.m_buildVariant) continue; - buildVariantMatched = true; - - char *arch = i.m_getFileArch(fname); - if (!arch) continue; - archName = arch; - - bool archSupported = i.m_isArchSupported(arch); - free(arch); - if (archSupported) return &i; - } - - if (!buildVariantMatched) - return nullptr; - if (!archName) - throw UnsupportedModelError("Unsupported file format"); - - throw BadArchError(std::move(*archName)); -} - -LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx) -{ - std::vector desiredBackends; - if (backend != "auto") { - desiredBackends.push_back(backend); - } else { - desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); - } - - for (const auto &desiredBackend: desiredBackends) { - const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend)); - - if (impl) { - // Construct llmodel 
implementation - auto *fres = impl->m_construct(); - fres->m_implementation = impl; - -#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs - /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at - * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in - * most (all?) places where this is called, causing underestimation of required - * memory. */ - if (backend == "auto" && desiredBackend == "metal") { - // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not - size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100); - if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) { - delete fres; - continue; - } - } -#else - (void)n_ctx; -#endif - - return fres; - } - } - - throw MissingImplementationError("Could not find any implementations for backend: " + backend); -} - -LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional &backend) -{ - static std::unordered_map> implCache; - - const std::vector *impls; - try { - impls = &implementationList(); - } catch (const std::runtime_error &e) { - std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n"; - return nullptr; - } - - std::vector desiredBackends; - if (backend) { - desiredBackends.push_back(backend.value()); - } else { - desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); - } - - const Implementation *impl = nullptr; - - for (const auto &desiredBackend: desiredBackends) { - auto cacheIt = implCache.find(desiredBackend); - if (cacheIt != implCache.end()) - return cacheIt->second.get(); // cached - - for (const auto &i: *impls) { - if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) { - impl = &i; - break; - } - } - - if (impl) { - auto *fres = impl->m_construct(); - fres->m_implementation = impl; - implCache[desiredBackend] = std::unique_ptr(fres); - return fres; - } - } - - std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n"; - return nullptr; -} - -std::vector LLModel::Implementation::availableGPUDevices(size_t memoryRequired) -{ - std::vector devices; -#ifndef __APPLE__ - static const std::string backends[] = {"kompute", "cuda"}; - for (const auto &backend: backends) { - auto *llama = constructGlobalLlama(backend); - if (llama) { - auto backendDevs = llama->availableGPUDevices(memoryRequired); - devices.insert(devices.end(), backendDevs.begin(), backendDevs.end()); - } - } -#endif - return devices; -} - -int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) -{ - auto *llama = constructGlobalLlama(); - return llama ? llama->maxContextLength(modelPath) : -1; -} - -int32_t LLModel::Implementation::layerCount(const std::string &modelPath) -{ - auto *llama = constructGlobalLlama(); - return llama ? 
llama->layerCount(modelPath) : -1; -} - -bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) -{ - auto *llama = constructGlobalLlama(); - return llama && llama->isEmbeddingModel(modelPath); -} - -void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) -{ - s_implementations_search_path = path; -} - -const std::string& LLModel::Implementation::implementationsSearchPath() -{ - return s_implementations_search_path; -} - -bool LLModel::Implementation::hasSupportedCPU() -{ - return cpu_supports_avx() != 0; -} - -int LLModel::Implementation::cpuSupportsAVX2() -{ - return cpu_supports_avx2(); -} diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h index 04a510dc740f2..83c559ff779cd 100644 --- a/gpt4all-backend/llmodel.h +++ b/gpt4all-backend/llmodel.h @@ -1,5 +1,4 @@ -#ifndef LLMODEL_H -#define LLMODEL_H +#pragma once #include #include @@ -24,104 +23,6 @@ class LLModel { public: using Token = int32_t; - class BadArchError: public std::runtime_error { - public: - BadArchError(std::string arch) - : runtime_error("Unsupported model architecture: " + arch) - , m_arch(std::move(arch)) - {} - - const std::string &arch() const noexcept { return m_arch; } - - private: - std::string m_arch; - }; - - class MissingImplementationError: public std::runtime_error { - public: - using std::runtime_error::runtime_error; - }; - - class UnsupportedModelError: public std::runtime_error { - public: - using std::runtime_error::runtime_error; - }; - - struct GPUDevice { - const char *backend; - int index; - int type; - size_t heapSize; - std::string name; - std::string vendor; - - GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor): - backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)), - vendor(std::move(vendor)) {} - - std::string selectionName() const - { - assert(backend == "cuda"s || backend == "kompute"s); - return backendName() + ": " + name; - } - - std::string backendName() const { return backendIdToName(backend); } - - static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); } - - static std::string updateSelectionName(const std::string &name) { - if (name == "Auto" || name == "CPU" || name == "Metal") - return name; - auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) { - return name.starts_with(entry.second + ": "); - }); - if (it != s_backendNames.end()) - return name; - return "Vulkan: " + name; // previously, there were only Vulkan devices - } - - private: - static inline const std::unordered_map s_backendNames { - {"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"}, - }; - }; - - class Implementation { - public: - Implementation(const Implementation &) = delete; - Implementation(Implementation &&); - ~Implementation(); - - std::string_view modelType() const { return m_modelType; } - std::string_view buildVariant() const { return m_buildVariant; } - - static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048); - static std::vector availableGPUDevices(size_t memoryRequired = 0); - static int32_t maxContextLength(const std::string &modelPath); - static int32_t layerCount(const std::string &modelPath); - static bool isEmbeddingModel(const std::string &modelPath); - static void setImplementationsSearchPath(const std::string &path); - static const std::string &implementationsSearchPath(); - 
static bool hasSupportedCPU(); - // 0 for no, 1 for yes, -1 for non-x86_64 - static int cpuSupportsAVX2(); - - private: - Implementation(Dlhandle &&); - - static const std::vector &implementationList(); - static const Implementation *implementation(const char *fname, const std::string &buildVariant); - static LLModel *constructGlobalLlama(const std::optional &backend = std::nullopt); - - char *(*m_getFileArch)(const char *fname); - bool (*m_isArchSupported)(const char *arch); - LLModel *(*m_construct)(); - - std::string_view m_modelType; - std::string_view m_buildVariant; - Dlhandle *m_dlhandle; - }; - struct PromptContext { std::vector tokens; // current tokens in the context window int32_t n_past = 0; // number of tokens in past conversation @@ -137,18 +38,11 @@ class LLModel { float contextErase = 0.5f; // percent of context to erase if we exceed the context window }; - using ProgressCallback = std::function; - - explicit LLModel() {} virtual ~LLModel() {} - virtual bool supportsEmbedding() const = 0; - virtual bool supportsCompletion() const = 0; + virtual bool supportsCompletion() const { return true; } virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0; - virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; }; - virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; } virtual bool isModelLoaded() const = 0; - virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0; virtual size_t stateSize() const { return 0; } virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; } virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; } @@ -162,101 +56,25 @@ class LLModel { bool allowContextShift, PromptContext &ctx, bool special = false, - std::string *fakeReply = nullptr); + std::string *fakeReply = nullptr) = 0; +protected: + explicit LLModel() {} +}; + +class EmbLLModel: virtual public LLModel { +public: using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend); - virtual size_t embeddingSize() const { - throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings"); - } + virtual bool supportsCompletion() const = 0; + virtual bool supportsEmbedding() const = 0; + virtual size_t embeddingSize() const = 0; + // user-specified prefix virtual void embed(const std::vector &texts, float *embeddings, std::optional prefix, int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false, - EmbedCancelCallback *cancelCb = nullptr); + EmbedCancelCallback *cancelCb = nullptr) = 0; // automatic prefix virtual void embed(const std::vector &texts, float *embeddings, bool isRetrieval, - int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false); - - virtual void setThreadCount(int32_t n_threads) { (void)n_threads; } - virtual int32_t threadCount() const { return 1; } - - const Implementation &implementation() const { - return *m_implementation; - } - - virtual std::vector availableGPUDevices(size_t memoryRequired) const { - (void)memoryRequired; - return {}; - } - - virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const { - (void)memoryRequired; - (void)name; - return false; - } - - virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const { - (void)device; - if (unavail_reason) { - *unavail_reason = "model has no GPU support"; - } - return 
false; - } - - virtual bool usingGPUDevice() const { return false; } - virtual const char *backendName() const { return "cpu"; } - virtual const char *gpuDeviceName() const { return nullptr; } - - void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; } - -protected: - // These are pure virtual because subclasses need to implement as the default implementation of - // 'prompt' above calls these functions - virtual std::vector tokenize(PromptContext &ctx, const std::string &str, bool special = false) = 0; - virtual bool isSpecialToken(Token id) const = 0; - virtual std::string tokenToString(Token id) const = 0; - virtual Token sampleToken(PromptContext &ctx) const = 0; - virtual bool evalTokens(PromptContext &ctx, const std::vector &tokens) const = 0; - virtual void shiftContext(PromptContext &promptCtx) = 0; - virtual int32_t contextLength() const = 0; - virtual const std::vector &endTokens() const = 0; - virtual bool shouldAddBOS() const = 0; - - virtual int32_t maxContextLength(std::string const &modelPath) const - { - (void)modelPath; - return -1; - } - - virtual int32_t layerCount(std::string const &modelPath) const - { - (void)modelPath; - return -1; - } - - const Implementation *m_implementation = nullptr; - - ProgressCallback m_progressCallback; - static bool staticProgressCallback(float progress, void* ctx) - { - LLModel* model = static_cast(ctx); - if (model && model->m_progressCallback) - return model->m_progressCallback(progress); - return true; - } - - bool decodePrompt(std::function promptCallback, - std::function responseCallback, - bool allowContextShift, - PromptContext &promptCtx, - std::vector embd_inp); - void generateResponse(std::function responseCallback, - bool allowContextShift, - PromptContext &promptCtx); - - Token m_tokenize_last_token = -1; // not serialized - - friend class LLMImplementation; + int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) = 0; }; - -#endif // LLMODEL_H diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp index f3fd68ffa69c4..f4adf1c3fa876 100644 --- a/gpt4all-backend/llmodel_c.cpp +++ b/gpt4all-backend/llmodel_c.cpp @@ -1,5 +1,6 @@ #include "llmodel_c.h" +#include "llamacpp_backend.h" #include "llmodel.h" #include @@ -15,7 +16,7 @@ #include struct LLModelWrapper { - LLModel *llModel = nullptr; + LlamaCppBackend *llModel = nullptr; LLModel::PromptContext promptContext; ~LLModelWrapper() { delete llModel; } }; @@ -41,9 +42,9 @@ static void llmodel_set_error(const char **errptr, const char *message) llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error) { - LLModel *llModel; + LlamaCppBackend *llModel; try { - llModel = LLModel::Implementation::construct(model_path, backend); + llModel = LlamaCppBackend::Implementation::construct(model_path, backend); } catch (const std::exception& e) { llmodel_set_error(error, e.what()); return nullptr; @@ -214,12 +215,12 @@ int32_t llmodel_threadCount(llmodel_model model) void llmodel_set_implementation_search_path(const char *path) { - LLModel::Implementation::setImplementationsSearchPath(path); + LlamaCppBackend::Implementation::setImplementationsSearchPath(path); } const char *llmodel_get_implementation_search_path() { - return LLModel::Implementation::implementationsSearchPath().c_str(); + return LlamaCppBackend::Implementation::implementationsSearchPath().c_str(); } // RAII wrapper around a C-style struct @@ -244,7 +245,7 @@ struct llmodel_gpu_device 
*llmodel_available_gpu_devices(size_t memoryRequired, { static thread_local std::unique_ptr c_devices; - auto devices = LLModel::Implementation::availableGPUDevices(memoryRequired); + auto devices = LlamaCppBackend::Implementation::availableGPUDevices(memoryRequired); *num_devices = devices.size(); if (devices.empty()) { return nullptr; /* no devices */ } diff --git a/gpt4all-chat/chatapi.cpp b/gpt4all-chat/chatapi.cpp index b443f24c3ab7e..ada3332518d19 100644 --- a/gpt4all-chat/chatapi.cpp +++ b/gpt4all-chat/chatapi.cpp @@ -32,14 +32,6 @@ ChatAPI::ChatAPI() { } -size_t ChatAPI::requiredMem(const std::string &modelPath, int n_ctx, int ngl) -{ - Q_UNUSED(modelPath); - Q_UNUSED(n_ctx); - Q_UNUSED(ngl); - return 0; -} - bool ChatAPI::loadModel(const std::string &modelPath, int n_ctx, int ngl) { Q_UNUSED(modelPath); @@ -48,20 +40,7 @@ bool ChatAPI::loadModel(const std::string &modelPath, int n_ctx, int ngl) return true; } -void ChatAPI::setThreadCount(int32_t n_threads) -{ - Q_UNUSED(n_threads); - qt_noop(); -} - -int32_t ChatAPI::threadCount() const -{ - return 1; -} - -ChatAPI::~ChatAPI() -{ -} +ChatAPI::~ChatAPI() {} bool ChatAPI::isModelLoaded() const { diff --git a/gpt4all-chat/chatapi.h b/gpt4all-chat/chatapi.h index 59b68f5821081..0decb6421a6a7 100644 --- a/gpt4all-chat/chatapi.h +++ b/gpt4all-chat/chatapi.h @@ -57,11 +57,8 @@ class ChatAPI : public QObject, public LLModel { ChatAPI(); virtual ~ChatAPI(); - bool supportsEmbedding() const override { return false; } - bool supportsCompletion() const override { return true; } bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override; bool isModelLoaded() const override; - size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override; size_t stateSize() const override; size_t saveState(uint8_t *dest) const override; size_t restoreState(const uint8_t *src) override; @@ -74,9 +71,6 @@ class ChatAPI : public QObject, public LLModel { bool special, std::string *fakeReply) override; - void setThreadCount(int32_t n_threads) override; - int32_t threadCount() const override; - void setModelName(const QString &modelName) { m_modelName = modelName; } void setAPIKey(const QString &apiKey) { m_apiKey = apiKey; } void setRequestURL(const QString &requestURL) { m_requestURL = requestURL; } @@ -92,65 +86,6 @@ class ChatAPI : public QObject, public LLModel { LLModel::PromptContext *ctx, const QByteArray &array); -protected: - // We have to implement these as they are pure virtual in base class, but we don't actually use - // them as they are only called from the default implementation of 'prompt' which we override and - // completely replace - - std::vector tokenize(PromptContext &ctx, const std::string &str, bool special) override - { - (void)ctx; - (void)str; - (void)special; - throw std::logic_error("not implemented"); - } - - bool isSpecialToken(Token id) const override - { - (void)id; - throw std::logic_error("not implemented"); - } - - std::string tokenToString(Token id) const override - { - (void)id; - throw std::logic_error("not implemented"); - } - - Token sampleToken(PromptContext &ctx) const override - { - (void)ctx; - throw std::logic_error("not implemented"); - } - - bool evalTokens(PromptContext &ctx, const std::vector &tokens) const override - { - (void)ctx; - (void)tokens; - throw std::logic_error("not implemented"); - } - - void shiftContext(PromptContext &promptCtx) override - { - (void)promptCtx; - throw std::logic_error("not implemented"); - } - - int32_t contextLength() const override - { - throw 
std::logic_error("not implemented"); - } - - const std::vector &endTokens() const override - { - throw std::logic_error("not implemented"); - } - - bool shouldAddBOS() const override - { - throw std::logic_error("not implemented"); - } - private: std::function m_responseCallback; QString m_modelName; diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp index e9fb7f3132f95..b386d0ce684a3 100644 --- a/gpt4all-chat/chatllm.cpp +++ b/gpt4all-chat/chatllm.cpp @@ -412,19 +412,20 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro QString filePath = modelInfo.dirpath + modelInfo.filename(); - auto construct = [this, &filePath, &modelInfo, &modelLoadProps, n_ctx](std::string const &backend) { + auto construct = [this, &filePath, &modelInfo, &modelLoadProps, n_ctx](std::string const &backend) -> LlamaCppBackend * { + LlamaCppBackend *lcppmodel; QString constructError; m_llModelInfo.resetModel(this); try { - auto *model = LLModel::Implementation::construct(filePath.toStdString(), backend, n_ctx); - m_llModelInfo.resetModel(this, model); - } catch (const LLModel::MissingImplementationError &e) { + lcppmodel = LlamaCppBackend::Implementation::construct(filePath.toStdString(), backend, n_ctx); + m_llModelInfo.resetModel(this, lcppmodel); + } catch (const LlamaCppBackend::MissingImplementationError &e) { modelLoadProps.insert("error", "missing_model_impl"); constructError = e.what(); - } catch (const LLModel::UnsupportedModelError &e) { + } catch (const LlamaCppBackend::UnsupportedModelError &e) { modelLoadProps.insert("error", "unsupported_model_file"); constructError = e.what(); - } catch (const LLModel::BadArchError &e) { + } catch (const LlamaCppBackend::BadArchError &e) { constructError = e.what(); modelLoadProps.insert("error", "unsupported_model_arch"); modelLoadProps.insert("model_arch", QString::fromStdString(e.arch())); @@ -435,21 +436,22 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo)); resetModel(); emit modelLoadingError(u"Error loading %1: %2"_s.arg(modelInfo.filename(), constructError)); - return false; + return nullptr; } - m_llModelInfo.model->setProgressCallback([this](float progress) -> bool { + lcppmodel->setProgressCallback([this](float progress) -> bool { progress = std::max(progress, std::numeric_limits::min()); // keep progress above zero emit modelLoadingPercentageChanged(progress); return m_shouldBeLoaded; }); - return true; + return lcppmodel; }; - if (!construct(backend)) + auto *lcppmodel = construct(backend); + if (!lcppmodel) return true; - if (m_llModelInfo.model->isModelBlacklisted(filePath.toStdString())) { + if (lcppmodel->isModelBlacklisted(filePath.toStdString())) { static QSet warned; auto fname = modelInfo.filename(); if (!warned.contains(fname)) { @@ -460,16 +462,16 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro } } - auto approxDeviceMemGB = [](const LLModel::GPUDevice *dev) { + auto approxDeviceMemGB = [](const LlamaCppBackend::GPUDevice *dev) { float memGB = dev->heapSize / float(1024 * 1024 * 1024); return std::floor(memGB * 10.f) / 10.f; // truncate to 1 decimal place }; - std::vector availableDevices; - const LLModel::GPUDevice *defaultDevice = nullptr; + std::vector availableDevices; + const LlamaCppBackend::GPUDevice *defaultDevice = nullptr; { - const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx, ngl); - availableDevices = 
m_llModelInfo.model->availableGPUDevices(requiredMemory); + const size_t requiredMemory = lcppmodel->requiredMem(filePath.toStdString(), n_ctx, ngl); + availableDevices = lcppmodel->availableGPUDevices(requiredMemory); // Pick the best device // NB: relies on the fact that Kompute devices are listed first if (!availableDevices.empty() && availableDevices.front().type == 2 /*a discrete gpu*/) { @@ -485,14 +487,14 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro bool actualDeviceIsCPU = true; #if defined(Q_OS_MAC) && defined(__aarch64__) - if (m_llModelInfo.model->implementation().buildVariant() == "metal") + if (lcppmodel->implementation().buildVariant() == "metal") actualDeviceIsCPU = false; #else if (requestedDevice != "CPU") { const auto *device = defaultDevice; if (requestedDevice != "Auto") { // Use the selected device - for (const LLModel::GPUDevice &d : availableDevices) { + for (const auto &d : availableDevices) { if (QString::fromStdString(d.selectionName()) == requestedDevice) { device = &d; break; @@ -503,7 +505,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro std::string unavail_reason; if (!device) { // GPU not available - } else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) { + } else if (!lcppmodel->initializeGPUDevice(device->index, &unavail_reason)) { m_llModelInfo.fallbackReason = QString::fromStdString(unavail_reason); } else { actualDeviceIsCPU = false; @@ -512,7 +514,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro } #endif - bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl); + bool success = lcppmodel->loadModel(filePath.toStdString(), n_ctx, ngl); if (!m_shouldBeLoaded) { m_llModelInfo.resetModel(this); @@ -531,10 +533,13 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro modelLoadProps.insert("cpu_fallback_reason", "gpu_load_failed"); // For CUDA, make sure we don't use the GPU at all - ngl=0 still offloads matmuls - if (backend == "cuda" && !construct("auto")) - return true; + if (backend == "cuda") { + lcppmodel = construct("auto"); + if (!lcppmodel) + return true; + } - success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, 0); + success = lcppmodel->loadModel(filePath.toStdString(), n_ctx, 0); if (!m_shouldBeLoaded) { m_llModelInfo.resetModel(this); @@ -544,7 +549,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro emit modelLoadingPercentageChanged(0.0f); return false; } - } else if (!m_llModelInfo.model->usingGPUDevice()) { + } else if (!lcppmodel->usingGPUDevice()) { // ggml_vk_init was not called in llama.cpp // We might have had to fallback to CPU after load if the model is not possible to accelerate // for instance if the quantization method is not supported on Vulkan yet @@ -562,7 +567,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro return true; } - switch (m_llModelInfo.model->implementation().modelType()[0]) { + switch (lcppmodel->implementation().modelType()[0]) { case 'L': m_llModelType = LLModelType::LLAMA_; break; default: { @@ -774,11 +779,15 @@ bool ChatLLM::promptInternal(const QList &collectionList, const QString m_ctx.n_batch = n_batch; m_ctx.repeat_penalty = repeat_penalty; m_ctx.repeat_last_n = repeat_penalty_tokens; - m_llModelInfo.model->setThreadCount(n_threads); + + if (auto *lcppmodel = dynamic_cast(m_llModelInfo.model.get())) + 
lcppmodel->setThreadCount(n_threads); + #if defined(DEBUG) printf("%s", qPrintable(prompt)); fflush(stdout); #endif + QElapsedTimer totalTime; totalTime.start(); m_timer->start(); @@ -1238,11 +1247,15 @@ void ChatLLM::processSystemPrompt() m_ctx.n_batch = n_batch; m_ctx.repeat_penalty = repeat_penalty; m_ctx.repeat_last_n = repeat_penalty_tokens; - m_llModelInfo.model->setThreadCount(n_threads); + + if (auto *lcppmodel = dynamic_cast(m_llModelInfo.model.get())) + lcppmodel->setThreadCount(n_threads); + #if defined(DEBUG) printf("%s", qPrintable(QString::fromStdString(systemPrompt))); fflush(stdout); #endif + auto old_n_predict = std::exchange(m_ctx.n_predict, 0); // decode system prompt without a response // use "%1%2" and not "%1" to avoid implicit whitespace m_llModelInfo.model->prompt(systemPrompt, "%1%2", promptFunc, nullptr, /*allowContextShift*/ true, m_ctx, true); @@ -1288,7 +1301,9 @@ void ChatLLM::processRestoreStateFromText() m_ctx.n_batch = n_batch; m_ctx.repeat_penalty = repeat_penalty; m_ctx.repeat_last_n = repeat_penalty_tokens; - m_llModelInfo.model->setThreadCount(n_threads); + + if (auto *lcppmodel = dynamic_cast(m_llModelInfo.model.get())) + lcppmodel->setThreadCount(n_threads); auto it = m_stateFromText.begin(); while (it < m_stateFromText.end()) { diff --git a/gpt4all-chat/chatllm.h b/gpt4all-chat/chatllm.h index d123358ad58e0..68f4e0f1d95d6 100644 --- a/gpt4all-chat/chatllm.h +++ b/gpt4all-chat/chatllm.h @@ -4,6 +4,7 @@ #include "database.h" // IWYU pragma: keep #include "modellist.h" +#include "../gpt4all-backend/llamacpp_backend.h" #include "../gpt4all-backend/llmodel.h" #include @@ -128,15 +129,17 @@ class ChatLLM : public QObject QString deviceBackend() const { - if (!isModelLoaded()) return QString(); - std::string name = LLModel::GPUDevice::backendIdToName(m_llModelInfo.model->backendName()); + auto *lcppmodel = dynamic_cast(m_llModelInfo.model.get()); + if (!isModelLoaded() && !lcppmodel) return QString(); + std::string name = LlamaCppBackend::GPUDevice::backendIdToName(lcppmodel->backendName()); return QString::fromStdString(name); } QString device() const { - if (!isModelLoaded()) return QString(); - const char *name = m_llModelInfo.model->gpuDeviceName(); + auto *lcppmodel = dynamic_cast(m_llModelInfo.model.get()); + if (!isModelLoaded() || !lcppmodel) return QString(); + const char *name = lcppmodel->gpuDeviceName(); return name ? 
 return name ? QString(name) : u"CPU"_s;
 }
diff --git a/gpt4all-chat/embllm.cpp b/gpt4all-chat/embllm.cpp
index 615a6ce4d9252..af56d0bc19293 100644
--- a/gpt4all-chat/embllm.cpp
+++ b/gpt4all-chat/embllm.cpp
@@ -3,7 +3,7 @@
 #include "modellist.h"
 #include "mysettings.h"
-#include "../gpt4all-backend/llmodel.h"
+#include "../gpt4all-backend/llamacpp_backend.h"
 #include
 #include
@@ -99,7 +99,7 @@ bool EmbeddingLLMWorker::loadModel()
 #endif
 try {
- m_model = LLModel::Implementation::construct(filePath.toStdString(), backend, n_ctx);
+ m_model = LlamaCppBackend::Implementation::construct(filePath.toStdString(), backend, n_ctx);
 } catch (const std::exception &e) {
 qWarning() << "embllm WARNING: Could not load embedding model:" << e.what();
 return false;
@@ -112,11 +112,11 @@ bool EmbeddingLLMWorker::loadModel()
 actualDeviceIsCPU = false;
 #else
 if (requestedDevice != "CPU") {
- const LLModel::GPUDevice *device = nullptr;
- std::vector<LLModel::GPUDevice> availableDevices = m_model->availableGPUDevices(0);
+ const LlamaCppBackend::GPUDevice *device = nullptr;
+ auto availableDevices = m_model->availableGPUDevices(0);
 if (requestedDevice != "Auto") {
 // Use the selected device
- for (const LLModel::GPUDevice &d : availableDevices) {
+ for (const auto &d : availableDevices) {
 if (QString::fromStdString(d.selectionName()) == requestedDevice) {
 device = &d;
 break;
@@ -145,7 +145,7 @@ bool EmbeddingLLMWorker::loadModel()
 if (backend == "cuda") {
 // For CUDA, make sure we don't use the GPU at all - ngl=0 still offloads matmuls
 try {
- m_model = LLModel::Implementation::construct(filePath.toStdString(), "auto", n_ctx);
+ m_model = LlamaCppBackend::Implementation::construct(filePath.toStdString(), "auto", n_ctx);
 } catch (const std::exception &e) {
 qWarning() << "embllm WARNING: Could not load embedding model:" << e.what();
 return false;
diff --git a/gpt4all-chat/embllm.h b/gpt4all-chat/embllm.h
index 91376650d05e3..fda773e545372 100644
--- a/gpt4all-chat/embllm.h
+++ b/gpt4all-chat/embllm.h
@@ -13,7 +13,7 @@
 #include
 #include
-class LLModel;
+class LlamaCppBackend;
 class QNetworkAccessManager;
 struct EmbeddingChunk {
@@ -67,7 +67,7 @@ private Q_SLOTS:
 QString m_nomicAPIKey;
 QNetworkAccessManager *m_networkManager;
 std::vector<float> m_lastResponse;
- LLModel *m_model = nullptr;
+ LlamaCppBackend *m_model = nullptr;
 std::atomic<bool> m_stopGenerating;
 QThread m_workerThread;
 QMutex m_mutex; // guards m_model and m_nomicAPIKey
diff --git a/gpt4all-chat/llm.cpp b/gpt4all-chat/llm.cpp
index 13820030393e7..c501953c8d053 100644
--- a/gpt4all-chat/llm.cpp
+++ b/gpt4all-chat/llm.cpp
@@ -1,6 +1,6 @@
 #include "llm.h"
-#include "../gpt4all-backend/llmodel.h"
+#include "../gpt4all-backend/llamacpp_backend.h"
 #include "../gpt4all-backend/sysinfo.h"
 #include
@@ -30,7 +30,7 @@ LLM *LLM::globalInstance()
 LLM::LLM()
 : QObject{nullptr}
- , m_compatHardware(LLModel::Implementation::hasSupportedCPU())
+ , m_compatHardware(LlamaCppBackend::Implementation::hasSupportedCPU())
 {
 QNetworkInformation::loadDefaultBackend();
 auto * netinfo = QNetworkInformation::instance();
diff --git a/gpt4all-chat/main.cpp b/gpt4all-chat/main.cpp
index 4546a95bcf327..f4a3df2075067 100644
--- a/gpt4all-chat/main.cpp
+++ b/gpt4all-chat/main.cpp
@@ -8,7 +8,7 @@
 #include "mysettings.h"
 #include "network.h"
-#include "../gpt4all-backend/llmodel.h"
+#include "../gpt4all-backend/llamacpp_backend.h"
 #include
 #include
@@ -46,7 +46,7 @@ int main(int argc, char *argv[])
 if (LLM::directoryExists(frameworksDir))
 llmodelSearchPaths += ";" + frameworksDir;
 #endif
- LLModel::Implementation::setImplementationsSearchPath(llmodelSearchPaths.toStdString());
+ LlamaCppBackend::Implementation::setImplementationsSearchPath(llmodelSearchPaths.toStdString());
 // Set the local and language translation before the qml engine has even been started. This will
 // use the default system locale unless the user has explicitly set it to use a different one.
diff --git a/gpt4all-chat/modellist.cpp b/gpt4all-chat/modellist.cpp
index 580b615ff4e66..4d5a4147e1258 100644
--- a/gpt4all-chat/modellist.cpp
+++ b/gpt4all-chat/modellist.cpp
@@ -4,7 +4,7 @@
 #include "mysettings.h"
 #include "network.h"
-#include "../gpt4all-backend/llmodel.h"
+#include "../gpt4all-backend/llamacpp_backend.h"
 #include
 #include
@@ -258,7 +258,7 @@ int ModelInfo::maxContextLength() const
 if (!installed || isOnline) return -1;
 if (m_maxContextLength != -1) return m_maxContextLength;
 auto path = (dirpath + filename()).toStdString();
- int n_ctx = LLModel::Implementation::maxContextLength(path);
+ int n_ctx = LlamaCppBackend::Implementation::maxContextLength(path);
 if (n_ctx < 0) {
 n_ctx = 4096; // fallback value
 }
@@ -282,7 +282,7 @@ int ModelInfo::maxGpuLayers() const
 if (!installed || isOnline) return -1;
 if (m_maxGpuLayers != -1) return m_maxGpuLayers;
 auto path = (dirpath + filename()).toStdString();
- int layers = LLModel::Implementation::layerCount(path);
+ int layers = LlamaCppBackend::Implementation::layerCount(path);
 if (layers < 0) {
 layers = 100; // fallback value
 }
@@ -997,7 +997,7 @@ void ModelList::updateData(const QString &id, const QVector
 && (info->isDiscovered() || info->description().isEmpty())) {
 // read GGUF and decide based on model architecture
- info->isEmbeddingModel = LLModel::Implementation::isEmbeddingModel(modelPath.toStdString());
+ info->isEmbeddingModel = LlamaCppBackend::Implementation::isEmbeddingModel(modelPath.toStdString());
 info->checkedEmbeddingModel = true;
 }
diff --git a/gpt4all-chat/mysettings.cpp b/gpt4all-chat/mysettings.cpp
index b29ec431f302b..d57b5926c4b5b 100644
--- a/gpt4all-chat/mysettings.cpp
+++ b/gpt4all-chat/mysettings.cpp
@@ -1,6 +1,6 @@
 #include "mysettings.h"
-#include "../gpt4all-backend/llmodel.h"
+#include "../gpt4all-backend/llamacpp_backend.h"
 #include
 #include
@@ -95,8 +95,8 @@ static QStringList getDevices(bool skipKompute = false)
 #if defined(Q_OS_MAC) && defined(__aarch64__)
 deviceList << "Metal";
 #else
- std::vector<LLModel::GPUDevice> devices = LLModel::Implementation::availableGPUDevices();
- for (LLModel::GPUDevice &d : devices) {
+ auto devices = LlamaCppBackend::Implementation::availableGPUDevices();
+ for (auto &d : devices) {
 if (!skipKompute || strcmp(d.backend, "kompute"))
 deviceList << QString::fromStdString(d.selectionName());
 }
@@ -512,7 +512,7 @@ QString MySettings::device()
 auto device = value.toString();
 if (!device.isEmpty()) {
 auto deviceStr = device.toStdString();
- auto newNameStr = LLModel::GPUDevice::updateSelectionName(deviceStr);
+ auto newNameStr = LlamaCppBackend::GPUDevice::updateSelectionName(deviceStr);
 if (newNameStr != deviceStr) {
 auto newName = QString::fromStdString(newNameStr);
 qWarning() << "updating device name:" << device << "->" << newName;
diff --git a/gpt4all-chat/network.cpp b/gpt4all-chat/network.cpp
index e7ee616cd2cad..47595bcad1bf5 100644
--- a/gpt4all-chat/network.cpp
+++ b/gpt4all-chat/network.cpp
@@ -9,7 +9,7 @@
 #include "modellist.h"
 #include "mysettings.h"
-#include "../gpt4all-backend/llmodel.h"
+#include "../gpt4all-backend/llamacpp_backend.h"
 #include
 #include
@@ -290,7 +290,7 @@ void Network::sendStartup()
 {"display", u"%1x%2"_s.arg(display->size().width()).arg(display->size().height())},
 {"ram", LLM::globalInstance()->systemTotalRAMInGB()},
 {"cpu", getCPUModel()},
- {"cpu_supports_avx2", LLModel::Implementation::cpuSupportsAVX2()},
+ {"cpu_supports_avx2", LlamaCppBackend::Implementation::cpuSupportsAVX2()},
 {"datalake_active", mySettings->networkIsActive()},
 });
 sendIpify();