diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 0ec9bcf589f7..fcab568fb442 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -1828,6 +1828,98 @@ jobs: dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" ubuntu-version: '2404' + # llama-cpp-tq (TurboQuant fork) + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-llama-cpp-tq' + runs-on: 'bigger-runner' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "llama-cpp-tq" + dockerfile: "./backend/Dockerfile.llama-cpp" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "8" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp-tq' + runs-on: 'bigger-runner' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "llama-cpp-tq" + dockerfile: "./backend/Dockerfile.llama-cpp" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-llama-cpp-tq' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "llama-cpp-tq" + dockerfile: "./backend/Dockerfile.llama-cpp" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-llama-cpp-tq' + base-image: "ubuntu:24.04" + runs-on: 'ubuntu-24.04-arm' + ubuntu-version: '2404' + backend: "llama-cpp-tq" + dockerfile: "./backend/Dockerfile.llama-cpp" + context: "./" + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-llama-cpp-tq' + runs-on: 'ubuntu-latest' + base-image: "rocm/dev-ubuntu-24.04:6.4.4" + skip-drivers: 'false' + backend: "llama-cpp-tq" + dockerfile: "./backend/Dockerfile.llama-cpp" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-arm64-llama-cpp-tq' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + runs-on: 'ubuntu-24.04-arm' + backend: "llama-cpp-tq" + dockerfile: "./backend/Dockerfile.llama-cpp" + context: "./" + ubuntu-version: '2204' + - build-type: 'vulkan' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-gpu-vulkan-llama-cpp-tq' + runs-on: 'bigger-runner' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "llama-cpp-tq" + dockerfile: "./backend/Dockerfile.llama-cpp" + context: "./" + ubuntu-version: '2404' # Stablediffusion-ggml - build-type: '' cuda-major-version: "" diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index 49e489beb00b..bd36e6646d7f 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -14,6 +14,11 @@ jobs: variable: "LLAMA_VERSION" branch: "master" file: "backend/cpp/llama-cpp/Makefile" + - repository: "TheTom/llama-cpp-turboquant" + variable: "LLAMA_VERSION" + branch: "master" + file: "backend/cpp/llama-cpp-tq/Makefile" + branch_suffix: "-tq" - repository: "ggml-org/whisper.cpp" variable: "WHISPER_CPP_VERSION" branch: "master" @@ -60,7 +65,7 @@ jobs: push-to-fork: ci-forks/LocalAI commit-message: ':arrow_up: Update ${{ matrix.repository }}' title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`' - branch: "update/${{ matrix.variable }}" + branch: "update/${{ matrix.variable }}${{ matrix.branch_suffix }}" body: ${{ steps.bump.outputs.message }} signoff: true diff --git a/.gitignore b/.gitignore index 25252eada349..11aca693bd32 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ prepare-sources /backend/cpp/llama-cpp/llama.cpp /backend/cpp/llama-* !backend/cpp/llama-cpp +!backend/cpp/llama-cpp-tq /backends /backend-images /result.yaml diff --git a/Makefile b/Makefile index 61bceb7e36fe..f3e6573a48e5 100644 --- a/Makefile +++ b/Makefile @@ -544,8 +544,9 @@ backend-images: mkdir -p backend-images # Backend metadata: BACKEND_NAME | DOCKERFILE_TYPE | BUILD_CONTEXT | PROGRESS_FLAG | NEEDS_BACKEND_ARG -# llama-cpp is special - uses llama-cpp Dockerfile and doesn't need BACKEND arg +# llama-cpp and forks - use llama-cpp Dockerfile BACKEND_LLAMA_CPP = llama-cpp|llama-cpp|.|false|false +BACKEND_LLAMA_CPP_TQ = llama-cpp-tq|llama-cpp|.|false|true # Golang backends BACKEND_PIPER = piper|golang|.|false|true @@ -609,6 +610,7 @@ endef # Generate all docker-build targets $(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP))) +$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_TQ))) $(eval $(call generate-docker-build-target,$(BACKEND_PIPER))) $(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE))) $(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE))) diff --git a/backend/Dockerfile.llama-cpp b/backend/Dockerfile.llama-cpp index 3930d04d4aba..110b9bf04a5f 100644 --- a/backend/Dockerfile.llama-cpp +++ b/backend/Dockerfile.llama-cpp @@ -58,7 +58,9 @@ ARG CUDA_DOCKER_ARCH ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} ARG CMAKE_ARGS ENV CMAKE_ARGS=${CMAKE_ARGS} -ARG BACKEND=rerankers +ARG BACKEND=llama-cpp +ARG LLAMA_BACKEND_DIR=${BACKEND} +ENV LLAMA_BACKEND_DIR=${LLAMA_BACKEND_DIR} ARG BUILD_TYPE ENV BUILD_TYPE=${BUILD_TYPE} ARG CUDA_MAJOR_VERSION @@ -255,32 +257,27 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}" export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}" echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}" - rm -rf /LocalAI/backend/cpp/llama-cpp-*-build + rm -rf /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}-*-build fi +cd /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR} + if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then - cd /LocalAI/backend/cpp/llama-cpp - make llama-cpp-fallback - make llama-cpp-grpc - make llama-cpp-rpc-server + make ARCH=aarch64 build-variants else - cd /LocalAI/backend/cpp/llama-cpp - make llama-cpp-avx - make llama-cpp-avx2 - make llama-cpp-avx512 - make llama-cpp-fallback - make llama-cpp-grpc - make llama-cpp-rpc-server + make build-variants fi EOT # Copy libraries using a script to handle architecture differences -RUN make -BC /LocalAI/backend/cpp/llama-cpp package +RUN make -BC /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR} package FROM scratch +ARG BACKEND=llama-cpp +ARG LLAMA_BACKEND_DIR=${BACKEND} # Copy all available binaries (the build process only creates the appropriate ones for the target architecture) -COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./ +COPY --from=builder /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}/package/. ./ diff --git a/backend/cpp/llama-cpp-tq/Makefile b/backend/cpp/llama-cpp-tq/Makefile new file mode 100644 index 000000000000..e2cb41471254 --- /dev/null +++ b/backend/cpp/llama-cpp-tq/Makefile @@ -0,0 +1,6 @@ +LLAMA_VERSION?=master +LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant +BACKEND_NAME?=llama-cpp-tq +SHARED_DIR?=$(CURDIR)/../llama-cpp + +include ../llama-cpp/Makefile diff --git a/backend/cpp/llama-cpp/CMakeLists.txt b/backend/cpp/llama-cpp/CMakeLists.txt index 598461975532..d4965a2feb97 100644 --- a/backend/cpp/llama-cpp/CMakeLists.txt +++ b/backend/cpp/llama-cpp/CMakeLists.txt @@ -59,6 +59,11 @@ add_library(hw_grpc_proto add_executable(${TARGET} grpc-server.cpp json.hpp httplib.h) +# Enable autoparser support if the header exists (not present in all llama.cpp forks) +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/chat-auto-parser.h") + target_compile_definitions(${TARGET} PRIVATE HAS_AUTOPARSER) +endif() + target_include_directories(${TARGET} PRIVATE ../llava) target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile index 9cbf69125e2d..67ad03aff1bc 100644 --- a/backend/cpp/llama-cpp/Makefile +++ b/backend/cpp/llama-cpp/Makefile @@ -1,6 +1,10 @@ LLAMA_VERSION?=a1cfb645307edc61a89e41557f290f441043d3c2 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp +BACKEND_NAME?=llama-cpp +SHARED_DIR?=$(CURDIR) +GRPC_SERVER_DIR?=tools/grpc-server +SERVER_SOURCE_DIR?=tools/server CMAKE_ARGS?= BUILD_TYPE?= @@ -67,6 +71,17 @@ ifeq ($(BUILD_TYPE),sycl_f32) -DCMAKE_CXX_FLAGS="-fsycl" endif +# Variants to build for each architecture (can be overridden by forks) +X86_64_VARIANTS ?= llama-cpp-avx llama-cpp-avx2 llama-cpp-avx512 llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server +ARM64_VARIANTS ?= llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server + +build-variants: +ifeq ($(ARCH),aarch64) + @for v in $(ARM64_VARIANTS); do $(MAKE) $$v || exit 1; done +else + @for v in $(X86_64_VARIANTS); do $(MAKE) $$v || exit 1; done +endif + INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \ @@ -90,42 +105,42 @@ else endif llama-cpp-avx2: llama.cpp - cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build - $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge + cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build purge $(info ${GREEN}I llama-cpp build info:avx2${RESET}) - CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server - cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2 + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx2-build" build-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build/grpc-server llama-cpp-avx2 llama-cpp-avx512: llama.cpp - cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build - $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge + cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build purge $(info ${GREEN}I llama-cpp build info:avx512${RESET}) - CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server - cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512 + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx512-build" build-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build/grpc-server llama-cpp-avx512 llama-cpp-avx: llama.cpp - cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build - $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge + cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build purge $(info ${GREEN}I llama-cpp build info:avx${RESET}) - CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server - cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-avx-build" build-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build/grpc-server llama-cpp-avx llama-cpp-fallback: llama.cpp - cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build - $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge + cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build purge $(info ${GREEN}I llama-cpp build info:fallback${RESET}) - CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server - cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-fallback-build" build-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build/grpc-server llama-cpp-fallback llama-cpp-grpc: llama.cpp - cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build - $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge + cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build + $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build purge $(info ${GREEN}I llama-cpp build info:grpc${RESET}) - CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server - cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc + CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="$(BACKEND_NAME)-grpc-build" build-llama-cpp-grpc-server + cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/grpc-server llama-cpp-grpc llama-cpp-rpc-server: llama-cpp-grpc - cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server + cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server llama.cpp: mkdir -p llama.cpp @@ -133,30 +148,30 @@ llama.cpp: git init && \ git remote add origin $(LLAMA_REPO) && \ git fetch origin && \ - git checkout -b build $(LLAMA_VERSION) && \ + (git checkout -b build $(LLAMA_VERSION) || git checkout -b build origin/$(LLAMA_VERSION)) && \ git submodule update --init --recursive --depth 1 --single-branch -llama.cpp/tools/grpc-server: llama.cpp - mkdir -p llama.cpp/tools/grpc-server - bash prepare.sh +llama.cpp/$(GRPC_SERVER_DIR): llama.cpp + mkdir -p llama.cpp/$(GRPC_SERVER_DIR) + SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh rebuild: - bash prepare.sh + SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh rm -rf grpc-server $(MAKE) grpc-server package: - bash package.sh + bash $(SHARED_DIR)/package.sh purge: rm -rf llama.cpp/build - rm -rf llama.cpp/tools/grpc-server + rm -rf llama.cpp/$(GRPC_SERVER_DIR) rm -rf grpc-server clean: purge rm -rf llama.cpp -grpc-server: llama.cpp llama.cpp/tools/grpc-server +grpc-server: llama.cpp llama.cpp/$(GRPC_SERVER_DIR) @echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)" ifneq (,$(findstring sycl,$(BUILD_TYPE))) +bash -c "source $(ONEAPI_VARS); \ diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index d9d5a5ca4a82..8f4383d1e97d 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -17,7 +17,9 @@ #include "backend.pb.h" #include "backend.grpc.pb.h" #include "common.h" +#ifdef HAS_AUTOPARSER #include "chat-auto-parser.h" +#endif #include #include #include @@ -2665,6 +2667,7 @@ class BackendServiceImpl final : public backend::Backend::Service { response->set_rendered_template(rendered_template); +#ifdef HAS_AUTOPARSER // Run differential template analysis to detect tool format markers if (params_base.use_jinja) { try { @@ -2770,6 +2773,7 @@ class BackendServiceImpl final : public backend::Backend::Service { SRV_WRN("ModelMetadata: failed to run autoparser analysis: %s\n", e.what()); } } +#endif return grpc::Status::OK; } diff --git a/backend/cpp/llama-cpp/package.sh b/backend/cpp/llama-cpp/package.sh index d1897e6bed5a..efa760255e30 100755 --- a/backend/cpp/llama-cpp/package.sh +++ b/backend/cpp/llama-cpp/package.sh @@ -5,14 +5,21 @@ set -e -CURDIR=$(dirname "$(realpath $0)") -REPO_ROOT="${CURDIR}/../../.." +# Use working directory (not script location) so forks that share this script work correctly +CURDIR=$(pwd) +SCRIPT_DIR=$(dirname "$(realpath $0)") +REPO_ROOT="${SCRIPT_DIR}/../../.." # Create lib directory mkdir -p $CURDIR/package/lib cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/ -cp -rfv $CURDIR/run.sh $CURDIR/package/ +# Copy run.sh — prefer local copy, fall back to shared dir (script location) +if [ -f "$CURDIR/run.sh" ]; then + cp -rfv $CURDIR/run.sh $CURDIR/package/ +else + cp -rfv $SCRIPT_DIR/run.sh $CURDIR/package/ +fi # Detect architecture and copy appropriate libraries if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then diff --git a/backend/cpp/llama-cpp/prepare.sh b/backend/cpp/llama-cpp/prepare.sh index f9b7e3dd2651..d3652fe8f535 100644 --- a/backend/cpp/llama-cpp/prepare.sh +++ b/backend/cpp/llama-cpp/prepare.sh @@ -1,31 +1,43 @@ #!/bin/bash -## Patches +SHARED_DIR="${SHARED_DIR:-.}" +SERVER_SOURCE_DIR="${SERVER_SOURCE_DIR:-tools/server}" +GRPC_SERVER_DIR="${GRPC_SERVER_DIR:-tools/grpc-server}" ## Apply patches from the `patches` directory if [ -d "patches" ]; then for patch in $(ls patches); do echo "Applying patch $patch" patch -d llama.cpp/ -p1 < patches/$patch - done + done fi set -e -for file in $(ls llama.cpp/tools/server/); do - cp -rfv llama.cpp/tools/server/$file llama.cpp/tools/grpc-server/ +# Copy server source files into grpc-server build directory +for file in $(ls llama.cpp/${SERVER_SOURCE_DIR}/); do + cp -rfv llama.cpp/${SERVER_SOURCE_DIR}/$file llama.cpp/${GRPC_SERVER_DIR}/ done -cp -r CMakeLists.txt llama.cpp/tools/grpc-server/ -cp -r grpc-server.cpp llama.cpp/tools/grpc-server/ -cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/ -cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/ +# Copy build files — prefer local overrides, fall back to SHARED_DIR +for f in CMakeLists.txt grpc-server.cpp; do + if [ -f "$f" ]; then + cp -r "$f" llama.cpp/${GRPC_SERVER_DIR}/ + else + cp -r "$SHARED_DIR/$f" llama.cpp/${GRPC_SERVER_DIR}/ + fi +done + +cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/${GRPC_SERVER_DIR}/ +cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/${GRPC_SERVER_DIR}/ + +# Add grpc-server subdirectory to the parent CMakeLists.txt +PARENT_CMAKELISTS="llama.cpp/$(dirname ${GRPC_SERVER_DIR})/CMakeLists.txt" set +e -if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then +if grep -q "grpc-server" "$PARENT_CMAKELISTS"; then echo "grpc-server already added" else - echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt + echo "add_subdirectory(grpc-server)" >> "$PARENT_CMAKELISTS" fi set -e - diff --git a/backend/index.yaml b/backend/index.yaml index d94cb70be2f9..1335ef0fd369 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -29,6 +29,34 @@ nvidia-cuda-12: "cuda12-llama-cpp" nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp" +- &llamacpp_tq + name: "llama-cpp-tq" + alias: "llama-cpp-tq" + license: mit + description: | + TurboQuant llama.cpp fork - quantization research + urls: + - https://github.com/TheTom/llama-cpp-turboquant + tags: + - text-to-text + - LLM + - CPU + - GPU + - Metal + - CUDA + - HIP + capabilities: + default: "cpu-llama-cpp-tq" + nvidia: "cuda12-llama-cpp-tq" + intel: "intel-sycl-f16-llama-cpp-tq" + amd: "rocm-llama-cpp-tq" + metal: "metal-llama-cpp-tq" + vulkan: "vulkan-llama-cpp-tq" + nvidia-l4t: "nvidia-l4t-arm64-llama-cpp-tq" + nvidia-cuda-13: "cuda13-llama-cpp-tq" + nvidia-cuda-12: "cuda12-llama-cpp-tq" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-llama-cpp-tq" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-llama-cpp-tq" - &whispercpp name: "whisper" alias: "whisper" @@ -1252,6 +1280,57 @@ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-llama-cpp" mirrors: - localai/localai-backends:master-gpu-nvidia-cuda-13-llama-cpp +# llama-cpp-tq (TurboQuant) concrete backends +- !!merge <<: *llamacpp_tq + name: "cpu-llama-cpp-tq" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp-tq" + mirrors: + - localai/localai-backends:latest-cpu-llama-cpp-tq +- !!merge <<: *llamacpp_tq + name: "cuda12-llama-cpp-tq" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-llama-cpp-tq +- !!merge <<: *llamacpp_tq + name: "cuda13-llama-cpp-tq" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-llama-cpp-tq +- !!merge <<: *llamacpp_tq + name: "rocm-llama-cpp-tq" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq" + mirrors: + - localai/localai-backends:latest-gpu-rocm-hipblas-llama-cpp-tq +- !!merge <<: *llamacpp_tq + name: "intel-sycl-f16-llama-cpp-tq" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq" + mirrors: + - localai/localai-backends:latest-gpu-intel-sycl-f16-llama-cpp-tq +- !!merge <<: *llamacpp_tq + name: "intel-sycl-f32-llama-cpp-tq" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq" + mirrors: + - localai/localai-backends:latest-gpu-intel-sycl-f32-llama-cpp-tq +- !!merge <<: *llamacpp_tq + name: "vulkan-llama-cpp-tq" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-llama-cpp-tq" + mirrors: + - localai/localai-backends:latest-gpu-vulkan-llama-cpp-tq +- !!merge <<: *llamacpp_tq + name: "metal-llama-cpp-tq" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp-tq" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-llama-cpp-tq +- !!merge <<: *llamacpp_tq + name: "nvidia-l4t-arm64-llama-cpp-tq" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-arm64-llama-cpp-tq +- !!merge <<: *llamacpp_tq + name: "cuda13-nvidia-l4t-arm64-llama-cpp-tq" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-llama-cpp-tq ## whisper - !!merge <<: *whispercpp name: "nvidia-l4t-arm64-whisper"