diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index 953913293953..318d99832d65 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4490,6 +4490,36 @@ include:
     dockerfile: "./backend/Dockerfile.golang"
     context: "./"
     ubuntu-version: '2404'
+  # supertonic CPU (amd64)
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-supertonic'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "supertonic"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  # supertonic CPU (arm64)
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-supertonic'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "supertonic"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
 
 # Darwin matrix (consumed by backend-jobs-darwin).
 includeDarwin:
diff --git a/.github/workflows/secscan.yaml b/.github/workflows/secscan.yaml
index bb381567baa5..a8bac30dd1f7 100644
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -21,7 +21,10 @@ jobs:
         uses: securego/gosec@v2.27.1
         with:
           # we let the report trigger content trigger a failure using the GitHub Security features.
-          args: '-no-fail -fmt sarif -out results.sarif ./...'
+          # backend/go/supertonic is excluded: its helper.go is vendored from supertone-inc/supertonic
+          # and its findings (G304 model-file loads, G404 math/rand for flow-matching noise, G104 unhandled
+          # errors) are upstream's, not ours to rewrite. NOTE: -exclude-dir also skips our own files there.
+          args: '-no-fail -exclude-dir=backend/go/supertonic -fmt sarif -out results.sarif ./...'
       - name: Upload SARIF file
         if: ${{ github.actor != 'dependabot[bot]' }}
         uses: github/codeql-action/upload-sarif@v4
diff --git a/.golangci.yml b/.golangci.yml
index dceb32374fa8..d25d1ccb4789 100644
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -74,6 +74,8 @@ linters:
     paths:
       # Upstream whisper.cpp source tree fetched by the whisper backend Makefile.
       - 'backend/go/whisper/sources'
+      # Vendored upstream supertonic pipeline (supertone-inc/supertonic go/helper.go).
+      - 'backend/go/supertonic/helper.go'
       - 'docs/'
     rules:
       # CLI entry points: kong's `env:"..."` tag is the legitimate env→struct
diff --git a/Makefile b/Makefile
index ecca9d3c7b9e..5db33f1ac4f2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Disable parallel execution for backend builds
-.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
+.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio backends/supertonic
 
 GOCMD=go
 GOTEST=$(GOCMD) test
@@ -595,6 +595,7 @@ test-extra: prepare-test-extra
 	$(MAKE) -C backend/rust/kokoros test
 	$(MAKE) -C backend/go/rfdetr-cpp test
 	$(MAKE) -C backend/go/locate-anything-cpp test
+	$(MAKE) -C backend/go/supertonic test
 
 ##
 ## End-to-end gRPC tests that exercise a built backend container image.
@@ -1181,6 +1182,7 @@ BACKEND_VIBEVOICE_CPP = vibevoice-cpp|golang|.|false|true
 BACKEND_LOCALVQE = localvqe|golang|.|false|true
 BACKEND_OPUS = opus|golang|.|false|true
 BACKEND_SHERPA_ONNX = sherpa-onnx|golang|.|false|true
+BACKEND_SUPERTONIC = supertonic|golang|.|false|true
 
 # Python backends with root context
 BACKEND_RERANKERS = rerankers|python|.|false|true
@@ -1308,12 +1310,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_KOKOROS)))
 $(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP)))
 $(eval $(call generate-docker-build-target,$(BACKEND_RFDETR_CPP)))
 $(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX)))
+$(eval $(call generate-docker-build-target,$(BACKEND_SUPERTONIC)))
 
 # Pattern rule for docker-save targets
 docker-save-%: backend-images
 	docker save local-ai-backend:$* -o backend-images/$*.tar
 
-docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-omnivoice-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy
+docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-omnivoice-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy docker-build-supertonic
 
 ########################################################
 ### Mock Backend for E2E Tests
diff --git a/backend/go/supertonic/.gitignore b/backend/go/supertonic/.gitignore
new file mode 100644
index 000000000000..10f6d5c1f85e
--- /dev/null
+++ b/backend/go/supertonic/.gitignore
@@ -0,0 +1,4 @@
+/supertonic
+/sources/
+/backend-assets/
+/package/
diff --git a/backend/go/supertonic/Makefile b/backend/go/supertonic/Makefile
new file mode 100644
index 000000000000..ab4991442bf2
--- /dev/null
+++ b/backend/go/supertonic/Makefile
@@ -0,0 +1,62 @@
+CURRENT_DIR=$(abspath ./)
+GOCMD=go
+
+ONNX_VERSION?=1.24.4
+ONNX_ARCH?=x64
+ONNX_OS?=linux
+
+ifneq (,$(findstring aarch64,$(shell uname -m)))
+	ONNX_ARCH=aarch64
+endif
+
+ifeq ($(OS),Darwin)
+	ONNX_OS=osx
+	ifneq (,$(findstring arm64,$(shell uname -m)))
+		ONNX_ARCH=arm64
+	else
+		ONNX_ARCH=x86_64
+	endif
+endif
+
+# CUDA 12 ships as -gpu, CUDA 13 as -gpu_cuda13 (underscore). CPU has no suffix.
+ifeq ($(BUILD_TYPE),cublas)
+	ONNX_PROVIDER=cuda
+	ifeq ($(CUDA_MAJOR_VERSION),13)
+		ONNX_VARIANT=-gpu_cuda13
+	else
+		ONNX_VARIANT=-gpu
+	endif
+else
+	ONNX_VARIANT=
+	ONNX_PROVIDER=cpu
+endif
+
+# Fetch and unpack onnxruntime. -f fails on HTTP errors; any failure removes the dir so a re-run retries.
+sources/onnxruntime:
+	mkdir -p sources/onnxruntime
+	curl -fL https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)$(ONNX_VARIANT)-$(ONNX_VERSION).tgz -o sources/onnxruntime/onnxruntime.tgz || { rm -rf sources/onnxruntime; exit 1; }
+	(cd sources/onnxruntime && tar -xf onnxruntime.tgz --strip-components=1 && rm onnxruntime.tgz) || { rm -rf sources/onnxruntime; exit 1; }
+
+backend-assets/lib: sources/onnxruntime
+	mkdir -p backend-assets/lib
+	cp -rfLv sources/onnxruntime/lib/* backend-assets/lib/
+
+supertonic: backend-assets/lib
+	CGO_ENABLED=1 $(GOCMD) build \
+	  -ldflags "$(LD_FLAGS) -X main.onnxProvider=$(ONNX_PROVIDER)" \
+	  -tags "$(GO_TAGS)" -o supertonic ./
+
+package:
+	bash package.sh
+
+build: supertonic package
+
+# Tests need only the Go toolchain (gcc); yalue dlopens onnxruntime at
+# runtime, so no tarball download is required to compile or run unit specs.
+test:
+	CGO_ENABLED=1 $(GOCMD) test -v -timeout 120s ./...
+
+clean:
+	rm -rf supertonic sources/ backend-assets/ package/
+
+.PHONY: build package clean test
diff --git a/backend/go/supertonic/backend.go b/backend/go/supertonic/backend.go
new file mode 100644
index 000000000000..36028d33d80b
--- /dev/null
+++ b/backend/go/supertonic/backend.go
@@ -0,0 +1,307 @@
+package main
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+
+	laudio "github.com/mudler/LocalAI/pkg/audio"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// onnxProvider is set via -ldflags "-X main.onnxProvider=cuda" by the
+// CUDA build (later phase). Defaults to CPU.
+var onnxProvider = "cpu"
+
+// Per-model generation defaults, overridable via ModelOptions.Options:
+//
+//	supertonic.steps=<int>          denoising steps (quality), default 8
+//	supertonic.speed=<float>        speech rate, default 1.05
+//	supertonic.silence=<float>      inter-chunk silence seconds, default 0.3
+//	supertonic.default_voice=<name> voice-style used when request omits voice
+//	supertonic.default_lang=<lang>  language tag used when request omits it
+const (
+	optionSteps        = "supertonic.steps="
+	optionSpeed        = "supertonic.speed="
+	optionSilence      = "supertonic.silence="
+	optionDefaultVoice = "supertonic.default_voice="
+	optionDefaultLang  = "supertonic.default_lang="
+)
+
+type SupertonicBackend struct {
+	base.SingleThread
+
+	tts          *TextToSpeech
+	cfg          Config
+	modelDir     string
+	voicesDir    string
+	defaultVoice string
+	defaultLang  string
+	steps        int
+	speed        float32
+	silence      float32
+
+	styleMu sync.Mutex
+	styles  map[string]*Style // voice name -> loaded style cache
+}
+
+// Load resolves the model directory, validates generation options, then loads tts.json and the ONNX models.
+func (s *SupertonicBackend) Load(opts *pb.ModelOptions) error {
+	modelDir, err := resolveModelDir(opts.ModelFile)
+	if err != nil {
+		return err
+	}
+	s.modelDir, s.voicesDir = modelDir, resolveVoicesDir(modelDir)
+	// Parse options first so bad values fail the load before any ONNX session is created.
+	s.steps = int(findOptionInt(opts, optionSteps, 8))
+	s.speed = findOptionFloat(opts, optionSpeed, 1.05)
+	s.silence = findOptionFloat(opts, optionSilence, 0.3)
+	s.defaultVoice = findOptionValue(opts, optionDefaultVoice, "")
+	s.defaultLang = findOptionValue(opts, optionDefaultLang, "na")
+	if s.steps < 1 || !(s.speed > 0) || !(s.silence >= 0) { // negated comparisons also reject NaN
+		return fmt.Errorf("invalid supertonic options: steps=%d speed=%v silence=%v", s.steps, s.speed, s.silence)
+	}
+
+	cfg, err := LoadCfgs(modelDir)
+	if err != nil {
+		return fmt.Errorf("loading tts.json from %s: %w", modelDir, err)
+	}
+	s.cfg = cfg
+	// onnxProvider is "cpu" for the CPU build; the CUDA build sets it to "cuda" via -ldflags. Upstream
+	// LoadTextToSpeech still errors on GPU until the CUDA phase wires the execution provider.
+	tts, err := LoadTextToSpeech(modelDir, onnxProvider == "cuda", cfg)
+	if err != nil {
+		return fmt.Errorf("loading supertonic models from %s: %w", modelDir, err)
+	}
+	s.tts, s.styles = tts, map[string]*Style{}
+	return nil
+}
+
+func (s *SupertonicBackend) TTS(req *pb.TTSRequest) error {
+	wav, sr, err := s.synthesize(req)
+	if err != nil {
+		return err
+	}
+	out := make([]float64, len(wav))
+	for i, v := range wav {
+		out[i] = float64(v)
+	}
+	if err := writeWavFile(req.Dst, out, sr); err != nil {
+		return fmt.Errorf("writing wav to %s: %w", req.Dst, err)
+	}
+	return nil
+}
+
+func (s *SupertonicBackend) TTSStream(req *pb.TTSRequest, results chan []byte) error {
+	defer close(results)
+
+	wav, sr, err := s.synthesize(req)
+	if err != nil {
+		return err
+	}
+
+	results <- streamingWAVHeader(uint32(sr))
+
+	const chunkSamples = 4096
+	for off := 0; off < len(wav); off += chunkSamples {
+		end := off + chunkSamples
+		if end > len(wav) {
+			end = len(wav)
+		}
+		results <- pcmFloatToInt16LE(wav[off:end])
+	}
+	return nil
+}
+
+// synthesize runs the full pipeline and returns the trimmed mono float32
+// PCM and its sample rate.
+func (s *SupertonicBackend) synthesize(req *pb.TTSRequest) ([]float32, int, error) {
+	if s.tts == nil {
+		return nil, 0, fmt.Errorf("supertonic model not loaded")
+	}
+	if strings.TrimSpace(req.Text) == "" {
+		return nil, 0, fmt.Errorf("empty text")
+	}
+
+	style, err := s.loadStyle(s.voiceName(req.Voice))
+	if err != nil {
+		return nil, 0, err
+	}
+
+	lang := s.resolveLang("")
+	if req.Language != nil {
+		lang = s.resolveLang(*req.Language)
+	}
+
+	wav, dur, err := s.tts.Call(req.Text, lang, style, s.steps, s.speed, s.silence)
+	if err != nil {
+		return nil, 0, err
+	}
+
+	sr := s.tts.SampleRate
+	// Call returns concatenated audio; trim to the reported duration.
+	wavLen := int(float32(sr) * dur)
+	if wavLen < 0 {
+		wavLen = 0
+	}
+	if wavLen > len(wav) {
+		wavLen = len(wav)
+	}
+	return wav[:wavLen], sr, nil
+}
+
+// voiceName picks the request voice, falling back to the model default.
+func (s *SupertonicBackend) voiceName(reqVoice string) string {
+	// An empty or whitespace-only request voice counts as "not given" and yields the model default.
+	if trimmed := strings.TrimSpace(reqVoice); trimmed != "" {
+		return trimmed
+	}
+	return s.defaultVoice
+}
+
+// resolveLang validates against AvailableLangs, falling back to the model
+// default (then "na").
+func (s *SupertonicBackend) resolveLang(reqLang string) string {
+	// Candidates in priority order: the trimmed request language, then the
+	// model default. The first non-empty one that isValidLang accepts wins.
+	for _, cand := range []string{strings.TrimSpace(reqLang), s.defaultLang} {
+		if cand != "" && isValidLang(cand) {
+			return cand
+		}
+	}
+	return "na"
+}
+
+// loadStyle resolves and caches a voice-style. An empty name with no model
+// default is an error (supertonic requires a style embedding).
+func (s *SupertonicBackend) loadStyle(name string) (*Style, error) {
+	if name == "" {
+		return nil, fmt.Errorf("no voice specified and no supertonic.default_voice set")
+	}
+	s.styleMu.Lock()
+	defer s.styleMu.Unlock()
+	if st, ok := s.styles[name]; ok {
+		return st, nil
+	}
+	path := s.voiceStylePath(name)
+	st, err := LoadVoiceStyle([]string{path}, false)
+	if err != nil {
+		return nil, fmt.Errorf("loading voice style %q (%s): %w", name, path, err)
+	}
+	s.styles[name] = st
+	return st, nil
+}
+
+// voiceStylePath maps a voice name to a JSON path. Absolute paths are honored;
+// names containing a separator resolve under modelDir; bare names resolve under
+// the resolved voicesDir (see resolveVoicesDir).
+func (s *SupertonicBackend) voiceStylePath(name string) string {
+	if !strings.HasSuffix(name, ".json") {
+		name += ".json"
+	}
+	if filepath.IsAbs(name) {
+		return name
+	}
+	if strings.ContainsRune(name, filepath.Separator) {
+		return filepath.Join(s.modelDir, name)
+	}
+	return filepath.Join(s.voicesDir, name)
+}
+
+// resolveVoicesDir locates the voice_styles directory. The HF model layout
+// puts the ONNX files in an onnx/ subdir with voice_styles/ as its sibling,
+// so check modelDir/voice_styles first, then the parent's voice_styles.
+func resolveVoicesDir(modelDir string) string {
+	// The in-model location is probed first and is also the answer when
+	// neither candidate exists as a directory.
+	preferred := filepath.Join(modelDir, "voice_styles")
+	sibling := filepath.Join(filepath.Dir(modelDir), "voice_styles")
+	for _, dir := range []string{preferred, sibling} {
+		if st, err := os.Stat(dir); err == nil && st.IsDir() {
+			return dir
+		}
+	}
+	return preferred
+}
+
+// resolveModelDir accepts either a directory (used as-is) or a file (its
+// parent dir is used).
+func resolveModelDir(modelFile string) (string, error) {
+	if modelFile == "" {
+		return "", fmt.Errorf("empty model path")
+	}
+	info, err := os.Stat(modelFile)
+	switch {
+	case err != nil: // path missing or unreadable
+		return "", fmt.Errorf("stat model path %s: %w", modelFile, err)
+	case info.IsDir(): // a directory is used as-is
+		return modelFile, nil
+	}
+	return filepath.Dir(modelFile), nil // a file: use its parent directory
+}
+
+// ---- option helpers (mirrors backend/go/sherpa-onnx/backend.go) ----
+
+func findOptionValue(opts *pb.ModelOptions, prefix, def string) string {
+	for idx := 0; idx < len(opts.Options); idx++ { // first entry carrying the prefix wins
+		if entry := opts.Options[idx]; strings.HasPrefix(entry, prefix) {
+			return entry[len(prefix):]
+		}
+	}
+	return def
+}
+
+func findOptionFloat(opts *pb.ModelOptions, prefix string, def float32) float32 {
+	raw := findOptionValue(opts, prefix, "")
+	if raw == "" {
+		return def
+	}
+	v, err := strconv.ParseFloat(raw, 32)
+	if err != nil {
+		return def
+	}
+	return float32(v)
+}
+
+func findOptionInt(opts *pb.ModelOptions, prefix string, def int32) int32 {
+	raw := findOptionValue(opts, prefix, "")
+	if raw == "" {
+		return def
+	}
+	v, err := strconv.ParseInt(raw, 10, 32)
+	if err != nil {
+		return def
+	}
+	return int32(v)
+}
+
+// ---- PCM helpers ----
+
+func pcmFloatToInt16LE(samples []float32) []byte { // float32 PCM -> saturated int16 little-endian bytes
+	buf := make([]byte, len(samples)*2)
+	for i, f := range samples {
+		v := f * 32767 // clamp as float: converting an out-of-range float to an int is implementation-defined
+		if v > 32767 {
+			v = 32767
+		} else if v < -32768 {
+			v = -32768
+		}
+		binary.LittleEndian.PutUint16(buf[2*i:], uint16(int16(v)))
+	}
+	return buf
+}
+
+func streamingWAVHeader(sampleRate uint32) []byte {
+	const streamingSize = 0xFFFFFFFF
+	h := laudio.NewWAVHeaderWithRate(streamingSize, sampleRate)
+	h.ChunkSize = streamingSize
+	var buf bytes.Buffer
+	_ = h.Write(&buf)
+	return buf.Bytes()
+}
diff --git a/backend/go/supertonic/backend_test.go b/backend/go/supertonic/backend_test.go
new file mode 100644
index 000000000000..d5027a082a13
--- /dev/null
+++ b/backend/go/supertonic/backend_test.go
@@ -0,0 +1,86 @@
+package main
+
+import (
+	"os"
+	"path/filepath"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+var _ = Describe("voiceStylePath", func() {
+	s := &SupertonicBackend{modelDir: "/models/st/onnx", voicesDir: "/models/st/voice_styles"}
+
+	It("resolves a bare name under the resolved voicesDir", func() {
+		Expect(s.voiceStylePath("M1")).To(Equal(filepath.Join("/models/st/voice_styles", "M1.json")))
+	})
+	It("keeps an explicit .json suffix", func() {
+		Expect(s.voiceStylePath("M1.json")).To(Equal(filepath.Join("/models/st/voice_styles", "M1.json")))
+	})
+	It("honors absolute paths", func() {
+		Expect(s.voiceStylePath("/abs/v.json")).To(Equal("/abs/v.json"))
+	})
+})
+
+var _ = Describe("resolveVoicesDir", func() {
+	It("prefers voice_styles under modelDir", func() {
+		dir := GinkgoT().TempDir()
+		Expect(os.MkdirAll(filepath.Join(dir, "voice_styles"), 0o755)).To(Succeed())
+		Expect(resolveVoicesDir(dir)).To(Equal(filepath.Join(dir, "voice_styles")))
+	})
+	It("falls back to the sibling voice_styles next to an onnx subdir", func() {
+		root := GinkgoT().TempDir()
+		Expect(os.MkdirAll(filepath.Join(root, "voice_styles"), 0o755)).To(Succeed())
+		Expect(os.MkdirAll(filepath.Join(root, "onnx"), 0o755)).To(Succeed())
+		Expect(resolveVoicesDir(filepath.Join(root, "onnx"))).To(Equal(filepath.Join(root, "voice_styles")))
+	})
+})
+
+var _ = Describe("resolveLang", func() {
+	It("accepts a valid request language", func() {
+		s := &SupertonicBackend{defaultLang: "na"}
+		Expect(s.resolveLang("ko")).To(Equal("ko"))
+	})
+	It("falls back to the model default for an invalid language", func() {
+		s := &SupertonicBackend{defaultLang: "en"}
+		Expect(s.resolveLang("zz")).To(Equal("en"))
+	})
+	It("falls back to na when nothing is valid", func() {
+		s := &SupertonicBackend{defaultLang: ""}
+		Expect(s.resolveLang("")).To(Equal("na"))
+	})
+})
+
+var _ = Describe("pcmFloatToInt16LE", func() {
+	It("clamps and encodes little-endian", func() {
+		Expect(pcmFloatToInt16LE([]float32{0, 1.0, -1.0, 2.0})).To(Equal([]byte{ // whole slice: pins length too
+			0x00, 0x00, 0xff, 0x7f, // 0, then 1.0 -> 32767
+			0x01, 0x80, // -1.0 -> -32767
+			0xff, 0x7f, // 2.0 clamps to 32767
+		}))
+	})
+})
+
+var _ = Describe("end-to-end synthesis", Ordered, func() {
+	var modelDir string
+	BeforeAll(func() {
+		modelDir = os.Getenv("SUPERTONIC_MODEL_PATH")
+		if modelDir == "" {
+			Skip("set SUPERTONIC_MODEL_PATH to a supertonic model dir to run")
+		}
+		Expect(InitializeONNXRuntime()).To(Succeed())
+	})
+
+	It("synthesizes a wav file", func() {
+		b := &SupertonicBackend{}
+		Expect(b.Load(&pb.ModelOptions{ModelFile: modelDir, Options: []string{"supertonic.default_voice=F1"}})).To(Succeed())
+		dst := filepath.Join(GinkgoT().TempDir(), "out.wav")
+		lang := "en"
+		Expect(b.TTS(&pb.TTSRequest{Text: "Hello from LocalAI.", Dst: dst, Language: &lang})).To(Succeed())
+		info, err := os.Stat(dst)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(info.Size()).To(BeNumerically(">", 44)) // header + PCM
+	})
+})
diff --git a/backend/go/supertonic/helper.go b/backend/go/supertonic/helper.go
new file mode 100644
index 000000000000..9f927d5d3f0d
--- /dev/null
+++ b/backend/go/supertonic/helper.go
@@ -0,0 +1,1085 @@
+// Vendored from supertone-inc/supertonic (go/helper.go) at commit
+// dff55dc00064c398736080c78195f577527832ae.
+//
+// Copyright (c) Supertone, Inc. Licensed under the MIT License.
+// See https://github.com/supertone-inc/supertonic/blob/main/LICENSE
+//
+// Local modifications (if any) are marked with "LocalAI:" comments.
+
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"math"
+	"math/rand"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+	"time"
+	"unicode"
+
+	"github.com/go-audio/audio"
+	"github.com/go-audio/wav"
+	ort "github.com/yalue/onnxruntime_go"
+	"golang.org/x/text/unicode/norm"
+)
+
+// Available languages for multilingual TTS
+var AvailableLangs = []string{"en", "ko", "ja", "ar", "bg", "cs", "da", "de", "el", "es", "et", "fi", "fr", "hi", "hr", "hu", "id", "it", "lt", "lv", "nl", "pl", "pt", "ro", "ru", "sk", "sl", "sv", "tr", "uk", "vi", "na"}
+
+// Config structures
+type SpecProcessorConfig struct {
+	NFFT      int     `json:"n_fft"`
+	WinLength int     `json:"win_length"`
+	HopLength int     `json:"hop_length"`
+	NMels     int     `json:"n_mels"`
+	Eps       float64 `json:"eps"`
+	NormMean  float64 `json:"norm_mean"`
+	NormStd   float64 `json:"norm_std"`
+}
+
+type EncoderConfig struct {
+	SpecProcessor SpecProcessorConfig `json:"spec_processor"`
+}
+
+type AEConfig struct {
+	SampleRate    int           `json:"sample_rate"`
+	BaseChunkSize int           `json:"base_chunk_size"`
+	Encoder       EncoderConfig `json:"encoder"`
+}
+
+type StyleTokenLayerConfig struct {
+	NStyle        int `json:"n_style"`
+	StyleValueDim int `json:"style_value_dim"`
+}
+
+type StyleEncoderConfig struct {
+	StyleTokenLayer StyleTokenLayerConfig `json:"style_token_layer"`
+}
+
+type ProjOutConfig struct {
+	Idim int `json:"idim"`
+	Odim int `json:"odim"`
+}
+
+type TextEncoderConfig struct {
+	ProjOut ProjOutConfig `json:"proj_out"`
+}
+
+type TTLConfig struct {
+	ChunkCompressFactor int                `json:"chunk_compress_factor"`
+	LatentDim           int                `json:"latent_dim"`
+	StyleEncoder        StyleEncoderConfig `json:"style_encoder"`
+	TextEncoder         TextEncoderConfig  `json:"text_encoder"`
+}
+
+type DPStyleEncoderConfig struct {
+	StyleTokenLayer StyleTokenLayerConfig `json:"style_token_layer"`
+}
+
+type DPConfig struct {
+	LatentDim           int                  `json:"latent_dim"`
+	ChunkCompressFactor int                  `json:"chunk_compress_factor"`
+	StyleEncoder        DPStyleEncoderConfig `json:"style_encoder"`
+}
+
+type Config struct {
+	AE  AEConfig  `json:"ae"`
+	TTL TTLConfig `json:"ttl"`
+	DP  DPConfig  `json:"dp"`
+}
+
+// VoiceStyleData holds voice style JSON structure
+type VoiceStyleData struct {
+	StyleTTL struct {
+		Data [][][]float64 `json:"data"`
+		Dims []int64       `json:"dims"`
+		Type string        `json:"type"`
+	} `json:"style_ttl"`
+	StyleDP struct {
+		Data [][][]float64 `json:"data"`
+		Dims []int64       `json:"dims"`
+		Type string        `json:"type"`
+	} `json:"style_dp"`
+}
+
+// UnicodeProcessor for text processing
+type UnicodeProcessor struct {
+	indexer []int64
+}
+
+// NewUnicodeProcessor creates a new UnicodeProcessor
+func NewUnicodeProcessor(unicodeIndexerPath string) (*UnicodeProcessor, error) {
+	indexer, err := loadJSONInt64(unicodeIndexerPath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load unicode indexer: %w", err)
+	}
+
+	return &UnicodeProcessor{indexer: indexer}, nil
+}
+
+// Call processes text list to text IDs and mask
+func (up *UnicodeProcessor) Call(textList []string, langList []string) ([][]int64, [][][]float64) {
+	// Preprocess texts
+	processedTexts := make([]string, len(textList))
+	for i, text := range textList {
+		processedTexts[i] = preprocessText(text, langList[i])
+	}
+
+	// Get text lengths
+	textLengths := make([]int64, len(processedTexts))
+	maxLen := 0
+	for i, text := range processedTexts {
+		textLengths[i] = int64(len([]rune(text)))
+		if int(textLengths[i]) > maxLen {
+			maxLen = int(textLengths[i])
+		}
+	}
+
+	// Create text IDs
+	textIDs := make([][]int64, len(processedTexts))
+	for i, text := range processedTexts {
+		row := make([]int64, maxLen)
+		runes := []rune(text)
+		for j, r := range runes {
+			unicodeVal := int(r)
+			if unicodeVal < len(up.indexer) {
+				row[j] = up.indexer[unicodeVal]
+			} else {
+				row[j] = -1
+			}
+		}
+		textIDs[i] = row
+	}
+
+	// Create text mask
+	textMask := lengthToMask(textLengths, maxLen)
+
+	return textIDs, textMask
+}
+
+// Text chunking utilities
+const maxChunkLength = 300
+
+var abbreviations = []string{
+	"Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.",
+	"St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.",
+	"Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D.",
+}
+
+func chunkText(text string, maxLen int) []string {
+	if maxLen == 0 {
+		maxLen = maxChunkLength
+	}
+
+	text = strings.TrimSpace(text)
+	if text == "" {
+		return []string{""}
+	}
+
+	// Split by paragraphs
+	paragraphs := regexp.MustCompile(`\n\s*\n`).Split(text, -1)
+	var chunks []string
+
+	for _, para := range paragraphs {
+		para = strings.TrimSpace(para)
+		if para == "" {
+			continue
+		}
+
+		if len(para) <= maxLen {
+			chunks = append(chunks, para)
+			continue
+		}
+
+		// Split by sentences
+		sentences := splitSentences(para)
+		var current strings.Builder
+		currentLen := 0
+
+		for _, sentence := range sentences {
+			sentence = strings.TrimSpace(sentence)
+			if sentence == "" {
+				continue
+			}
+
+			sentenceLen := len(sentence)
+			if sentenceLen > maxLen {
+				// If sentence is longer than maxLen, split by comma or space
+				if current.Len() > 0 {
+					chunks = append(chunks, strings.TrimSpace(current.String()))
+					current.Reset()
+					currentLen = 0
+				}
+
+				// Try splitting by comma
+				parts := strings.Split(sentence, ",")
+				for _, part := range parts {
+					part = strings.TrimSpace(part)
+					if part == "" {
+						continue
+					}
+
+					partLen := len(part)
+					if partLen > maxLen {
+						// Split by space as last resort
+						words := strings.Fields(part)
+						var wordChunk strings.Builder
+						wordChunkLen := 0
+
+						for _, word := range words {
+							wordLen := len(word)
+							if wordChunkLen+wordLen+1 > maxLen && wordChunk.Len() > 0 {
+								chunks = append(chunks, strings.TrimSpace(wordChunk.String()))
+								wordChunk.Reset()
+								wordChunkLen = 0
+							}
+
+							if wordChunk.Len() > 0 {
+								wordChunk.WriteString(" ")
+								wordChunkLen++
+							}
+							wordChunk.WriteString(word)
+							wordChunkLen += wordLen
+						}
+
+						if wordChunk.Len() > 0 {
+							chunks = append(chunks, strings.TrimSpace(wordChunk.String()))
+						}
+					} else {
+						if currentLen+partLen+1 > maxLen && current.Len() > 0 {
+							chunks = append(chunks, strings.TrimSpace(current.String()))
+							current.Reset()
+							currentLen = 0
+						}
+
+						if current.Len() > 0 {
+							current.WriteString(", ")
+							currentLen += 2
+						}
+						current.WriteString(part)
+						currentLen += partLen
+					}
+				}
+				continue
+			}
+
+			if currentLen+sentenceLen+1 > maxLen && current.Len() > 0 {
+				chunks = append(chunks, strings.TrimSpace(current.String()))
+				current.Reset()
+				currentLen = 0
+			}
+
+			if current.Len() > 0 {
+				current.WriteString(" ")
+				currentLen++
+			}
+			current.WriteString(sentence)
+			currentLen += sentenceLen
+		}
+
+		if current.Len() > 0 {
+			chunks = append(chunks, strings.TrimSpace(current.String()))
+		}
+	}
+
+	if len(chunks) == 0 {
+		return []string{""}
+	}
+
+	return chunks
+}
+
+func splitSentences(text string) []string {
+	// Go's regexp doesn't support lookbehind, so we use a simpler approach
+	// Split on sentence boundaries and then check if they're abbreviations
+	re := regexp.MustCompile(`([.!?])\s+`)
+	
+	// Find all matches
+	matches := re.FindAllStringIndex(text, -1)
+	if len(matches) == 0 {
+		return []string{text}
+	}
+	
+	var sentences []string
+	lastEnd := 0
+	
+	for _, match := range matches {
+		// Get the text before the punctuation
+		beforePunc := text[lastEnd:match[0]]
+		
+		// Check if this ends with an abbreviation
+		isAbbrev := false
+		for _, abbrev := range abbreviations {
+			if strings.HasSuffix(strings.TrimSpace(beforePunc+text[match[0]:match[0]+1]), abbrev) {
+				isAbbrev = true
+				break
+			}
+		}
+		
+		if !isAbbrev {
+			// This is a real sentence boundary
+			sentences = append(sentences, text[lastEnd:match[1]])
+			lastEnd = match[1]
+		}
+	}
+	
+	// Add the remaining text
+	if lastEnd < len(text) {
+		sentences = append(sentences, text[lastEnd:])
+	}
+	
+	if len(sentences) == 0 {
+		return []string{text}
+	}
+	
+	return sentences
+}
+
+// isValidLang reports whether lang is one of the entries of AvailableLangs.
+func isValidLang(lang string) bool {
+	found := false
+	for _, candidate := range AvailableLangs {
+		if candidate == lang {
+			found = true
+			break
+		}
+	}
+	return found
+}
+
+// Utility functions
+func preprocessText(text string, lang string) string {
+	// TODO: Need advanced normalizer for better performance
+	// Apply NFKD normalization using golang.org/x/text/unicode/norm
+	text = norm.NFKD.String(text)
+
+	// Remove emojis and various Unicode symbols
+	emojiPattern := regexp.MustCompile(`[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+`)
+	text = emojiPattern.ReplaceAllString(text, "")
+
+	// Replace various dashes and symbols
+	replacements := map[string]string{
+		"–": "-",    // en dash
+		"‑": "-",    // non-breaking hyphen
+		"—": "-",    // em dash
+		"_": " ",    // underscore
+		"\u201C": "\"",   // left double quote
+		"\u201D": "\"",   // right double quote
+		"\u2018": "'",    // left single quote
+		"\u2019": "'",    // right single quote
+		"´": "'",    // acute accent
+		"`": "'",    // grave accent
+		"[": " ",    // left bracket
+		"]": " ",    // right bracket
+		"|": " ",    // vertical bar
+		"/": " ",    // slash
+		"#": " ",    // hash
+		"→": " ",    // right arrow
+		"←": " ",    // left arrow
+	}
+
+	for old, new := range replacements {
+		text = strings.ReplaceAll(text, old, new)
+	}
+
+	// Remove special symbols
+	specialSymbols := []string{"♥", "☆", "♡", "©", "\\"}
+	for _, symbol := range specialSymbols {
+		text = strings.ReplaceAll(text, symbol, "")
+	}
+
+	// Replace known expressions
+	exprReplacements := map[string]string{
+		"@":     " at ",
+		"e.g.,": "for example, ",
+		"i.e.,": "that is, ",
+	}
+
+	for old, new := range exprReplacements {
+		text = strings.ReplaceAll(text, old, new)
+	}
+
+	// Fix spacing around punctuation
+	text = regexp.MustCompile(` ,`).ReplaceAllString(text, ",")
+	text = regexp.MustCompile(` \.`).ReplaceAllString(text, ".")
+	text = regexp.MustCompile(` !`).ReplaceAllString(text, "!")
+	text = regexp.MustCompile(` \?`).ReplaceAllString(text, "?")
+	text = regexp.MustCompile(` ;`).ReplaceAllString(text, ";")
+	text = regexp.MustCompile(` :`).ReplaceAllString(text, ":")
+	text = regexp.MustCompile(` '`).ReplaceAllString(text, "'")
+
+	// Remove duplicate quotes
+	for strings.Contains(text, `""`) {
+		text = strings.ReplaceAll(text, `""`, `"`)
+	}
+	for strings.Contains(text, "''") {
+		text = strings.ReplaceAll(text, "''", "'")
+	}
+	for strings.Contains(text, "``") {
+		text = strings.ReplaceAll(text, "``", "`")
+	}
+
+	// Remove extra spaces
+	text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ")
+	text = strings.TrimSpace(text)
+
+	// If text doesn't end with punctuation, quotes, or closing brackets, add a period
+	if text != "" {
+		endsWithPunct := regexp.MustCompile(`[.!?;:,'"\x{201C}\x{201D}\x{2018}\x{2019})\]}…。」』】〉》›»]$`)
+		if !endsWithPunct.MatchString(text) {
+			text += "."
+		}
+	}
+
+	// Validate language
+	if !isValidLang(lang) {
+		panic(fmt.Sprintf("Invalid language: %s. Available: %v", lang, AvailableLangs))
+	}
+
+	// Wrap text with language tags
+	text = fmt.Sprintf("<%s>%s</%s>", lang, text, lang)
+
+	return text
+}
+
+// lengthToMask builds one mask per entry of lengths. The result has shape
+// [len(lengths)][1][maxLen]; position j of row i is 1.0 while j < lengths[i]
+// and 0.0 from there on.
+func lengthToMask(lengths []int64, maxLen int) [][][]float64 {
+	mask := make([][][]float64, len(lengths))
+	for i, valid := range lengths {
+		// make zero-fills the row, so only the leading valid positions
+		// need to be written.
+		row := make([]float64, maxLen)
+		for j := range row {
+			if int64(j) >= valid {
+				break
+			}
+			row[j] = 1.0
+		}
+		mask[i] = [][]float64{row}
+	}
+	return mask
+}
+
+// getTextMask returns the mask for a batch of text lengths. It is
+// lengthToMask under a text-specific name.
+func getTextMask(textLengths []int64, maxLen int) [][][]float64 {
+	textMask := lengthToMask(textLengths, maxLen)
+	return textMask
+}
+
+func getLatentMask(wavLengths []int64, cfg Config) [][][]float64 {
+	baseChunkSize := int64(cfg.AE.BaseChunkSize)
+	chunkCompressFactor := int64(cfg.TTL.ChunkCompressFactor)
+	latentSize := baseChunkSize * chunkCompressFactor
+
+	latentLengths := make([]int64, len(wavLengths))
+	maxLen := int64(0)
+	for i, wavLen := range wavLengths {
+		latentLengths[i] = (wavLen + latentSize - 1) / latentSize
+		if latentLengths[i] > maxLen {
+			maxLen = latentLengths[i]
+		}
+	}
+
+	return lengthToMask(latentLengths, int(maxLen))
+}
+
+// writeWavFile writes audioData to filename as a 16-bit, single-channel PCM
+// WAV file at sampleRate.
+//
+// Samples are clamped to [-1, 1] before scaling to the 16-bit range; NaN
+// samples are written as silence. The returned error covers file creation,
+// encoding, finalising the WAV header and closing the file.
+func writeWavFile(filename string, audioData []float64, sampleRate int) (err error) {
+	file, err := os.Create(filename)
+	if err != nil {
+		return err
+	}
+	// A failed Close on a file opened for writing can mean lost data, so it
+	// is reported unless an earlier error is already being returned.
+	defer func() {
+		if closeErr := file.Close(); closeErr != nil && err == nil {
+			err = closeErr
+		}
+	}()
+
+	// Convert float64 to int
+	intData := make([]int, len(audioData))
+	for i, sample := range audioData {
+		if math.IsNaN(sample) {
+			// Converting NaN to int is implementation-defined in Go; keep the
+			// zero value (silence) instead.
+			continue
+		}
+		// Clamp to [-1, 1] and convert to 16-bit int
+		clamped := math.Max(-1.0, math.Min(1.0, sample))
+		intData[i] = int(clamped * 32767)
+	}
+
+	encoder := wav.NewEncoder(file, sampleRate, 16, 1, 1)
+	buf := &audio.IntBuffer{
+		Data:           intData,
+		Format:         &audio.Format{SampleRate: sampleRate, NumChannels: 1},
+		SourceBitDepth: 16,
+	}
+
+	if err := encoder.Write(buf); err != nil {
+		return err
+	}
+
+	return encoder.Close()
+}
+
+// Style holds the two voice-style tensors built by LoadVoiceStyle. The owner
+// must call Destroy to release them.
+type Style struct {
+	// TtlTensor is passed as the "style_ttl" input of the text encoder and
+	// the vector estimator.
+	TtlTensor *ort.Tensor[float32]
+	// DpTensor is passed as the "style_dp" input of the duration predictor.
+	DpTensor  *ort.Tensor[float32]
+}
+
+// Destroy releases both style tensors. It is a no-op on a nil *Style, and the
+// fields are cleared after release so calling it again does nothing.
+func (s *Style) Destroy() {
+	if s == nil {
+		return
+	}
+	if s.TtlTensor != nil {
+		s.TtlTensor.Destroy()
+		s.TtlTensor = nil
+	}
+	if s.DpTensor != nil {
+		s.DpTensor.Destroy()
+		s.DpTensor = nil
+	}
+}
+
+// LoadVoiceStyle reads one voice-style JSON file per path and packs them into
+// a single batch: the TTL and DP tensors get shape [len(paths), d1, d2], where
+// d1 and d2 are taken from Dims[1] and Dims[2] of the first file.
+//
+// It returns an error when no path is given, a file cannot be read or parsed,
+// the first file declares fewer than three (or non-positive) dimensions, or
+// any file's data does not contain exactly d1*d2 values. When verbose is set,
+// a summary line is printed to stdout.
+//
+// The caller owns the returned Style and must call Destroy on it.
+func LoadVoiceStyle(voiceStylePaths []string, verbose bool) (*Style, error) {
+	bsz := len(voiceStylePaths)
+	if bsz == 0 {
+		return nil, fmt.Errorf("no voice style files provided")
+	}
+
+	// readStyle loads and decodes a single voice-style file.
+	readStyle := func(path string) (VoiceStyleData, error) {
+		var parsed VoiceStyleData
+		data, err := os.ReadFile(path)
+		if err != nil {
+			return parsed, fmt.Errorf("failed to read voice style file: %w", err)
+		}
+		if err := json.Unmarshal(data, &parsed); err != nil {
+			return parsed, fmt.Errorf("failed to parse voice style JSON: %w", err)
+		}
+		return parsed, nil
+	}
+
+	// The first file defines the per-style dimensions for the whole batch.
+	firstStyle, err := readStyle(voiceStylePaths[0])
+	if err != nil {
+		return nil, err
+	}
+
+	ttlDims := firstStyle.StyleTTL.Dims
+	dpDims := firstStyle.StyleDP.Dims
+	if len(ttlDims) < 3 || len(dpDims) < 3 {
+		return nil, fmt.Errorf("voice style %q: expected at least 3 dims, got TTL dims %v and DP dims %v",
+			voiceStylePaths[0], ttlDims, dpDims)
+	}
+
+	ttlDim1 := ttlDims[1]
+	ttlDim2 := ttlDims[2]
+	dpDim1 := dpDims[1]
+	dpDim2 := dpDims[2]
+	if ttlDim1 <= 0 || ttlDim2 <= 0 || dpDim1 <= 0 || dpDim2 <= 0 {
+		return nil, fmt.Errorf("voice style %q: dims must be positive, got TTL dims %v and DP dims %v",
+			voiceStylePaths[0], ttlDims, dpDims)
+	}
+
+	// Pre-allocate arrays with full batch size
+	ttlPerStyle := int(ttlDim1 * ttlDim2)
+	dpPerStyle := int(dpDim1 * dpDim2)
+	ttlFlat := make([]float32, bsz*ttlPerStyle)
+	dpFlat := make([]float32, bsz*dpPerStyle)
+
+	// Fill in the data
+	for i, path := range voiceStylePaths {
+		voiceStyle := firstStyle
+		if i > 0 {
+			// The first file is already parsed; only the others are read here.
+			voiceStyle, err = readStyle(path)
+			if err != nil {
+				return nil, err
+			}
+		}
+
+		// Flatten TTL data into this style's slot, refusing to write past it.
+		ttlSlot := ttlFlat[i*ttlPerStyle : (i+1)*ttlPerStyle]
+		idx := 0
+		for _, batch := range voiceStyle.StyleTTL.Data {
+			for _, row := range batch {
+				if idx+len(row) > len(ttlSlot) {
+					return nil, fmt.Errorf("voice style %q: TTL data has more than the expected %d values", path, ttlPerStyle)
+				}
+				for _, val := range row {
+					ttlSlot[idx] = float32(val)
+					idx++
+				}
+			}
+		}
+		if idx != ttlPerStyle {
+			return nil, fmt.Errorf("voice style %q: TTL data has %d values, expected %d", path, idx, ttlPerStyle)
+		}
+
+		// Flatten DP data the same way.
+		dpSlot := dpFlat[i*dpPerStyle : (i+1)*dpPerStyle]
+		idx = 0
+		for _, batch := range voiceStyle.StyleDP.Data {
+			for _, row := range batch {
+				if idx+len(row) > len(dpSlot) {
+					return nil, fmt.Errorf("voice style %q: DP data has more than the expected %d values", path, dpPerStyle)
+				}
+				for _, val := range row {
+					dpSlot[idx] = float32(val)
+					idx++
+				}
+			}
+		}
+		if idx != dpPerStyle {
+			return nil, fmt.Errorf("voice style %q: DP data has %d values, expected %d", path, idx, dpPerStyle)
+		}
+	}
+
+	ttlShape := []int64{int64(bsz), ttlDim1, ttlDim2}
+	dpShape := []int64{int64(bsz), dpDim1, dpDim2}
+
+	ttlTensor, err := ort.NewTensor(ttlShape, ttlFlat)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create TTL tensor: %w", err)
+	}
+
+	dpTensor, err := ort.NewTensor(dpShape, dpFlat)
+	if err != nil {
+		// Do not leak the tensor that was already created.
+		ttlTensor.Destroy()
+		return nil, fmt.Errorf("failed to create DP tensor: %w", err)
+	}
+
+	if verbose {
+		fmt.Printf("Loaded %d voice styles\n\n", bsz)
+	}
+
+	return &Style{
+		TtlTensor: ttlTensor,
+		DpTensor:  dpTensor,
+	}, nil
+}
+
+// TextToSpeech bundles the four ONNX sessions, the text processor and the
+// configuration values used to synthesize speech. Instances are built by
+// LoadTextToSpeech and released with Destroy.
+type TextToSpeech struct {
+	// cfg is the configuration the instance was loaded with.
+	cfg           Config
+	// textProcessor turns texts and language codes into ID and mask arrays.
+	textProcessor *UnicodeProcessor
+	// dpOrt runs duration_predictor.onnx.
+	dpOrt         *ort.DynamicAdvancedSession
+	// textEncOrt runs text_encoder.onnx.
+	textEncOrt    *ort.DynamicAdvancedSession
+	// vectorEstOrt runs vector_estimator.onnx, once per denoising step.
+	vectorEstOrt  *ort.DynamicAdvancedSession
+	// vocoderOrt runs vocoder.onnx on the final latent.
+	vocoderOrt    *ort.DynamicAdvancedSession
+	// SampleRate is copied from cfg.AE.SampleRate.
+	SampleRate    int
+	// baseChunkSize is copied from cfg.AE.BaseChunkSize.
+	baseChunkSize int
+	// chunkCompress is copied from cfg.TTL.ChunkCompressFactor.
+	chunkCompress int
+	// ldim is copied from cfg.TTL.LatentDim.
+	ldim          int
+}
+
+func (tts *TextToSpeech) sampleNoisyLatent(durOnnx []float32) ([][][]float64, [][][]float64) {
+	bsz := len(durOnnx)
+	maxDur := float64(0)
+	for _, d := range durOnnx {
+		if float64(d) > maxDur {
+			maxDur = float64(d)
+		}
+	}
+
+	wavLenMax := maxDur * float64(tts.SampleRate)
+	wavLengths := make([]int64, bsz)
+	for i, d := range durOnnx {
+		wavLengths[i] = int64(float64(d) * float64(tts.SampleRate))
+	}
+
+	chunkSize := tts.baseChunkSize * tts.chunkCompress
+	latentLen := int((wavLenMax + float64(chunkSize) - 1) / float64(chunkSize))
+	latentDim := tts.ldim * tts.chunkCompress
+
+	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
+	noisyLatent := make([][][]float64, bsz)
+	for b := 0; b < bsz; b++ {
+		batch := make([][]float64, latentDim)
+		for d := 0; d < latentDim; d++ {
+			row := make([]float64, latentLen)
+			for t := 0; t < latentLen; t++ {
+				// Box-Muller transform for normal distribution
+				// Add epsilon to avoid log(0)
+				const eps = 1e-10
+				u1 := math.Max(eps, rng.Float64())
+				u2 := rng.Float64()
+				row[t] = math.Sqrt(-2.0*math.Log(u1)) * math.Cos(2.0*math.Pi*u2)
+			}
+			batch[d] = row
+		}
+		noisyLatent[b] = batch
+	}
+
+	latentMask := getLatentMask(wavLengths, tts.cfg)
+
+	// Apply mask
+	for b := 0; b < bsz; b++ {
+		for d := 0; d < latentDim; d++ {
+			for t := 0; t < latentLen; t++ {
+				noisyLatent[b][d][t] *= latentMask[b][0][t]
+			}
+		}
+	}
+
+	return noisyLatent, latentMask
+}
+
+// float32Output returns v as a float32 tensor. If v has any other type it is
+// destroyed (when non-nil) and an error naming the producing stage is returned.
+func float32Output(v ort.Value, stage string) (*ort.Tensor[float32], error) {
+	tensor, ok := v.(*ort.Tensor[float32])
+	if !ok {
+		if v != nil {
+			v.Destroy()
+		}
+		return nil, fmt.Errorf("%s returned %T, expected a float32 tensor", stage, v)
+	}
+	return tensor, nil
+}
+
+// _infer runs the full synthesis pipeline for one batch: duration predictor,
+// text encoder, totalStep iterations of the vector estimator starting from a
+// noisy latent, and finally the vocoder.
+//
+// It returns the vocoder output for the whole batch as one flat slice and the
+// predicted durations divided by speed. Both slices are copies owned by the
+// caller; every tensor created here is destroyed before returning.
+//
+// An error is returned for an empty batch, a missing style, a non-positive
+// speed, unexpected model output, or any failing ONNX Runtime call.
+func (tts *TextToSpeech) _infer(textList []string, langList []string, style *Style, totalStep int, speed float32) ([]float32, []float32, error) {
+	bsz := len(textList)
+	if bsz == 0 {
+		return nil, nil, fmt.Errorf("no text provided")
+	}
+	if style == nil || style.DpTensor == nil || style.TtlTensor == nil {
+		return nil, nil, fmt.Errorf("voice style is not loaded")
+	}
+	// Written as !(speed > 0) so NaN is rejected as well.
+	if !(speed > 0) {
+		return nil, nil, fmt.Errorf("speed must be positive, got %v", speed)
+	}
+
+	// Process text
+	textIDs, textMask := tts.textProcessor.Call(textList, langList)
+	if len(textIDs) == 0 || len(textMask) == 0 || len(textMask[0]) == 0 {
+		return nil, nil, fmt.Errorf("text processor returned no data")
+	}
+	textIDsShape := []int64{int64(bsz), int64(len(textIDs[0]))}
+	textMaskShape := []int64{int64(bsz), 1, int64(len(textMask[0][0]))}
+
+	// These two inputs are only read by the sessions, so each is built once
+	// and shared by every stage that needs it.
+	textIDsTensor := IntArrayToTensor(textIDs, textIDsShape)
+	defer textIDsTensor.Destroy()
+	textMaskTensor := ArrayToTensor(textMask, textMaskShape)
+	defer textMaskTensor.Destroy()
+
+	// Predict duration
+	dpOutputs := []ort.Value{nil}
+	if err := tts.dpOrt.Run(
+		[]ort.Value{textIDsTensor, style.DpTensor, textMaskTensor},
+		dpOutputs,
+	); err != nil {
+		return nil, nil, fmt.Errorf("failed to run duration predictor: %w", err)
+	}
+	durTensor, err := float32Output(dpOutputs[0], "duration predictor")
+	if err != nil {
+		return nil, nil, err
+	}
+	defer durTensor.Destroy()
+
+	// Copy the durations out: durTensor is destroyed when this function
+	// returns, so a slice into its storage must not be handed to the caller.
+	durations := append([]float32(nil), durTensor.GetData()...)
+	if len(durations) != bsz {
+		return nil, nil, fmt.Errorf("duration predictor returned %d durations for %d texts", len(durations), bsz)
+	}
+
+	// Apply speed factor to duration
+	for i := range durations {
+		durations[i] /= speed
+	}
+
+	// Encode text
+	textEncOutputs := []ort.Value{nil}
+	if err := tts.textEncOrt.Run(
+		[]ort.Value{textIDsTensor, style.TtlTensor, textMaskTensor},
+		textEncOutputs,
+	); err != nil {
+		return nil, nil, fmt.Errorf("failed to run text encoder: %w", err)
+	}
+	textEmbTensor, err := float32Output(textEncOutputs[0], "text encoder")
+	if err != nil {
+		return nil, nil, err
+	}
+	defer textEmbTensor.Destroy()
+
+	// Sample noisy latent
+	xt, latentMask := tts.sampleNoisyLatent(durations)
+	if len(xt[0]) == 0 || len(latentMask[0]) == 0 {
+		return nil, nil, fmt.Errorf("sampled latent has no channels")
+	}
+	latentDim := len(xt[0])
+	latentLen := len(xt[0][0])
+	latentShape := []int64{int64(bsz), int64(latentDim), int64(latentLen)}
+	latentMaskShape := []int64{int64(bsz), 1, int64(len(latentMask[0][0]))}
+
+	// The latent mask does not change between steps; build its tensor once.
+	latentMaskTensor := ArrayToTensor(latentMask, latentMaskShape)
+	defer latentMaskTensor.Destroy()
+
+	// Prepare constant arrays
+	totalStepArray := make([]float32, bsz)
+	for b := range totalStepArray {
+		totalStepArray[b] = float32(totalStep)
+	}
+	scalarShape := []int64{int64(bsz)}
+
+	totalStepTensor, err := ort.NewTensor(scalarShape, totalStepArray)
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create total step tensor: %w", err)
+	}
+	defer totalStepTensor.Destroy()
+
+	// denoise runs one vector-estimator step and writes the result back into
+	// xt. Being a function, its deferred Destroy calls run after every step,
+	// including the ones that fail.
+	denoise := func(step int) error {
+		currentStepArray := make([]float32, bsz)
+		for b := range currentStepArray {
+			currentStepArray[b] = float32(step)
+		}
+		currentStepTensor, err := ort.NewTensor(scalarShape, currentStepArray)
+		if err != nil {
+			return fmt.Errorf("failed to create current step tensor: %w", err)
+		}
+		defer currentStepTensor.Destroy()
+
+		noisyLatentTensor := ArrayToTensor(xt, latentShape)
+		defer noisyLatentTensor.Destroy()
+
+		vectorEstOutputs := []ort.Value{nil}
+		if err := tts.vectorEstOrt.Run(
+			[]ort.Value{noisyLatentTensor, textEmbTensor, style.TtlTensor, latentMaskTensor, textMaskTensor,
+				currentStepTensor, totalStepTensor},
+			vectorEstOutputs,
+		); err != nil {
+			return fmt.Errorf("failed to run vector estimator: %w", err)
+		}
+		denoisedTensor, err := float32Output(vectorEstOutputs[0], "vector estimator")
+		if err != nil {
+			return err
+		}
+		defer denoisedTensor.Destroy()
+
+		denoisedData := denoisedTensor.GetData()
+		if want := bsz * latentDim * latentLen; len(denoisedData) < want {
+			return fmt.Errorf("vector estimator returned %d values, expected %d", len(denoisedData), want)
+		}
+
+		// Update latent
+		idx := 0
+		for b := 0; b < bsz; b++ {
+			for d := range xt[b] {
+				for t := range xt[b][d] {
+					xt[b][d][t] = float64(denoisedData[idx])
+					idx++
+				}
+			}
+		}
+		return nil
+	}
+
+	// Denoising loop
+	for step := 0; step < totalStep; step++ {
+		if err := denoise(step); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	// Generate waveform
+	finalLatentTensor := ArrayToTensor(xt, latentShape)
+	defer finalLatentTensor.Destroy()
+
+	vocoderOutputs := []ort.Value{nil}
+	if err := tts.vocoderOrt.Run(
+		[]ort.Value{finalLatentTensor},
+		vocoderOutputs,
+	); err != nil {
+		return nil, nil, fmt.Errorf("failed to run vocoder: %w", err)
+	}
+	wavBatchTensor, err := float32Output(vocoderOutputs[0], "vocoder")
+	if err != nil {
+		return nil, nil, err
+	}
+	defer wavBatchTensor.Destroy()
+
+	// Copy for the same reason as the durations: the tensor does not outlive
+	// this call.
+	samples := append([]float32(nil), wavBatchTensor.GetData()...)
+
+	return samples, durations, nil
+}
+
+// Call synthesizes speech from a single text with automatic chunking.
+//
+// The text is split by chunkText (at most 120 characters per chunk for "ko"
+// and "ja", 300 otherwise), each chunk is synthesized on its own, and the
+// results are joined with silenceDuration seconds of silence between chunks.
+// A negative silenceDuration is treated as 0.
+//
+// It returns the concatenated samples, the total duration in seconds
+// (including the inserted silence), and the first inference error, if any.
+// The returned slice is freshly allocated and does not alias inference
+// buffers.
+func (tts *TextToSpeech) Call(text string, lang string, style *Style, totalStep int, speed float32, silenceDuration float32) ([]float32, float32, error) {
+	maxLen := 300
+	if lang == "ko" || lang == "ja" {
+		maxLen = 120
+	}
+	chunks := chunkText(text, maxLen)
+
+	// A negative length would make the silence allocation below panic.
+	if silenceDuration < 0 {
+		silenceDuration = 0
+	}
+	silenceLen := int(silenceDuration * float32(tts.SampleRate))
+
+	var wavCat []float32
+	var durCat float32
+
+	for i, chunk := range chunks {
+		samples, duration, err := tts._infer([]string{chunk}, []string{lang}, style, totalStep, speed)
+		if err != nil {
+			return nil, 0, err
+		}
+		if len(duration) == 0 {
+			return nil, 0, fmt.Errorf("inference returned no duration for chunk %d", i)
+		}
+
+		dur := duration[0]
+		// The predicted duration decides how much of the output is kept; clamp
+		// it to what was actually produced so the slice below cannot go out of
+		// range.
+		wavLen := int(float32(tts.SampleRate) * dur)
+		if wavLen < 0 {
+			wavLen = 0
+		}
+		if wavLen > len(samples) {
+			wavLen = len(samples)
+		}
+
+		if i == 0 {
+			durCat = dur
+		} else {
+			wavCat = append(wavCat, make([]float32, silenceLen)...)
+			durCat += silenceDuration + dur
+		}
+		// Appending copies the samples, so wavCat never shares storage with
+		// the slice returned by _infer.
+		wavCat = append(wavCat, samples[:wavLen]...)
+	}
+
+	return wavCat, durCat, nil
+}
+
+// Batch synthesizes every text of textList in one inference pass, without the
+// chunking done by Call. It returns exactly what _infer returns: the flat
+// vocoder output for the whole batch, the per-item durations, and any error.
+func (tts *TextToSpeech) Batch(textList []string, langList []string, style *Style, totalStep int, speed float32) ([]float32, []float32, error) {
+	wavBatch, durations, err := tts._infer(textList, langList, style, totalStep, speed)
+	return wavBatch, durations, err
+}
+
+// Destroy releases every ONNX session that was created, in the order duration
+// predictor, text encoder, vector estimator, vocoder. Sessions that are nil
+// are skipped.
+func (tts *TextToSpeech) Destroy() {
+	sessions := []*ort.DynamicAdvancedSession{
+		tts.dpOrt,
+		tts.textEncOrt,
+		tts.vectorEstOrt,
+		tts.vocoderOrt,
+	}
+	for _, session := range sessions {
+		if session == nil {
+			continue
+		}
+		session.Destroy()
+	}
+}
+
+// LoadTextToSpeech loads TTS components
+func LoadTextToSpeech(onnxDir string, useGPU bool, cfg Config) (*TextToSpeech, error) {
+	if useGPU {
+		return nil, fmt.Errorf("GPU mode is not supported yet")
+	}
+	fmt.Println("Using CPU for inference") // LocalAI: drop redundant newline (vet)
+
+	// Load models
+	dpPath := filepath.Join(onnxDir, "duration_predictor.onnx")
+	textEncPath := filepath.Join(onnxDir, "text_encoder.onnx")
+	vectorEstPath := filepath.Join(onnxDir, "vector_estimator.onnx")
+	vocoderPath := filepath.Join(onnxDir, "vocoder.onnx")
+
+	dpOrt, err := ort.NewDynamicAdvancedSession(dpPath, []string{"text_ids", "style_dp", "text_mask"},
+		[]string{"duration"}, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load duration predictor: %w", err)
+	}
+
+	textEncOrt, err := ort.NewDynamicAdvancedSession(textEncPath, []string{"text_ids", "style_ttl", "text_mask"},
+		[]string{"text_emb"}, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load text encoder: %w", err)
+	}
+
+	vectorEstOrt, err := ort.NewDynamicAdvancedSession(vectorEstPath,
+		[]string{"noisy_latent", "text_emb", "style_ttl", "latent_mask", "text_mask", "current_step", "total_step"},
+		[]string{"denoised_latent"}, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load vector estimator: %w", err)
+	}
+
+	vocoderOrt, err := ort.NewDynamicAdvancedSession(vocoderPath, []string{"latent"},
+		[]string{"wav_tts"}, nil)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load vocoder: %w", err)
+	}
+
+	// Load text processor
+	unicodeIndexerPath := filepath.Join(onnxDir, "unicode_indexer.json")
+	textProcessor, err := NewUnicodeProcessor(unicodeIndexerPath)
+	if err != nil {
+		return nil, err
+	}
+
+	textToSpeech := &TextToSpeech{
+		cfg:           cfg,
+		textProcessor: textProcessor,
+		dpOrt:         dpOrt,
+		textEncOrt:    textEncOrt,
+		vectorEstOrt:  vectorEstOrt,
+		vocoderOrt:    vocoderOrt,
+		SampleRate:    cfg.AE.SampleRate,
+		baseChunkSize: cfg.AE.BaseChunkSize,
+		chunkCompress: cfg.TTL.ChunkCompressFactor,
+		ldim:          cfg.TTL.LatentDim,
+	}
+
+	return textToSpeech, nil
+}
+
+// InitializeONNXRuntime initializes ONNX Runtime environment
+func InitializeONNXRuntime() error {
+	libPath := os.Getenv("ONNXRUNTIME_LIB_PATH")
+	if libPath == "" {
+		candidates := []string{
+			"/opt/homebrew/opt/onnxruntime/lib/libonnxruntime.dylib",
+			"/usr/local/opt/onnxruntime/lib/libonnxruntime.dylib",
+			"/opt/homebrew/lib/libonnxruntime.dylib",
+			"/usr/local/lib/libonnxruntime.dylib",
+			"/usr/local/lib/libonnxruntime.so",
+			"/usr/lib/libonnxruntime.so",
+		}
+		for _, candidate := range candidates {
+			if _, err := os.Stat(candidate); err == nil {
+				libPath = candidate
+				break
+			}
+		}
+		if libPath == "" {
+			libPath = "/usr/local/lib/libonnxruntime.so"
+		}
+	}
+	ort.SetSharedLibraryPath(libPath)
+
+	if err := ort.InitializeEnvironment(); err != nil {
+		return fmt.Errorf("failed to initialize ONNX Runtime: %w\nHint: install ONNX Runtime (macOS: brew install onnxruntime) or set ONNXRUNTIME_LIB_PATH", err)
+	}
+	return nil
+}
+
+// sanitizeFilename creates a safe filename from text (supports Unicode)
+func sanitizeFilename(text string, maxLen int) string {
+	runes := []rune(text)
+	if len(runes) > maxLen {
+		runes = runes[:maxLen]
+	}
+	
+	result := make([]rune, 0, len(runes))
+	for _, r := range runes {
+		// unicode.IsLetter matches any Unicode letter, unicode.IsDigit matches any Unicode digit
+		if unicode.IsLetter(r) || unicode.IsDigit(r) {
+			result = append(result, r)
+		} else {
+			result = append(result, '_')
+		}
+	}
+	return string(result)
+}
+
+// extractWavSegment extracts a single audio segment from batch output.
+//
+// wav is treated as batchSize equally sized items laid out back to back. The
+// result always has int(sampleRate*duration) samples (none for a negative
+// duration): it is filled from item index and zero-padded when the item is
+// shorter. Samples of the following item are never read. An invalid
+// batchSize or index yields an all-zero result.
+func extractWavSegment(wav []float32, duration float32, sampleRate int, index int, batchSize int) []float64 {
+	wavLen := int(float64(sampleRate) * float64(duration))
+	if wavLen < 0 {
+		wavLen = 0
+	}
+	wavOut := make([]float64, wavLen)
+
+	// Guard the division and the slice bounds below.
+	if batchSize <= 0 || index < 0 || index >= batchSize {
+		return wavOut
+	}
+
+	wavPerBatch := len(wav) / batchSize
+	item := wav[index*wavPerBatch : (index+1)*wavPerBatch]
+	for j := 0; j < wavLen && j < len(item); j++ {
+		wavOut[j] = float64(item[j])
+	}
+
+	return wavOut
+}
+
+// Timer runs fn, printing "<name>..." to stdout before the call and the
+// elapsed wall-clock time in seconds after it, and returns fn's result
+// unchanged.
+func Timer(name string, fn func() interface{}) interface{} {
+	began := time.Now()
+	fmt.Printf("%s...\n", name)
+
+	out := fn()
+
+	fmt.Printf("  -> %s completed in %.2f sec\n", name, time.Since(began).Seconds())
+	return out
+}
+
+// LoadCfgs loads configuration from the tts.json file inside onnxDir.
+//
+// Errors are wrapped with context, as the other loaders in this file do, so a
+// JSON error identifies the file it came from. The underlying error remains
+// reachable through errors.Is / errors.As.
+func LoadCfgs(onnxDir string) (Config, error) {
+	cfgPath := filepath.Join(onnxDir, "tts.json")
+	data, err := os.ReadFile(cfgPath)
+	if err != nil {
+		return Config{}, fmt.Errorf("failed to read TTS config: %w", err)
+	}
+
+	var cfg Config
+	if err := json.Unmarshal(data, &cfg); err != nil {
+		return Config{}, fmt.Errorf("failed to parse TTS config %s: %w", cfgPath, err)
+	}
+
+	return cfg, nil
+}
+
+// loadJSONInt64 reads filePath and decodes its content as a JSON array of
+// integers. Read and decode errors are returned unchanged, with a nil slice.
+func loadJSONInt64(filePath string) ([]int64, error) {
+	raw, readErr := os.ReadFile(filePath)
+	if readErr != nil {
+		return nil, readErr
+	}
+
+	var values []int64
+	decodeErr := json.Unmarshal(raw, &values)
+	if decodeErr != nil {
+		return nil, decodeErr
+	}
+	return values, nil
+}
+
+// ArrayToTensor flattens a three-level float64 array, outermost index first,
+// into a float32 buffer and wraps it in a tensor of the given shape.
+//
+// The buffer is sized from the product of shape, not from array, so elements
+// not covered by array stay zero. It panics if tensor creation fails.
+func ArrayToTensor(array [][][]float64, shape []int64) *ort.Tensor[float32] {
+	elemCount := int64(1)
+	for i := range shape {
+		elemCount *= shape[i]
+	}
+	buf := make([]float32, elemCount)
+
+	pos := 0
+	for _, plane := range array {
+		for _, row := range plane {
+			for _, v := range row {
+				buf[pos] = float32(v)
+				pos++
+			}
+		}
+	}
+
+	result, err := ort.NewTensor(shape, buf)
+	if err != nil {
+		panic(err)
+	}
+	return result
+}
+
+// IntArrayToTensor flattens a two-level int64 array row by row into a buffer
+// and wraps it in a tensor of the given shape.
+//
+// The buffer is sized from the product of shape, not from array, so elements
+// not covered by array stay zero. It panics if tensor creation fails.
+func IntArrayToTensor(array [][]int64, shape []int64) *ort.Tensor[int64] {
+	elemCount := int64(1)
+	for i := range shape {
+		elemCount *= shape[i]
+	}
+	buf := make([]int64, elemCount)
+
+	pos := 0
+	for _, row := range array {
+		for _, v := range row {
+			buf[pos] = v
+			pos++
+		}
+	}
+
+	result, err := ort.NewTensor(shape, buf)
+	if err != nil {
+		panic(err)
+	}
+	return result
+}
diff --git a/backend/go/supertonic/main.go b/backend/go/supertonic/main.go
new file mode 100644
index 000000000000..49e9ea3a5b26
--- /dev/null
+++ b/backend/go/supertonic/main.go
@@ -0,0 +1,27 @@
+package main
+
+// Started internally by LocalAI; a server is allocated per model.
+
+import (
+	"flag"
+
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+	ort "github.com/yalue/onnxruntime_go"
+)
+
+// addr holds the value of the -addr flag; main passes it to grpc.StartServer.
+var addr = flag.String("addr", "localhost:50051", "the address to connect to")
+
+// main parses the command-line flags, initializes ONNX Runtime and then
+// starts the gRPC server with the Supertonic backend on *addr. It panics if
+// either the initialization or the server start returns an error.
+func main() {
+	flag.Parse()
+
+	// InitializeONNXRuntime reads ONNXRUNTIME_LIB_PATH (set by run.sh) and
+	// dlopens libonnxruntime before any session is created in Load().
+	initErr := InitializeONNXRuntime()
+	if initErr != nil {
+		panic(initErr)
+	}
+	defer func() { _ = ort.DestroyEnvironment() }()
+
+	serveErr := grpc.StartServer(*addr, &SupertonicBackend{})
+	if serveErr != nil {
+		panic(serveErr)
+	}
+}
diff --git a/backend/go/supertonic/main_suite_test.go b/backend/go/supertonic/main_suite_test.go
new file mode 100644
index 000000000000..90bcb61f54b1
--- /dev/null
+++ b/backend/go/supertonic/main_suite_test.go
@@ -0,0 +1,13 @@
+package main
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestSupertonic is the `go test` entry point for the Ginkgo suite: it
+// registers Ginkgo's Fail as the Gomega fail handler and then runs the specs
+// under the suite name given below.
+func TestSupertonic(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Supertonic backend test suite")
+}
diff --git a/backend/go/supertonic/package.sh b/backend/go/supertonic/package.sh
new file mode 100755
index 000000000000..9e2a016256a9
--- /dev/null
+++ b/backend/go/supertonic/package.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+REPO_ROOT="${CURDIR}/../../.."
+
+mkdir -p $CURDIR/package/lib
+
+cp -avf $CURDIR/supertonic $CURDIR/package/
+cp -avf $CURDIR/run.sh $CURDIR/package/
+cp -rfLv $CURDIR/backend-assets/lib/* $CURDIR/package/lib/
+
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+echo "Packaging completed successfully"
+ls -liah $CURDIR/package/
+ls -liah $CURDIR/package/lib/
diff --git a/backend/go/supertonic/run.sh b/backend/go/supertonic/run.sh
new file mode 100755
index 000000000000..2dabf7eb3337
--- /dev/null
+++ b/backend/go/supertonic/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -ex
+
+CURDIR=$(dirname "$(realpath $0)")
+
+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so
+
+if [ -f $CURDIR/lib/ld.so ]; then
+	echo "Using lib/ld.so"
+	exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@"
+fi
+
+exec $CURDIR/supertonic "$@"
diff --git a/backend/index.yaml b/backend/index.yaml
index 19483ab03900..919254cc473b 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -1368,6 +1368,20 @@
     nvidia: "cuda12-sherpa-onnx"
     nvidia-cuda-12: "cuda12-sherpa-onnx"
     metal: "metal-sherpa-onnx"
+- &supertonic
+  name: "supertonic"
+  alias: "supertonic"
+  urls:
+    - https://github.com/supertone-inc/supertonic
+  description: |
+    Supertonic backend: lightning-fast, on-device multilingual text-to-speech via ONNX Runtime.
+    Runs Supertone's flow-matching TTS model (Supertone/supertonic-3), 44.1kHz output, 31 languages,
+    multiple preset voice styles. No espeak-ng dependency.
+  tags:
+    - text-to-speech
+    - TTS
+  capabilities:
+    default: "cpu-supertonic"
 - !!merge <<: *neutts
   name: "neutts-development"
   capabilities:
@@ -5132,3 +5146,18 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-sherpa-onnx"
   mirrors:
     - localai/localai-backends:master-metal-darwin-arm64-sherpa-onnx
+## supertonic
+- !!merge <<: *supertonic
+  name: "supertonic-development"
+  capabilities:
+    default: "cpu-supertonic-development"
+- !!merge <<: *supertonic
+  name: "cpu-supertonic"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-supertonic"
+  mirrors:
+    - localai/localai-backends:latest-cpu-supertonic
+- !!merge <<: *supertonic
+  name: "cpu-supertonic-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-supertonic"
+  mirrors:
+    - localai/localai-backends:master-cpu-supertonic
diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go
index c03d52ee4f4b..900fa2de12b7 100644
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -434,6 +434,13 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "json-editor",
 			Order:       78,
 		},
+		"pipeline.max_history_items": {
+			Section:     "pipeline",
+			Label:       "Max History Items",
+			Description: "Cap how many trailing conversation items are fed to the LLM each realtime turn (0 = unlimited, rely on the LLM's context window). Set it on a composed pipeline (VAD+STT+LLM+TTS) so a long-running session doesn't grow until the context fills. Unset uses the per-model-type default.",
+			Component:   "number",
+			Order:       79,
+		},
 
 		// --- Functions ---
 		"function.grammar.parallel_calls": {
diff --git a/core/http/endpoints/localai/backend.go b/core/http/endpoints/localai/backend.go
index 331e49e43673..4b86095e23dd 100644
--- a/core/http/endpoints/localai/backend.go
+++ b/core/http/endpoints/localai/backend.go
@@ -38,6 +38,7 @@ var knownPrefOnlyBackends = []schema.KnownBackend{
 	{Name: "qwen3-tts-cpp", Modality: "tts", AutoDetect: false, Description: "Qwen3 TTS C++ (preference-only)"},
 	{Name: "omnivoice-cpp", Modality: "tts", AutoDetect: false, Description: "OmniVoice C++ TTS with voice cloning and voice design (preference-only)"},
 	{Name: "faster-qwen3-tts", Modality: "tts", AutoDetect: false, Description: "Faster Qwen3 TTS (preference-only)"},
+	{Name: "supertonic", Modality: "tts", AutoDetect: false, Description: "Supertonic multilingual ONNX TTS (preference-only)"},
 	// Detection
 	{Name: "sam3-cpp", Modality: "detection", AutoDetect: false, Description: "SAM3 C++ object detection (preference-only)"},
 	// Audio transform (audio-in / audio-out, optional reference signal)
diff --git a/core/http/endpoints/localai/backend_test.go b/core/http/endpoints/localai/backend_test.go
index 0c21bb7b4f6a..2f82450bd76f 100644
--- a/core/http/endpoints/localai/backend_test.go
+++ b/core/http/endpoints/localai/backend_test.go
@@ -145,6 +145,7 @@ var _ = Describe("Backend Endpoints", func() {
 			expectPrefOnly("qwen-tts", "tts")
 			expectPrefOnly("qwen3-tts-cpp", "tts")
 			expectPrefOnly("faster-qwen3-tts", "tts")
+			expectPrefOnly("supertonic", "tts")
 			expectPrefOnly("sam3-cpp", "detection")
 		})
 
diff --git a/gallery/index.yaml b/gallery/index.yaml
index 446ed0269def..06a37c0cce5a 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -3510,6 +3510,78 @@
     - filename: kokoro-int8-multi-lang-v1_0.tar.bz2
       sha256: 75654a84864be26f345f020f4070c2c019e96dd1b7f9bf6e2ffd59efac6aa5a3
       uri: https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-int8-multi-lang-v1_0.tar.bz2
+- name: supertonic-3
+  url: github:mudler/LocalAI/gallery/supertonic.yaml@master
+  urls:
+    - https://github.com/supertone-inc/supertonic
+    - https://huggingface.co/Supertone/supertonic-3
+  description: |
+    Supertonic multilingual text-to-speech (Supertone/supertonic-3), served through the native supertonic backend via ONNX Runtime. Lightning-fast on-device flow-matching TTS with 44.1 kHz output, 31 languages, and 10 preset voice styles (F1-F5, M1-M5). No espeak-ng dependency. Defaults to voice F1; override per request with the OpenAI `voice` field, and optionally pass `language=` (e.g. en, ko, ja, it; "na" for language-agnostic).
+  license: mit
+  icon: https://huggingface.co/Supertone/supertonic-3/resolve/main/img/Supertonic3_HeroImage.png
+  tags:
+    - text-to-speech
+    - tts
+    - multilingual
+    - onnx
+    - supertonic
+    - flow-matching
+    - multi-speaker
+  last_checked: "2026-06-15"
+  overrides:
+    known_usecases:
+      - tts
+    parameters:
+      model: supertonic-3/onnx/tts.json
+  files:
+    - filename: supertonic-3/onnx/duration_predictor.onnx
+      sha256: c3eb91414d5ff8a7a239b7fe9e34e7e2bf8a8140d8375ffb14718b1c639325db
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/duration_predictor.onnx
+    - filename: supertonic-3/onnx/text_encoder.onnx
+      sha256: c7befd5ea8c3119769e8a6c1486c4edc6a3bc8365c67621c881bbb774b9902ff
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/text_encoder.onnx
+    - filename: supertonic-3/onnx/vector_estimator.onnx
+      sha256: 883ac868ea0275ef0e991524dc64f16b3c0376efd7c320af6b53f5b780d7c61c
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/vector_estimator.onnx
+    - filename: supertonic-3/onnx/vocoder.onnx
+      sha256: 085de76dd8e8d5836d6ca66826601f615939218f90e519f70ee8a36ed2a4c4ba
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/vocoder.onnx
+    - filename: supertonic-3/onnx/tts.json
+      sha256: 42078d3aef1cd43ab43021f3c54f47d2d75ceb4e75f627f118890128b06a0d09
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/tts.json
+    - filename: supertonic-3/onnx/unicode_indexer.json
+      sha256: 9bf7346e43883a81f8645c81224f786d43c5b57f3641f6e7671a7d6c493cb24f
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/unicode_indexer.json
+    - filename: supertonic-3/voice_styles/F1.json
+      sha256: bbdec6ee00231c2c742ad05483df5334cab3b52fda3ba38e6a07059c4563dbc2
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F1.json
+    - filename: supertonic-3/voice_styles/F2.json
+      sha256: 7c722c6a72707b1a77f035d67f0d1351ba187738e06f7683e8c72b1df3477fc6
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F2.json
+    - filename: supertonic-3/voice_styles/F3.json
+      sha256: 12f6ef2573baa2defa1128069cb59f203e3ab67c92af77b42df8a0e3a2f7c6ab
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F3.json
+    - filename: supertonic-3/voice_styles/F4.json
+      sha256: c2fa764c1225a76dfc3e2c73e8aa4f70d9ee48793860eb34c295fff01c2e032b
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F4.json
+    - filename: supertonic-3/voice_styles/F5.json
+      sha256: 45966e73316415626cf41a7d1c6f3b4c70dbc1ba2bee5c1978ef0ce33244fc8d
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F5.json
+    - filename: supertonic-3/voice_styles/M1.json
+      sha256: e35604687f5d23694b8e91593a93eec0e4eca6c0b02bb8ed69139ab2ea6b0a5b
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M1.json
+    - filename: supertonic-3/voice_styles/M2.json
+      sha256: b76cbf62bac707c710cf0ae5aba5e31eea1a6339a9734bfae33ab98499534a50
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M2.json
+    - filename: supertonic-3/voice_styles/M3.json
+      sha256: ea1ac35ccb91b0d7ecad533a2fbd0eec10c91513d8951e3b25fbba99954e159b
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M3.json
+    - filename: supertonic-3/voice_styles/M4.json
+      sha256: ca8eefad4fcd989c9379032ff3e50738adc547eeb5e221b82593a6d7b3bac303
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M4.json
+    - filename: supertonic-3/voice_styles/M5.json
+      sha256: dd22b92740314321f8ae11c5e87f8dd60d060f15dd3a632b5adf77f471f77af2
+      uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M5.json
 - name: voxcpm-1.5
   url: github:mudler/LocalAI/gallery/virtual.yaml@master
   urls:
diff --git a/gallery/supertonic.yaml b/gallery/supertonic.yaml
new file mode 100644
index 000000000000..d51a0c88206f
--- /dev/null
+++ b/gallery/supertonic.yaml
@@ -0,0 +1,19 @@
+---
+name: "supertonic"
+
+config_file: |
+  backend: supertonic
+  options:
+    # Generation knobs read by the supertonic backend at TTS time.
+    # steps = flow-matching denoising steps (quality); speed = speech rate;
+    # silence = seconds of silence inserted between chunks of long inputs.
+    - supertonic.steps=8
+    - supertonic.speed=1.05
+    - supertonic.silence=0.3
+    # Voice style used when a request omits `voice`. The model ships
+    # F1-F5 / M1-M5 under voice_styles/; override per request via the
+    # OpenAI `voice` field.
+    - supertonic.default_voice=F1
+    # Default language tag when a request omits `language`. "na" is the
+    # model's language-agnostic mode.
+    - supertonic.default_lang=na
diff --git a/go.mod b/go.mod
index e0d0c01af756..2735de2653d5 100644
--- a/go.mod
+++ b/go.mod
@@ -65,6 +65,7 @@ require (
 	github.com/testcontainers/testcontainers-go/modules/nats v0.42.0
 	github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0
 	github.com/timbutler/zxcvbn v1.0.4
+	github.com/yalue/onnxruntime_go v1.11.0
 	go.opentelemetry.io/otel v1.44.0
 	go.opentelemetry.io/otel/exporters/prometheus v0.66.0
 	go.opentelemetry.io/otel/metric v1.44.0
@@ -497,7 +498,7 @@ require (
 	golang.org/x/sync v0.20.0
 	golang.org/x/sys v0.45.0 // indirect
 	golang.org/x/term v0.43.0
-	golang.org/x/text v0.37.0 // indirect
+	golang.org/x/text v0.37.0
 	golang.org/x/tools v0.45.0 // indirect
 	golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect
 	golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb // indirect
diff --git a/go.sum b/go.sum
index 6a0bb516bcbe..45feafef58a6 100644
--- a/go.sum
+++ b/go.sum
@@ -1377,6 +1377,8 @@ github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavM
 github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
 github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
 github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
+github.com/yalue/onnxruntime_go v1.11.0 h1:aKH4yPIbqfcB3SfnQWq/WxzLelkyolntHnffL3eMBHY=
+github.com/yalue/onnxruntime_go v1.11.0/go.mod h1:b4X26A8pekNb1ACJ58wAXgNKeUCGEAQ9dmACut9Sm/4=
 github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4=
 github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4=
 github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=