diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml index 953913293953..318d99832d65 100644 --- a/.github/backend-matrix.yml +++ b/.github/backend-matrix.yml @@ -4490,6 +4490,36 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + # supertonic CPU (amd64) + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + platform-tag: 'amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-supertonic' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "supertonic" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + # supertonic CPU (arm64) + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/arm64' + platform-tag: 'arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-supertonic' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "supertonic" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' # Darwin matrix (consumed by backend-jobs-darwin). includeDarwin: diff --git a/.github/workflows/secscan.yaml b/.github/workflows/secscan.yaml index bb381567baa5..a8bac30dd1f7 100644 --- a/.github/workflows/secscan.yaml +++ b/.github/workflows/secscan.yaml @@ -21,7 +21,10 @@ jobs: uses: securego/gosec@v2.27.1 with: # we let the report trigger content trigger a failure using the GitHub Security features. - args: '-no-fail -fmt sarif -out results.sarif ./...' + # backend/go/supertonic is excluded: it vendors upstream supertone-inc/supertonic + # (helper.go), whose findings (G304 model-file loads, G404 math/rand for flow-matching + # noise, G104 unhandled errors) are inherent to that upstream code, not ours to rewrite. + args: '-no-fail -exclude-dir=backend/go/supertonic -fmt sarif -out results.sarif ./...' 
- name: Upload SARIF file if: ${{ github.actor != 'dependabot[bot]' }} uses: github/codeql-action/upload-sarif@v4 diff --git a/.golangci.yml b/.golangci.yml index dceb32374fa8..d25d1ccb4789 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -74,6 +74,8 @@ linters: paths: # Upstream whisper.cpp source tree fetched by the whisper backend Makefile. - 'backend/go/whisper/sources' + # Vendored upstream supertonic pipeline (supertone-inc/supertonic go/helper.go). + - 'backend/go/supertonic/helper.go' - 'docs/' rules: # CLI entry points: kong's `env:"..."` tag is the legitimate env→struct diff --git a/Makefile b/Makefile index ecca9d3c7b9e..5db33f1ac4f2 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio +.NOTPARALLEL: backends/diffusers 
backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/omnivoice-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio backends/supertonic GOCMD=go GOTEST=$(GOCMD) test @@ -595,6 +595,7 @@ test-extra: prepare-test-extra $(MAKE) -C backend/rust/kokoros test $(MAKE) -C backend/go/rfdetr-cpp test $(MAKE) -C backend/go/locate-anything-cpp test + $(MAKE) -C backend/go/supertonic test ## ## End-to-end gRPC tests that exercise a built backend container image. 
@@ -1181,6 +1182,7 @@ BACKEND_VIBEVOICE_CPP = vibevoice-cpp|golang|.|false|true BACKEND_LOCALVQE = localvqe|golang|.|false|true BACKEND_OPUS = opus|golang|.|false|true BACKEND_SHERPA_ONNX = sherpa-onnx|golang|.|false|true +BACKEND_SUPERTONIC = supertonic|golang|.|false|true # Python backends with root context BACKEND_RERANKERS = rerankers|python|.|false|true @@ -1308,12 +1310,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_KOKOROS))) $(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP))) $(eval $(call generate-docker-build-target,$(BACKEND_RFDETR_CPP))) $(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX))) +$(eval $(call generate-docker-build-target,$(BACKEND_SUPERTONIC))) # Pattern rule for docker-save targets docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-omnivoice-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy +docker-build-backends: docker-build-llama-cpp 
docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-crispasr docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-rfdetr-cpp docker-build-qwen3-tts-cpp docker-build-omnivoice-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy docker-build-supertonic
 
 ########################################################
 ### Mock Backend for E2E Tests
diff --git a/backend/go/supertonic/.gitignore b/backend/go/supertonic/.gitignore
new file mode 100644
index 000000000000..10f6d5c1f85e
--- /dev/null
+++ b/backend/go/supertonic/.gitignore
@@ -0,0 +1,4 @@
+/supertonic
+/sources/
+/backend-assets/
+/package/
diff --git a/backend/go/supertonic/Makefile b/backend/go/supertonic/Makefile
new file mode 100644
index 000000000000..ab4991442bf2
--- /dev/null
+++ b/backend/go/supertonic/Makefile
@@ -0,0 +1,69 @@
+CURRENT_DIR=$(abspath ./)
+GOCMD=go
+
+ONNX_VERSION?=1.24.4
+ONNX_ARCH?=x64
+ONNX_OS?=linux
+
+# OS is normally only present in the environment on Windows; default it from
+# uname so the Darwin branch below also fires when the caller did not pass OS.
+OS?=$(shell uname -s)
+
+ifneq (,$(findstring aarch64,$(shell uname -m)))
+  ONNX_ARCH=aarch64
+endif
+
+ifeq ($(OS),Darwin)
+  ONNX_OS=osx
+  ifneq (,$(findstring arm64,$(shell uname -m)))
+    ONNX_ARCH=arm64
+  else
+    ONNX_ARCH=x86_64
+  endif
+endif
+
+# CUDA 12 ships as -gpu, CUDA 13 as -gpu_cuda13 (underscore). CPU has no suffix.
+ifeq ($(BUILD_TYPE),cublas)
+  ONNX_PROVIDER=cuda
+  ifeq ($(CUDA_MAJOR_VERSION),13)
+    ONNX_VARIANT=-gpu_cuda13
+  else
+    ONNX_VARIANT=-gpu
+  endif
+else
+  ONNX_VARIANT=
+  ONNX_PROVIDER=cpu
+endif
+
+# -f makes curl fail on an HTTP error instead of saving the error page as the
+# tarball. On failure the directory is removed so the next run retries rather
+# than treating a half-populated target as up to date.
+sources/onnxruntime:
+	mkdir -p sources/onnxruntime
+	curl -fL https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)$(ONNX_VARIANT)-$(ONNX_VERSION).tgz \
+		-o sources/onnxruntime/onnxruntime.tgz || { rm -rf sources/onnxruntime; exit 1; }
+	cd sources/onnxruntime && tar -xf onnxruntime.tgz --strip-components=1 && rm onnxruntime.tgz
+
+backend-assets/lib: sources/onnxruntime
+	mkdir -p backend-assets/lib
+	cp -rfLv sources/onnxruntime/lib/* backend-assets/lib/
+
+supertonic: backend-assets/lib
+	CGO_ENABLED=1 $(GOCMD) build \
+		-ldflags "$(LD_FLAGS) -X main.onnxProvider=$(ONNX_PROVIDER)" \
+		-tags "$(GO_TAGS)" -o supertonic ./
+
+package:
+	bash package.sh
+
+build: supertonic package
+
+# Tests need only the Go toolchain (gcc); yalue dlopens onnxruntime at
+# runtime, so no tarball download is required to compile or run unit specs.
+test:
+	CGO_ENABLED=1 $(GOCMD) test -v -timeout 120s ./...
+
+clean:
+	rm -rf supertonic sources/ backend-assets/ package/
+
+.PHONY: build package clean test
diff --git a/backend/go/supertonic/backend.go b/backend/go/supertonic/backend.go
new file mode 100644
index 000000000000..36028d33d80b
--- /dev/null
+++ b/backend/go/supertonic/backend.go
@@ -0,0 +1,307 @@
+package main
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+
+	laudio "github.com/mudler/LocalAI/pkg/audio"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// onnxProvider is set via -ldflags "-X main.onnxProvider=cuda" by the
+// CUDA build (later phase). Defaults to CPU.
+var onnxProvider = "cpu"
+
+// Per-model generation defaults, overridable via ModelOptions.Options
+// (each entry has the form "key=value"):
+//
+//	supertonic.steps=N             denoising steps (quality), default 8
+//	supertonic.speed=F             speech rate, default 1.05
+//	supertonic.silence=F           inter-chunk silence in seconds, default 0.3
+//	supertonic.default_voice=NAME  voice-style used when the request omits voice
+//	supertonic.default_lang=TAG    language tag used when the request omits it
+const (
+	optionSteps        = "supertonic.steps="
+	optionSpeed        = "supertonic.speed="
+	optionSilence      = "supertonic.silence="
+	optionDefaultVoice = "supertonic.default_voice="
+	optionDefaultLang  = "supertonic.default_lang="
+)
+
+// SupertonicBackend is the gRPC backend wrapping a loaded supertonic
+// pipeline. Generation settings are captured once in Load; voice styles are
+// loaded lazily per request and cached in styles under styleMu.
+type SupertonicBackend struct {
+	base.SingleThread
+
+	tts          *TextToSpeech
+	cfg          Config
+	modelDir     string
+	voicesDir    string
+	defaultVoice string
+	defaultLang  string
+	steps        int
+	speed        float32
+	silence      float32
+
+	styleMu sync.Mutex
+	styles  map[string]*Style // voice name -> loaded style cache
+}
+
+// Load resolves the model directory from opts.ModelFile, reads the pipeline
+// configuration, loads the ONNX models and records the per-model generation
+// options. Option values that cannot be parsed, or that are out of range
+// (steps < 1, speed <= 0, silence < 0, NaN), fall back to the defaults.
+func (s *SupertonicBackend) Load(opts *pb.ModelOptions) error {
+	const (
+		defaultSteps   = 8
+		defaultSpeed   = 1.05
+		defaultSilence = 0.3
+	)
+
+	modelDir, err := resolveModelDir(opts.ModelFile)
+	if err != nil {
+		return err
+	}
+	s.modelDir = modelDir
+	s.voicesDir = resolveVoicesDir(modelDir)
+
+	cfg, err := LoadCfgs(modelDir)
+	if err != nil {
+		return fmt.Errorf("loading tts.json from %s: %w", modelDir, err)
+	}
+	s.cfg = cfg
+
+	// onnxProvider is "cpu" for the CPU build; the CUDA build sets it to
+	// "cuda" via -ldflags. Upstream LoadTextToSpeech still errors on GPU
+	// until the CUDA phase wires the execution provider.
+	tts, err := LoadTextToSpeech(modelDir, onnxProvider == "cuda", cfg)
+	if err != nil {
+		return fmt.Errorf("loading supertonic models from %s: %w", modelDir, err)
+	}
+	s.tts = tts
+
+	s.steps = int(findOptionInt(opts, optionSteps, defaultSteps))
+	if s.steps < 1 {
+		s.steps = defaultSteps
+	}
+	s.speed = findOptionFloat(opts, optionSpeed, defaultSpeed)
+	if !(s.speed > 0) { // negated form so NaN is rejected too
+		s.speed = defaultSpeed
+	}
+	s.silence = findOptionFloat(opts, optionSilence, defaultSilence)
+	if !(s.silence >= 0) { // negated form so NaN is rejected too
+		s.silence = defaultSilence
+	}
+	s.defaultVoice = findOptionValue(opts, optionDefaultVoice, "")
+	s.defaultLang = findOptionValue(opts, optionDefaultLang, "na")
+	s.styles = map[string]*Style{}
+	return nil
+}
+
+// TTS synthesizes req.Text and writes the result as a WAV file to req.Dst.
+func (s *SupertonicBackend) TTS(req *pb.TTSRequest) error {
+	wav, sr, err := s.synthesize(req)
+	if err != nil {
+		return err
+	}
+	// writeWavFile takes float64 samples, so widen the float32 PCM first.
+	out := make([]float64, len(wav))
+	for i, v := range wav {
+		out[i] = float64(v)
+	}
+	if err := writeWavFile(req.Dst, out, sr); err != nil {
+		return fmt.Errorf("writing wav to %s: %w", req.Dst, err)
+	}
+	return nil
+}
+
+// TTSStream synthesizes the whole utterance, then sends a streaming WAV
+// header followed by 16-bit little-endian PCM in chunks of 4096 samples.
+// results is always closed on return, including on error.
+func (s *SupertonicBackend) TTSStream(req *pb.TTSRequest, results chan []byte) error {
+	defer close(results)
+
+	wav, sr, err := s.synthesize(req)
+	if err != nil {
+		return err
+	}
+
+	results <- streamingWAVHeader(uint32(sr))
+
+	const chunkSamples = 4096
+	for off := 0; off < len(wav); off += chunkSamples {
+		end := off + chunkSamples
+		if end > len(wav) {
+			end = len(wav)
+		}
+		results <- pcmFloatToInt16LE(wav[off:end])
+	}
+	return nil
+}
+
+// synthesize runs the full pipeline and returns the trimmed mono float32
+// PCM and its sample rate.
+func (s *SupertonicBackend) synthesize(req *pb.TTSRequest) ([]float32, int, error) {
+	switch {
+	case s.tts == nil:
+		return nil, 0, fmt.Errorf("supertonic model not loaded")
+	case strings.TrimSpace(req.Text) == "":
+		return nil, 0, fmt.Errorf("empty text")
+	}
+
+	voice, err := s.loadStyle(s.voiceName(req.Voice))
+	if err != nil {
+		return nil, 0, err
+	}
+
+	// A missing request language is treated like an empty one, which makes
+	// resolveLang fall back to the model default.
+	requested := ""
+	if req.Language != nil {
+		requested = *req.Language
+	}
+	lang := s.resolveLang(requested)
+
+	samples, seconds, err := s.tts.Call(req.Text, lang, voice, s.steps, s.speed, s.silence)
+	if err != nil {
+		return nil, 0, err
+	}
+
+	rate := s.tts.SampleRate
+	// Call returns concatenated audio; keep only the reported duration,
+	// clamped to the number of samples actually produced.
+	keep := int(float32(rate) * seconds)
+	switch {
+	case keep < 0:
+		keep = 0
+	case keep > len(samples):
+		keep = len(samples)
+	}
+	return samples[:keep], rate, nil
+}
+
+// voiceName returns the trimmed request voice, or the model's default voice
+// when the request voice is blank.
+func (s *SupertonicBackend) voiceName(reqVoice string) string {
+	if trimmed := strings.TrimSpace(reqVoice); trimmed != "" {
+		return trimmed
+	}
+	return s.defaultVoice
+}
+
+// resolveLang returns the first valid language among the trimmed request
+// language and the model default, in that order; "na" if neither is valid.
+func (s *SupertonicBackend) resolveLang(reqLang string) string {
+	for _, candidate := range [...]string{strings.TrimSpace(reqLang), s.defaultLang} {
+		if candidate != "" && isValidLang(candidate) {
+			return candidate
+		}
+	}
+	return "na"
+}
+
+// loadStyle resolves and caches a voice-style. An empty name with no model
+// default is an error (supertonic requires a style embedding).
+func (s *SupertonicBackend) loadStyle(name string) (*Style, error) {
+	if name == "" {
+		return nil, fmt.Errorf("no voice specified and no supertonic.default_voice set")
+	}
+
+	// The lock is held across the disk load as well as the cache access.
+	s.styleMu.Lock()
+	defer s.styleMu.Unlock()
+
+	cached, hit := s.styles[name]
+	if hit {
+		return cached, nil
+	}
+
+	stylePath := s.voiceStylePath(name)
+	loaded, err := LoadVoiceStyle([]string{stylePath}, false)
+	if err != nil {
+		return nil, fmt.Errorf("loading voice style %q (%s): %w", name, stylePath, err)
+	}
+	s.styles[name] = loaded
+	return loaded, nil
+}
+
+// voiceStylePath turns a voice name into the path of its style JSON. A
+// missing ".json" suffix is appended first. Absolute paths are returned
+// unchanged, names containing a path separator are joined onto modelDir, and
+// bare names are joined onto voicesDir.
+func (s *SupertonicBackend) voiceStylePath(name string) string {
+	file := name
+	if !strings.HasSuffix(file, ".json") {
+		file += ".json"
+	}
+	switch {
+	case filepath.IsAbs(file):
+		return file
+	case strings.ContainsRune(file, filepath.Separator):
+		return filepath.Join(s.modelDir, file)
+	default:
+		return filepath.Join(s.voicesDir, file)
+	}
+}
+
+// resolveVoicesDir picks the voice_styles directory for a model. It returns
+// modelDir/voice_styles if that is an existing directory, otherwise the
+// voice_styles directory next to modelDir if that exists (the HF layout keeps
+// the ONNX files in an onnx/ subdir with voice_styles/ as its sibling). When
+// neither exists it returns modelDir/voice_styles anyway.
+func resolveVoicesDir(modelDir string) string {
+	inside := filepath.Join(modelDir, "voice_styles")
+	sibling := filepath.Join(filepath.Dir(modelDir), "voice_styles")
+	for _, dir := range [...]string{inside, sibling} {
+		if fi, err := os.Stat(dir); err == nil && fi.IsDir() {
+			return dir
+		}
+	}
+	return inside
+}
+
+// resolveModelDir accepts either a directory (used as-is) or a file (its
+// parent dir is used).
+func resolveModelDir(modelFile string) (string, error) {
+	if modelFile == "" {
+		return "", fmt.Errorf("empty model path")
+	}
+	info, err := os.Stat(modelFile)
+	if err != nil {
+		return "", fmt.Errorf("stat model path %s: %w", modelFile, err)
+	}
+	if info.IsDir() {
+		return modelFile, nil
+	}
+	return filepath.Dir(modelFile), nil
+}
+
+// ---- option helpers (mirrors backend/go/sherpa-onnx/backend.go) ----
+
+// findOptionValue returns the text after prefix from the first entry of
+// opts.Options that starts with prefix, or def when no entry matches.
+func findOptionValue(opts *pb.ModelOptions, prefix, def string) string {
+	for _, o := range opts.Options {
+		if strings.HasPrefix(o, prefix) {
+			return strings.TrimPrefix(o, prefix)
+		}
+	}
+	return def
+}
+
+// findOptionFloat parses the option selected by prefix as a float32. It
+// returns def when the option is absent, empty, or not a valid number.
+func findOptionFloat(opts *pb.ModelOptions, prefix string, def float32) float32 {
+	raw := findOptionValue(opts, prefix, "")
+	if raw == "" {
+		return def
+	}
+	v, err := strconv.ParseFloat(raw, 32)
+	if err != nil {
+		return def
+	}
+	return float32(v)
+}
+
+// findOptionInt parses the option selected by prefix as a base-10 int32. It
+// returns def when the option is absent, empty, or not a valid integer.
+func findOptionInt(opts *pb.ModelOptions, prefix string, def int32) int32 {
+	raw := findOptionValue(opts, prefix, "")
+	if raw == "" {
+		return def
+	}
+	v, err := strconv.ParseInt(raw, 10, 32)
+	if err != nil {
+		return def
+	}
+	return int32(v)
+}
+
+// ---- PCM helpers ----
+
+// pcmFloatToInt16LE converts float samples (nominally in [-1, 1]) to signed
+// 16-bit little-endian PCM. Each sample is scaled by 32767, clamped to the
+// int16 range and truncated toward zero; NaN samples become silence.
+//
+// The clamp is applied while the value is still a float: converting an
+// out-of-range or NaN float to an integer gives an implementation-dependent
+// result in Go, so clamping after the conversion could turn a large positive
+// sample into a full-scale negative one on some platforms.
+func pcmFloatToInt16LE(samples []float32) []byte {
+	buf := make([]byte, len(samples)*2)
+	for i, f := range samples {
+		scaled := f * 32767
+		var v int16
+		switch {
+		case scaled >= 32767:
+			v = 32767
+		case scaled <= -32768:
+			v = -32768
+		case scaled > -32768:
+			// In range, so the conversion is exact up to truncation.
+			v = int16(scaled)
+		default:
+			// Only NaN fails every comparison above.
+			v = 0
+		}
+		binary.LittleEndian.PutUint16(buf[2*i:], uint16(v))
+	}
+	return buf
+}
+
+// streamingWAVHeader serializes a WAV header for a stream of unknown length:
+// both the data size and ChunkSize are set to the 0xFFFFFFFF placeholder.
+func streamingWAVHeader(sampleRate uint32) []byte {
+	const streamingSize = 0xFFFFFFFF
+	h := laudio.NewWAVHeaderWithRate(streamingSize, sampleRate)
+	h.ChunkSize = streamingSize
+	var buf bytes.Buffer
+	// NOTE(review): the write error is discarded; presumably it cannot fail
+	// when the destination is a bytes.Buffer - confirm against pkg/audio.
+	_ = h.Write(&buf)
+	return buf.Bytes()
+}
diff --git a/backend/go/supertonic/backend_test.go b/backend/go/supertonic/backend_test.go
new file mode 100644
index 000000000000..d5027a082a13
--- /dev/null
+++ b/backend/go/supertonic/backend_test.go
@@ -0,0 +1,86 @@
+package main
+
+import (
+	"os"
+	"path/filepath"
+
+	.
"github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + pb "github.com/mudler/LocalAI/pkg/grpc/proto" +) + +var _ = Describe("voiceStylePath", func() { + s := &SupertonicBackend{modelDir: "/models/st/onnx", voicesDir: "/models/st/voice_styles"} + + It("resolves a bare name under the resolved voicesDir", func() { + Expect(s.voiceStylePath("M1")).To(Equal(filepath.Join("/models/st/voice_styles", "M1.json"))) + }) + It("keeps an explicit .json suffix", func() { + Expect(s.voiceStylePath("M1.json")).To(Equal(filepath.Join("/models/st/voice_styles", "M1.json"))) + }) + It("honors absolute paths", func() { + Expect(s.voiceStylePath("/abs/v.json")).To(Equal("/abs/v.json")) + }) +}) + +var _ = Describe("resolveVoicesDir", func() { + It("prefers voice_styles under modelDir", func() { + dir := GinkgoT().TempDir() + Expect(os.MkdirAll(filepath.Join(dir, "voice_styles"), 0o755)).To(Succeed()) + Expect(resolveVoicesDir(dir)).To(Equal(filepath.Join(dir, "voice_styles"))) + }) + It("falls back to the sibling voice_styles next to an onnx subdir", func() { + root := GinkgoT().TempDir() + Expect(os.MkdirAll(filepath.Join(root, "voice_styles"), 0o755)).To(Succeed()) + Expect(os.MkdirAll(filepath.Join(root, "onnx"), 0o755)).To(Succeed()) + Expect(resolveVoicesDir(filepath.Join(root, "onnx"))).To(Equal(filepath.Join(root, "voice_styles"))) + }) +}) + +var _ = Describe("resolveLang", func() { + It("accepts a valid request language", func() { + s := &SupertonicBackend{defaultLang: "na"} + Expect(s.resolveLang("ko")).To(Equal("ko")) + }) + It("falls back to the model default for an invalid language", func() { + s := &SupertonicBackend{defaultLang: "en"} + Expect(s.resolveLang("zz")).To(Equal("en")) + }) + It("falls back to na when nothing is valid", func() { + s := &SupertonicBackend{defaultLang: ""} + Expect(s.resolveLang("")).To(Equal("na")) + }) +}) + +var _ = Describe("pcmFloatToInt16LE", func() { + It("clamps and encodes little-endian", func() { + out := 
pcmFloatToInt16LE([]float32{0, 1.0, -1.0, 2.0}) + Expect(out).To(HaveLen(8)) + Expect(out[0:2]).To(Equal([]byte{0x00, 0x00})) // 0 + Expect(out[2:4]).To(Equal([]byte{0xff, 0x7f})) // 32767 + Expect(out[6:8]).To(Equal([]byte{0xff, 0x7f})) // clamp 2.0 -> 32767 + }) +}) + +var _ = Describe("end-to-end synthesis", Ordered, func() { + var modelDir string + BeforeAll(func() { + modelDir = os.Getenv("SUPERTONIC_MODEL_PATH") + if modelDir == "" { + Skip("set SUPERTONIC_MODEL_PATH to a supertonic model dir to run") + } + Expect(InitializeONNXRuntime()).To(Succeed()) + }) + + It("synthesizes a wav file", func() { + b := &SupertonicBackend{} + Expect(b.Load(&pb.ModelOptions{ModelFile: modelDir, Options: []string{"supertonic.default_voice=F1"}})).To(Succeed()) + dst := filepath.Join(GinkgoT().TempDir(), "out.wav") + lang := "en" + Expect(b.TTS(&pb.TTSRequest{Text: "Hello from LocalAI.", Dst: dst, Language: &lang})).To(Succeed()) + info, err := os.Stat(dst) + Expect(err).ToNot(HaveOccurred()) + Expect(info.Size()).To(BeNumerically(">", 44)) // header + PCM + }) +}) diff --git a/backend/go/supertonic/helper.go b/backend/go/supertonic/helper.go new file mode 100644 index 000000000000..9f927d5d3f0d --- /dev/null +++ b/backend/go/supertonic/helper.go @@ -0,0 +1,1085 @@ +// Vendored from supertone-inc/supertonic (go/helper.go) at commit +// dff55dc00064c398736080c78195f577527832ae. +// +// Copyright (c) Supertone, Inc. Licensed under the MIT License. +// See https://github.com/supertone-inc/supertonic/blob/main/LICENSE +// +// Local modifications (if any) are marked with "LocalAI:" comments. 
+ +package main + +import ( + "encoding/json" + "fmt" + "math" + "math/rand" + "os" + "path/filepath" + "regexp" + "strings" + "time" + "unicode" + + "github.com/go-audio/audio" + "github.com/go-audio/wav" + ort "github.com/yalue/onnxruntime_go" + "golang.org/x/text/unicode/norm" +) + +// Available languages for multilingual TTS +var AvailableLangs = []string{"en", "ko", "ja", "ar", "bg", "cs", "da", "de", "el", "es", "et", "fi", "fr", "hi", "hr", "hu", "id", "it", "lt", "lv", "nl", "pl", "pt", "ro", "ru", "sk", "sl", "sv", "tr", "uk", "vi", "na"} + +// Config structures +type SpecProcessorConfig struct { + NFFT int `json:"n_fft"` + WinLength int `json:"win_length"` + HopLength int `json:"hop_length"` + NMels int `json:"n_mels"` + Eps float64 `json:"eps"` + NormMean float64 `json:"norm_mean"` + NormStd float64 `json:"norm_std"` +} + +type EncoderConfig struct { + SpecProcessor SpecProcessorConfig `json:"spec_processor"` +} + +type AEConfig struct { + SampleRate int `json:"sample_rate"` + BaseChunkSize int `json:"base_chunk_size"` + Encoder EncoderConfig `json:"encoder"` +} + +type StyleTokenLayerConfig struct { + NStyle int `json:"n_style"` + StyleValueDim int `json:"style_value_dim"` +} + +type StyleEncoderConfig struct { + StyleTokenLayer StyleTokenLayerConfig `json:"style_token_layer"` +} + +type ProjOutConfig struct { + Idim int `json:"idim"` + Odim int `json:"odim"` +} + +type TextEncoderConfig struct { + ProjOut ProjOutConfig `json:"proj_out"` +} + +type TTLConfig struct { + ChunkCompressFactor int `json:"chunk_compress_factor"` + LatentDim int `json:"latent_dim"` + StyleEncoder StyleEncoderConfig `json:"style_encoder"` + TextEncoder TextEncoderConfig `json:"text_encoder"` +} + +type DPStyleEncoderConfig struct { + StyleTokenLayer StyleTokenLayerConfig `json:"style_token_layer"` +} + +type DPConfig struct { + LatentDim int `json:"latent_dim"` + ChunkCompressFactor int `json:"chunk_compress_factor"` + StyleEncoder DPStyleEncoderConfig `json:"style_encoder"` +} 
+ +type Config struct { + AE AEConfig `json:"ae"` + TTL TTLConfig `json:"ttl"` + DP DPConfig `json:"dp"` +} + +// VoiceStyleData holds voice style JSON structure +type VoiceStyleData struct { + StyleTTL struct { + Data [][][]float64 `json:"data"` + Dims []int64 `json:"dims"` + Type string `json:"type"` + } `json:"style_ttl"` + StyleDP struct { + Data [][][]float64 `json:"data"` + Dims []int64 `json:"dims"` + Type string `json:"type"` + } `json:"style_dp"` +} + +// UnicodeProcessor for text processing +type UnicodeProcessor struct { + indexer []int64 +} + +// NewUnicodeProcessor creates a new UnicodeProcessor +func NewUnicodeProcessor(unicodeIndexerPath string) (*UnicodeProcessor, error) { + indexer, err := loadJSONInt64(unicodeIndexerPath) + if err != nil { + return nil, fmt.Errorf("failed to load unicode indexer: %w", err) + } + + return &UnicodeProcessor{indexer: indexer}, nil +} + +// Call processes text list to text IDs and mask +func (up *UnicodeProcessor) Call(textList []string, langList []string) ([][]int64, [][][]float64) { + // Preprocess texts + processedTexts := make([]string, len(textList)) + for i, text := range textList { + processedTexts[i] = preprocessText(text, langList[i]) + } + + // Get text lengths + textLengths := make([]int64, len(processedTexts)) + maxLen := 0 + for i, text := range processedTexts { + textLengths[i] = int64(len([]rune(text))) + if int(textLengths[i]) > maxLen { + maxLen = int(textLengths[i]) + } + } + + // Create text IDs + textIDs := make([][]int64, len(processedTexts)) + for i, text := range processedTexts { + row := make([]int64, maxLen) + runes := []rune(text) + for j, r := range runes { + unicodeVal := int(r) + if unicodeVal < len(up.indexer) { + row[j] = up.indexer[unicodeVal] + } else { + row[j] = -1 + } + } + textIDs[i] = row + } + + // Create text mask + textMask := lengthToMask(textLengths, maxLen) + + return textIDs, textMask +} + +// Text chunking utilities +const maxChunkLength = 300 + +var abbreviations = 
[]string{ + "Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.", + "St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.", + "Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D.", +} + +func chunkText(text string, maxLen int) []string { + if maxLen == 0 { + maxLen = maxChunkLength + } + + text = strings.TrimSpace(text) + if text == "" { + return []string{""} + } + + // Split by paragraphs + paragraphs := regexp.MustCompile(`\n\s*\n`).Split(text, -1) + var chunks []string + + for _, para := range paragraphs { + para = strings.TrimSpace(para) + if para == "" { + continue + } + + if len(para) <= maxLen { + chunks = append(chunks, para) + continue + } + + // Split by sentences + sentences := splitSentences(para) + var current strings.Builder + currentLen := 0 + + for _, sentence := range sentences { + sentence = strings.TrimSpace(sentence) + if sentence == "" { + continue + } + + sentenceLen := len(sentence) + if sentenceLen > maxLen { + // If sentence is longer than maxLen, split by comma or space + if current.Len() > 0 { + chunks = append(chunks, strings.TrimSpace(current.String())) + current.Reset() + currentLen = 0 + } + + // Try splitting by comma + parts := strings.Split(sentence, ",") + for _, part := range parts { + part = strings.TrimSpace(part) + if part == "" { + continue + } + + partLen := len(part) + if partLen > maxLen { + // Split by space as last resort + words := strings.Fields(part) + var wordChunk strings.Builder + wordChunkLen := 0 + + for _, word := range words { + wordLen := len(word) + if wordChunkLen+wordLen+1 > maxLen && wordChunk.Len() > 0 { + chunks = append(chunks, strings.TrimSpace(wordChunk.String())) + wordChunk.Reset() + wordChunkLen = 0 + } + + if wordChunk.Len() > 0 { + wordChunk.WriteString(" ") + wordChunkLen++ + } + wordChunk.WriteString(word) + wordChunkLen += wordLen + } + + if wordChunk.Len() > 0 { + chunks = append(chunks, strings.TrimSpace(wordChunk.String())) + } + } else { + if currentLen+partLen+1 > maxLen && current.Len() > 
0 { + chunks = append(chunks, strings.TrimSpace(current.String())) + current.Reset() + currentLen = 0 + } + + if current.Len() > 0 { + current.WriteString(", ") + currentLen += 2 + } + current.WriteString(part) + currentLen += partLen + } + } + continue + } + + if currentLen+sentenceLen+1 > maxLen && current.Len() > 0 { + chunks = append(chunks, strings.TrimSpace(current.String())) + current.Reset() + currentLen = 0 + } + + if current.Len() > 0 { + current.WriteString(" ") + currentLen++ + } + current.WriteString(sentence) + currentLen += sentenceLen + } + + if current.Len() > 0 { + chunks = append(chunks, strings.TrimSpace(current.String())) + } + } + + if len(chunks) == 0 { + return []string{""} + } + + return chunks +} + +func splitSentences(text string) []string { + // Go's regexp doesn't support lookbehind, so we use a simpler approach + // Split on sentence boundaries and then check if they're abbreviations + re := regexp.MustCompile(`([.!?])\s+`) + + // Find all matches + matches := re.FindAllStringIndex(text, -1) + if len(matches) == 0 { + return []string{text} + } + + var sentences []string + lastEnd := 0 + + for _, match := range matches { + // Get the text before the punctuation + beforePunc := text[lastEnd:match[0]] + + // Check if this ends with an abbreviation + isAbbrev := false + for _, abbrev := range abbreviations { + if strings.HasSuffix(strings.TrimSpace(beforePunc+text[match[0]:match[0]+1]), abbrev) { + isAbbrev = true + break + } + } + + if !isAbbrev { + // This is a real sentence boundary + sentences = append(sentences, text[lastEnd:match[1]]) + lastEnd = match[1] + } + } + + // Add the remaining text + if lastEnd < len(text) { + sentences = append(sentences, text[lastEnd:]) + } + + if len(sentences) == 0 { + return []string{text} + } + + return sentences +} + +// isValidLang checks if a language is in the available languages list +func isValidLang(lang string) bool { + for _, l := range AvailableLangs { + if l == lang { + return true + } + } 
+	return false
+}
+
+// Utility functions
+
+// preprocessText normalizes text for the text encoder and wraps it in
+// <lang>...</lang> tags. It applies NFKD normalization, strips emoji and
+// decorative symbols, maps typographic dashes/quotes/brackets to plain
+// equivalents, expands "@", "e.g.," and "i.e.,", tidies spacing around
+// punctuation, collapses whitespace, and appends a period when the text
+// does not already end in punctuation.
+//
+// It panics if lang is not in AvailableLangs.
+func preprocessText(text string, lang string) string {
+	// TODO: Need advanced normalizer for better performance
+	// Apply NFKD normalization using golang.org/x/text/unicode/norm
+	text = norm.NFKD.String(text)
+
+	// Remove emojis and various Unicode symbols
+	emojiPattern := regexp.MustCompile(`[\x{1F600}-\x{1F64F}\x{1F300}-\x{1F5FF}\x{1F680}-\x{1F6FF}\x{1F700}-\x{1F77F}\x{1F780}-\x{1F7FF}\x{1F800}-\x{1F8FF}\x{1F900}-\x{1F9FF}\x{1FA00}-\x{1FA6F}\x{1FA70}-\x{1FAFF}\x{2600}-\x{26FF}\x{2700}-\x{27BF}\x{1F1E6}-\x{1F1FF}]+`)
+	text = emojiPattern.ReplaceAllString(text, "")
+
+	// Replace various dashes and symbols
+	replacements := map[string]string{
+		"–":      "-",  // en dash
+		"‑":      "-",  // non-breaking hyphen
+		"—":      "-",  // em dash
+		"_":      " ",  // underscore
+		"\u201C": "\"", // left double quote
+		"\u201D": "\"", // right double quote
+		"\u2018": "'",  // left single quote
+		"\u2019": "'",  // right single quote
+		"´":      "'",  // acute accent
+		"`":      "'",  // grave accent
+		"[":      " ",  // left bracket
+		"]":      " ",  // right bracket
+		"|":      " ",  // vertical bar
+		"/":      " ",  // slash
+		"#":      " ",  // hash
+		"→":      " ",  // right arrow
+		"←":      " ",  // left arrow
+	}
+
+	for old, new := range replacements {
+		text = strings.ReplaceAll(text, old, new)
+	}
+
+	// Remove special symbols
+	specialSymbols := []string{"♥", "☆", "♡", "©", "\\"}
+	for _, symbol := range specialSymbols {
+		text = strings.ReplaceAll(text, symbol, "")
+	}
+
+	// Replace known expressions
+	exprReplacements := map[string]string{
+		"@":     " at ",
+		"e.g.,": "for example, ",
+		"i.e.,": "that is, ",
+	}
+
+	for old, new := range exprReplacements {
+		text = strings.ReplaceAll(text, old, new)
+	}
+
+	// Fix spacing around punctuation
+	text = regexp.MustCompile(` ,`).ReplaceAllString(text, ",")
+	text = regexp.MustCompile(` \.`).ReplaceAllString(text, ".")
+	text = regexp.MustCompile(` !`).ReplaceAllString(text, "!")
+	text = regexp.MustCompile(` \?`).ReplaceAllString(text, "?")
+	text = regexp.MustCompile(` ;`).ReplaceAllString(text, ";")
+	text = regexp.MustCompile(` :`).ReplaceAllString(text, ":")
+	text = regexp.MustCompile(` '`).ReplaceAllString(text, "'")
+
+	// Remove duplicate quotes
+	for strings.Contains(text, `""`) {
+		text = strings.ReplaceAll(text, `""`, `"`)
+	}
+	for strings.Contains(text, "''") {
+		text = strings.ReplaceAll(text, "''", "'")
+	}
+	for strings.Contains(text, "``") {
+		text = strings.ReplaceAll(text, "``", "`")
+	}
+
+	// Remove extra spaces
+	text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ")
+	text = strings.TrimSpace(text)
+
+	// If text doesn't end with punctuation, quotes, or closing brackets, add a period
+	if text != "" {
+		endsWithPunct := regexp.MustCompile(`[.!?;:,'"\x{201C}\x{201D}\x{2018}\x{2019})\]}…。」』】〉》›»]$`)
+		if !endsWithPunct.MatchString(text) {
+			text += "."
+		}
+	}
+
+	// Validate language
+	if !isValidLang(lang) {
+		panic(fmt.Sprintf("Invalid language: %s. Available: %v", lang, AvailableLangs))
+	}
+
+	// Wrap text with language tags: <lang>text</lang>.
+	// LocalAI: the format string needs a verb for the closing tag; with only
+	// two verbs the third argument was rendered as "%!(EXTRA string=...)".
+	text = fmt.Sprintf("<%s>%s</%s>", lang, text, lang)
+
+	return text
+}
+
+// lengthToMask builds a [len(lengths)][1][maxLen] mask whose entry j in row i
+// is 1.0 when j < lengths[i] and 0.0 otherwise.
+func lengthToMask(lengths []int64, maxLen int) [][][]float64 {
+	bsz := len(lengths)
+	mask := make([][][]float64, bsz)
+
+	for i := 0; i < bsz; i++ {
+		row := make([]float64, maxLen)
+		for j := 0; j < maxLen; j++ {
+			if int64(j) < lengths[i] {
+				row[j] = 1.0
+			} else {
+				row[j] = 0.0
+			}
+		}
+		mask[i] = [][]float64{row}
+	}
+
+	return mask
+}
+
+// getTextMask is an alias for lengthToMask applied to text lengths.
+func getTextMask(textLengths []int64, maxLen int) [][][]float64 {
+	return lengthToMask(textLengths, maxLen)
+}
+
+func getLatentMask(wavLengths []int64, cfg Config) [][][]float64 {
+	baseChunkSize := int64(cfg.AE.BaseChunkSize)
+	chunkCompressFactor := int64(cfg.TTL.ChunkCompressFactor)
+	latentSize := baseChunkSize * chunkCompressFactor
+
+	latentLengths := make([]int64, len(wavLengths))
+	maxLen := int64(0)
+	for i, wavLen := range wavLengths {
+		latentLengths[i] = (wavLen + latentSize - 1) / latentSize
+		if latentLengths[i] > maxLen {
+ maxLen = latentLengths[i] + } + } + + return lengthToMask(latentLengths, int(maxLen)) +} + +func writeWavFile(filename string, audioData []float64, sampleRate int) error { + file, err := os.Create(filename) + if err != nil { + return err + } + defer file.Close() + + // Convert float64 to int + intData := make([]int, len(audioData)) + for i, sample := range audioData { + // Clamp to [-1, 1] and convert to 16-bit int + clamped := math.Max(-1.0, math.Min(1.0, sample)) + intData[i] = int(clamped * 32767) + } + + encoder := wav.NewEncoder(file, sampleRate, 16, 1, 1) + buf := &audio.IntBuffer{ + Data: intData, + Format: &audio.Format{SampleRate: sampleRate, NumChannels: 1}, + SourceBitDepth: 16, + } + + if err := encoder.Write(buf); err != nil { + return err + } + + return encoder.Close() +} + +// Style holds style tensors +type Style struct { + TtlTensor *ort.Tensor[float32] + DpTensor *ort.Tensor[float32] +} + +func (s *Style) Destroy() { + if s.TtlTensor != nil { + s.TtlTensor.Destroy() + } + if s.DpTensor != nil { + s.DpTensor.Destroy() + } +} + +// LoadVoiceStyle loads voice style from JSON files +func LoadVoiceStyle(voiceStylePaths []string, verbose bool) (*Style, error) { + bsz := len(voiceStylePaths) + + // Read first file to get dimensions + firstData, err := os.ReadFile(voiceStylePaths[0]) + if err != nil { + return nil, fmt.Errorf("failed to read voice style file: %w", err) + } + + var firstStyle VoiceStyleData + if err := json.Unmarshal(firstData, &firstStyle); err != nil { + return nil, fmt.Errorf("failed to parse voice style JSON: %w", err) + } + + ttlDims := firstStyle.StyleTTL.Dims + dpDims := firstStyle.StyleDP.Dims + + ttlDim1 := ttlDims[1] + ttlDim2 := ttlDims[2] + dpDim1 := dpDims[1] + dpDim2 := dpDims[2] + + // Pre-allocate arrays with full batch size + ttlSize := int(int64(bsz) * ttlDim1 * ttlDim2) + dpSize := int(int64(bsz) * dpDim1 * dpDim2) + ttlFlat := make([]float32, ttlSize) + dpFlat := make([]float32, dpSize) + + // Fill in the data + for i 
:= 0; i < bsz; i++ { + data, err := os.ReadFile(voiceStylePaths[i]) + if err != nil { + return nil, fmt.Errorf("failed to read voice style file: %w", err) + } + + var voiceStyle VoiceStyleData + if err := json.Unmarshal(data, &voiceStyle); err != nil { + return nil, fmt.Errorf("failed to parse voice style JSON: %w", err) + } + + // Flatten TTL data + ttlOffset := int(int64(i) * ttlDim1 * ttlDim2) + idx := 0 + for _, batch := range voiceStyle.StyleTTL.Data { + for _, row := range batch { + for _, val := range row { + ttlFlat[ttlOffset+idx] = float32(val) + idx++ + } + } + } + + // Flatten DP data + dpOffset := int(int64(i) * dpDim1 * dpDim2) + idx = 0 + for _, batch := range voiceStyle.StyleDP.Data { + for _, row := range batch { + for _, val := range row { + dpFlat[dpOffset+idx] = float32(val) + idx++ + } + } + } + } + + ttlShape := []int64{int64(bsz), ttlDim1, ttlDim2} + dpShape := []int64{int64(bsz), dpDim1, dpDim2} + + ttlTensor, err := ort.NewTensor(ttlShape, ttlFlat) + if err != nil { + return nil, fmt.Errorf("failed to create TTL tensor: %w", err) + } + + dpTensor, err := ort.NewTensor(dpShape, dpFlat) + if err != nil { + ttlTensor.Destroy() + return nil, fmt.Errorf("failed to create DP tensor: %w", err) + } + + if verbose { + fmt.Printf("Loaded %d voice styles\n\n", bsz) + } + + return &Style{ + TtlTensor: ttlTensor, + DpTensor: dpTensor, + }, nil +} + +// TextToSpeech generates speech from text +type TextToSpeech struct { + cfg Config + textProcessor *UnicodeProcessor + dpOrt *ort.DynamicAdvancedSession + textEncOrt *ort.DynamicAdvancedSession + vectorEstOrt *ort.DynamicAdvancedSession + vocoderOrt *ort.DynamicAdvancedSession + SampleRate int + baseChunkSize int + chunkCompress int + ldim int +} + +func (tts *TextToSpeech) sampleNoisyLatent(durOnnx []float32) ([][][]float64, [][][]float64) { + bsz := len(durOnnx) + maxDur := float64(0) + for _, d := range durOnnx { + if float64(d) > maxDur { + maxDur = float64(d) + } + } + + wavLenMax := maxDur * 
float64(tts.SampleRate) + wavLengths := make([]int64, bsz) + for i, d := range durOnnx { + wavLengths[i] = int64(float64(d) * float64(tts.SampleRate)) + } + + chunkSize := tts.baseChunkSize * tts.chunkCompress + latentLen := int((wavLenMax + float64(chunkSize) - 1) / float64(chunkSize)) + latentDim := tts.ldim * tts.chunkCompress + + rng := rand.New(rand.NewSource(time.Now().UnixNano())) + noisyLatent := make([][][]float64, bsz) + for b := 0; b < bsz; b++ { + batch := make([][]float64, latentDim) + for d := 0; d < latentDim; d++ { + row := make([]float64, latentLen) + for t := 0; t < latentLen; t++ { + // Box-Muller transform for normal distribution + // Add epsilon to avoid log(0) + const eps = 1e-10 + u1 := math.Max(eps, rng.Float64()) + u2 := rng.Float64() + row[t] = math.Sqrt(-2.0*math.Log(u1)) * math.Cos(2.0*math.Pi*u2) + } + batch[d] = row + } + noisyLatent[b] = batch + } + + latentMask := getLatentMask(wavLengths, tts.cfg) + + // Apply mask + for b := 0; b < bsz; b++ { + for d := 0; d < latentDim; d++ { + for t := 0; t < latentLen; t++ { + noisyLatent[b][d][t] *= latentMask[b][0][t] + } + } + } + + return noisyLatent, latentMask +} + +func (tts *TextToSpeech) _infer(textList []string, langList []string, style *Style, totalStep int, speed float32) ([]float32, []float32, error) { + bsz := len(textList) + + // Process text + textIDs, textMask := tts.textProcessor.Call(textList, langList) + textIDsShape := []int64{int64(bsz), int64(len(textIDs[0]))} + textMaskShape := []int64{int64(bsz), 1, int64(len(textMask[0][0]))} + + textIDsTensor := IntArrayToTensor(textIDs, textIDsShape) + defer textIDsTensor.Destroy() + textMaskTensor := ArrayToTensor(textMask, textMaskShape) + defer textMaskTensor.Destroy() + + // Predict duration + dpOutputs := []ort.Value{nil} + err := tts.dpOrt.Run( + []ort.Value{textIDsTensor, style.DpTensor, textMaskTensor}, + dpOutputs, + ) + if err != nil { + return nil, nil, fmt.Errorf("failed to run duration predictor: %w", err) + } + durTensor 
:= dpOutputs[0].(*ort.Tensor[float32]) + defer durTensor.Destroy() + durOnnx := durTensor.GetData() + + // Apply speed factor to duration + for i := range durOnnx { + durOnnx[i] /= speed + } + + // Encode text + textIDsTensor2 := IntArrayToTensor(textIDs, textIDsShape) + defer textIDsTensor2.Destroy() + textEncOutputs := []ort.Value{nil} + err = tts.textEncOrt.Run( + []ort.Value{textIDsTensor2, style.TtlTensor, textMaskTensor}, + textEncOutputs, + ) + if err != nil { + return nil, nil, fmt.Errorf("failed to run text encoder: %w", err) + } + textEmbTensor := textEncOutputs[0].(*ort.Tensor[float32]) + defer textEmbTensor.Destroy() + + // Sample noisy latent + xt, latentMask := tts.sampleNoisyLatent(durOnnx) + latentShape := []int64{int64(bsz), int64(len(xt[0])), int64(len(xt[0][0]))} + latentMaskShape := []int64{int64(bsz), 1, int64(len(latentMask[0][0]))} + + // Prepare constant arrays + totalStepArray := make([]float32, bsz) + for b := 0; b < bsz; b++ { + totalStepArray[b] = float32(totalStep) + } + scalarShape := []int64{int64(bsz)} + + totalStepTensor, _ := ort.NewTensor(scalarShape, totalStepArray) + defer totalStepTensor.Destroy() + + // Denoising loop + for step := 0; step < totalStep; step++ { + currentStepArray := make([]float32, bsz) + for b := 0; b < bsz; b++ { + currentStepArray[b] = float32(step) + } + + currentStepTensor, _ := ort.NewTensor(scalarShape, currentStepArray) + noisyLatentTensor := ArrayToTensor(xt, latentShape) + latentMaskTensor := ArrayToTensor(latentMask, latentMaskShape) + textMaskTensor2 := ArrayToTensor(textMask, textMaskShape) + + vectorEstOutputs := []ort.Value{nil} + err = tts.vectorEstOrt.Run( + []ort.Value{noisyLatentTensor, textEmbTensor, style.TtlTensor, latentMaskTensor, textMaskTensor2, + currentStepTensor, totalStepTensor}, + vectorEstOutputs, + ) + if err != nil { + return nil, nil, fmt.Errorf("failed to run vector estimator: %w", err) + } + + denoisedTensor := vectorEstOutputs[0].(*ort.Tensor[float32]) + denoisedData := 
denoisedTensor.GetData() + + // Update latent + idx := 0 + for b := 0; b < bsz; b++ { + for d := 0; d < len(xt[b]); d++ { + for t := 0; t < len(xt[b][d]); t++ { + xt[b][d][t] = float64(denoisedData[idx]) + idx++ + } + } + } + + noisyLatentTensor.Destroy() + latentMaskTensor.Destroy() + textMaskTensor2.Destroy() + currentStepTensor.Destroy() + denoisedTensor.Destroy() + } + + // Generate waveform + finalLatentTensor := ArrayToTensor(xt, latentShape) + defer finalLatentTensor.Destroy() + + vocoderOutputs := []ort.Value{nil} + err = tts.vocoderOrt.Run( + []ort.Value{finalLatentTensor}, + vocoderOutputs, + ) + if err != nil { + return nil, nil, fmt.Errorf("failed to run vocoder: %w", err) + } + + wavBatchTensor := vocoderOutputs[0].(*ort.Tensor[float32]) + defer wavBatchTensor.Destroy() + wav := wavBatchTensor.GetData() + + return wav, durOnnx, nil +} + +// Call synthesizes speech from a single text with automatic chunking +func (tts *TextToSpeech) Call(text string, lang string, style *Style, totalStep int, speed float32, silenceDuration float32) ([]float32, float32, error) { + maxLen := 300 + if lang == "ko" || lang == "ja" { + maxLen = 120 + } + chunks := chunkText(text, maxLen) + + var wavCat []float32 + var durCat float32 + + for i, chunk := range chunks { + wav, duration, err := tts._infer([]string{chunk}, []string{lang}, style, totalStep, speed) + if err != nil { + return nil, 0, err + } + + dur := duration[0] + wavLen := int(float32(tts.SampleRate) * dur) + wavChunk := wav[:wavLen] + + if i == 0 { + wavCat = wavChunk + durCat = dur + } else { + silenceLen := int(silenceDuration * float32(tts.SampleRate)) + silence := make([]float32, silenceLen) + + wavCat = append(wavCat, silence...) + wavCat = append(wavCat, wavChunk...) 
+ durCat += silenceDuration + dur + } + } + + return wavCat, durCat, nil +} + +// Batch synthesizes speech from multiple texts +func (tts *TextToSpeech) Batch(textList []string, langList []string, style *Style, totalStep int, speed float32) ([]float32, []float32, error) { + return tts._infer(textList, langList, style, totalStep, speed) +} + +func (tts *TextToSpeech) Destroy() { + if tts.dpOrt != nil { + tts.dpOrt.Destroy() + } + if tts.textEncOrt != nil { + tts.textEncOrt.Destroy() + } + if tts.vectorEstOrt != nil { + tts.vectorEstOrt.Destroy() + } + if tts.vocoderOrt != nil { + tts.vocoderOrt.Destroy() + } +} + +// LoadTextToSpeech loads TTS components +func LoadTextToSpeech(onnxDir string, useGPU bool, cfg Config) (*TextToSpeech, error) { + if useGPU { + return nil, fmt.Errorf("GPU mode is not supported yet") + } + fmt.Println("Using CPU for inference") // LocalAI: drop redundant newline (vet) + + // Load models + dpPath := filepath.Join(onnxDir, "duration_predictor.onnx") + textEncPath := filepath.Join(onnxDir, "text_encoder.onnx") + vectorEstPath := filepath.Join(onnxDir, "vector_estimator.onnx") + vocoderPath := filepath.Join(onnxDir, "vocoder.onnx") + + dpOrt, err := ort.NewDynamicAdvancedSession(dpPath, []string{"text_ids", "style_dp", "text_mask"}, + []string{"duration"}, nil) + if err != nil { + return nil, fmt.Errorf("failed to load duration predictor: %w", err) + } + + textEncOrt, err := ort.NewDynamicAdvancedSession(textEncPath, []string{"text_ids", "style_ttl", "text_mask"}, + []string{"text_emb"}, nil) + if err != nil { + return nil, fmt.Errorf("failed to load text encoder: %w", err) + } + + vectorEstOrt, err := ort.NewDynamicAdvancedSession(vectorEstPath, + []string{"noisy_latent", "text_emb", "style_ttl", "latent_mask", "text_mask", "current_step", "total_step"}, + []string{"denoised_latent"}, nil) + if err != nil { + return nil, fmt.Errorf("failed to load vector estimator: %w", err) + } + + vocoderOrt, err := 
ort.NewDynamicAdvancedSession(vocoderPath, []string{"latent"}, + []string{"wav_tts"}, nil) + if err != nil { + return nil, fmt.Errorf("failed to load vocoder: %w", err) + } + + // Load text processor + unicodeIndexerPath := filepath.Join(onnxDir, "unicode_indexer.json") + textProcessor, err := NewUnicodeProcessor(unicodeIndexerPath) + if err != nil { + return nil, err + } + + textToSpeech := &TextToSpeech{ + cfg: cfg, + textProcessor: textProcessor, + dpOrt: dpOrt, + textEncOrt: textEncOrt, + vectorEstOrt: vectorEstOrt, + vocoderOrt: vocoderOrt, + SampleRate: cfg.AE.SampleRate, + baseChunkSize: cfg.AE.BaseChunkSize, + chunkCompress: cfg.TTL.ChunkCompressFactor, + ldim: cfg.TTL.LatentDim, + } + + return textToSpeech, nil +} + +// InitializeONNXRuntime initializes ONNX Runtime environment +func InitializeONNXRuntime() error { + libPath := os.Getenv("ONNXRUNTIME_LIB_PATH") + if libPath == "" { + candidates := []string{ + "/opt/homebrew/opt/onnxruntime/lib/libonnxruntime.dylib", + "/usr/local/opt/onnxruntime/lib/libonnxruntime.dylib", + "/opt/homebrew/lib/libonnxruntime.dylib", + "/usr/local/lib/libonnxruntime.dylib", + "/usr/local/lib/libonnxruntime.so", + "/usr/lib/libonnxruntime.so", + } + for _, candidate := range candidates { + if _, err := os.Stat(candidate); err == nil { + libPath = candidate + break + } + } + if libPath == "" { + libPath = "/usr/local/lib/libonnxruntime.so" + } + } + ort.SetSharedLibraryPath(libPath) + + if err := ort.InitializeEnvironment(); err != nil { + return fmt.Errorf("failed to initialize ONNX Runtime: %w\nHint: install ONNX Runtime (macOS: brew install onnxruntime) or set ONNXRUNTIME_LIB_PATH", err) + } + return nil +} + +// sanitizeFilename creates a safe filename from text (supports Unicode) +func sanitizeFilename(text string, maxLen int) string { + runes := []rune(text) + if len(runes) > maxLen { + runes = runes[:maxLen] + } + + result := make([]rune, 0, len(runes)) + for _, r := range runes { + // unicode.IsLetter matches any 
// extractWavSegment returns the audio of one batch item from a flattened
// batch waveform.
//
// wav is treated as batchSize equally sized slots of len(wav)/batchSize
// samples each. The result always has int(sampleRate*duration) samples
// (clamped to zero when that is negative): the leading samples are copied from
// slot number index, and anything beyond the end of the slot is left at zero.
//
// Parameters:
//   - wav: flattened samples for the whole batch.
//   - duration: length of the wanted segment in seconds.
//   - sampleRate: samples per second.
//   - index: zero-based batch item to extract.
//   - batchSize: number of items packed into wav.
//
// A non-positive batchSize or an index outside [0, batchSize) yields an
// all-zero result instead of a panic, and a segment longer than its slot no
// longer picks up samples belonging to the next batch item.
func extractWavSegment(wav []float32, duration float32, sampleRate int, index int, batchSize int) []float64 {
	wavLen := int(float64(sampleRate) * float64(duration))
	if wavLen < 0 {
		wavLen = 0
	}
	wavOut := make([]float64, wavLen)

	if batchSize <= 0 || index < 0 || index >= batchSize {
		return wavOut
	}

	wavPerBatch := len(wav) / batchSize
	wavStart := index * wavPerBatch

	// Copy no more than the slot holds; the remainder stays silent.
	copyLen := wavLen
	if copyLen > wavPerBatch {
		copyLen = wavPerBatch
	}
	for j, sample := range wav[wavStart : wavStart+copyLen] {
		wavOut[j] = float64(sample)
	}

	return wavOut
}
0; d < len(array[b]); d++ { + for t := 0; t < len(array[b][d]); t++ { + flat[idx] = float32(array[b][d][t]) + idx++ + } + } + } + + tensor, err := ort.NewTensor(shape, flat) + if err != nil { + panic(err) + } + + return tensor +} + +func IntArrayToTensor(array [][]int64, shape []int64) *ort.Tensor[int64] { + // Flatten array + totalSize := int64(1) + for _, dim := range shape { + totalSize *= dim + } + + flat := make([]int64, totalSize) + idx := 0 + for b := 0; b < len(array); b++ { + for t := 0; t < len(array[b]); t++ { + flat[idx] = array[b][t] + idx++ + } + } + + tensor, err := ort.NewTensor(shape, flat) + if err != nil { + panic(err) + } + + return tensor +} diff --git a/backend/go/supertonic/main.go b/backend/go/supertonic/main.go new file mode 100644 index 000000000000..49e9ea3a5b26 --- /dev/null +++ b/backend/go/supertonic/main.go @@ -0,0 +1,27 @@ +package main + +// Started internally by LocalAI; a server is allocated per model. + +import ( + "flag" + + grpc "github.com/mudler/LocalAI/pkg/grpc" + ort "github.com/yalue/onnxruntime_go" +) + +var addr = flag.String("addr", "localhost:50051", "the address to connect to") + +func main() { + flag.Parse() + + // InitializeONNXRuntime reads ONNXRUNTIME_LIB_PATH (set by run.sh) and + // dlopens libonnxruntime before any session is created in Load(). + if err := InitializeONNXRuntime(); err != nil { + panic(err) + } + defer func() { _ = ort.DestroyEnvironment() }() + + if err := grpc.StartServer(*addr, &SupertonicBackend{}); err != nil { + panic(err) + } +} diff --git a/backend/go/supertonic/main_suite_test.go b/backend/go/supertonic/main_suite_test.go new file mode 100644 index 000000000000..90bcb61f54b1 --- /dev/null +++ b/backend/go/supertonic/main_suite_test.go @@ -0,0 +1,13 @@ +package main + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestSupertonic(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Supertonic backend test suite") +} diff --git a/backend/go/supertonic/package.sh b/backend/go/supertonic/package.sh new file mode 100755 index 000000000000..9e2a016256a9 --- /dev/null +++ b/backend/go/supertonic/package.sh @@ -0,0 +1,49 @@ +#!/bin/bash +set -e + +CURDIR=$(dirname "$(realpath $0)") +REPO_ROOT="${CURDIR}/../../.." + +mkdir -p $CURDIR/package/lib + +cp -avf $CURDIR/supertonic $CURDIR/package/ +cp -avf $CURDIR/run.sh $CURDIR/package/ +cp -rfLv $CURDIR/backend-assets/lib/* $CURDIR/package/lib/ + +if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then + echo "Detected x86_64 architecture, copying x86_64 libraries..." + cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so + cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 + cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 +elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then + echo "Detected ARM64 architecture, copying ARM64 libraries..." 
+ cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so + cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 + cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 +else + echo "Error: Could not detect architecture" + exit 1 +fi + +GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh" +if [ -f "$GPU_LIB_SCRIPT" ]; then + echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..." + source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib" + package_gpu_libs +fi + +echo "Packaging completed successfully" +ls -liah $CURDIR/package/ +ls -liah $CURDIR/package/lib/ diff --git a/backend/go/supertonic/run.sh b/backend/go/supertonic/run.sh new file mode 100755 index 000000000000..2dabf7eb3337 --- /dev/null +++ b/backend/go/supertonic/run.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -ex + +CURDIR=$(dirname "$(realpath $0)") + +export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH +export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so + +if [ -f $CURDIR/lib/ld.so ]; then + echo "Using lib/ld.so" + exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@" +fi + +exec $CURDIR/supertonic "$@" diff --git a/backend/index.yaml b/backend/index.yaml index 19483ab03900..919254cc473b 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -1368,6 +1368,20 @@ nvidia: "cuda12-sherpa-onnx" nvidia-cuda-12: "cuda12-sherpa-onnx" metal: "metal-sherpa-onnx" +- &supertonic + name: "supertonic" + alias: "supertonic" + urls: + - 
https://github.com/supertone-inc/supertonic + description: | + Supertonic backend: lightning-fast, on-device multilingual text-to-speech via ONNX Runtime. + Runs Supertone's flow-matching TTS model (Supertone/supertonic-3), 44.1kHz output, 31 languages, + multiple preset voice styles. No espeak-ng dependency. + tags: + - text-to-speech + - TTS + capabilities: + default: "cpu-supertonic" - !!merge <<: *neutts name: "neutts-development" capabilities: @@ -5132,3 +5146,18 @@ uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-sherpa-onnx" mirrors: - localai/localai-backends:master-metal-darwin-arm64-sherpa-onnx +## supertonic +- !!merge <<: *supertonic + name: "supertonic-development" + capabilities: + default: "cpu-supertonic-development" +- !!merge <<: *supertonic + name: "cpu-supertonic" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-supertonic" + mirrors: + - localai/localai-backends:latest-cpu-supertonic +- !!merge <<: *supertonic + name: "cpu-supertonic-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-supertonic" + mirrors: + - localai/localai-backends:master-cpu-supertonic diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go index c03d52ee4f4b..900fa2de12b7 100644 --- a/core/config/meta/registry.go +++ b/core/config/meta/registry.go @@ -434,6 +434,13 @@ func DefaultRegistry() map[string]FieldMetaOverride { Component: "json-editor", Order: 78, }, + "pipeline.max_history_items": { + Section: "pipeline", + Label: "Max History Items", + Description: "Cap how many trailing conversation items are fed to the LLM each realtime turn (0 = unlimited, rely on the LLM's context window). Set it on a composed pipeline (VAD+STT+LLM+TTS) so a long-running session doesn't grow until the context fills. 
Unset uses the per-model-type default.", + Component: "number", + Order: 79, + }, // --- Functions --- "function.grammar.parallel_calls": { diff --git a/core/http/endpoints/localai/backend.go b/core/http/endpoints/localai/backend.go index 331e49e43673..4b86095e23dd 100644 --- a/core/http/endpoints/localai/backend.go +++ b/core/http/endpoints/localai/backend.go @@ -38,6 +38,7 @@ var knownPrefOnlyBackends = []schema.KnownBackend{ {Name: "qwen3-tts-cpp", Modality: "tts", AutoDetect: false, Description: "Qwen3 TTS C++ (preference-only)"}, {Name: "omnivoice-cpp", Modality: "tts", AutoDetect: false, Description: "OmniVoice C++ TTS with voice cloning and voice design (preference-only)"}, {Name: "faster-qwen3-tts", Modality: "tts", AutoDetect: false, Description: "Faster Qwen3 TTS (preference-only)"}, + {Name: "supertonic", Modality: "tts", AutoDetect: false, Description: "Supertonic multilingual ONNX TTS (preference-only)"}, // Detection {Name: "sam3-cpp", Modality: "detection", AutoDetect: false, Description: "SAM3 C++ object detection (preference-only)"}, // Audio transform (audio-in / audio-out, optional reference signal) diff --git a/core/http/endpoints/localai/backend_test.go b/core/http/endpoints/localai/backend_test.go index 0c21bb7b4f6a..2f82450bd76f 100644 --- a/core/http/endpoints/localai/backend_test.go +++ b/core/http/endpoints/localai/backend_test.go @@ -145,6 +145,7 @@ var _ = Describe("Backend Endpoints", func() { expectPrefOnly("qwen-tts", "tts") expectPrefOnly("qwen3-tts-cpp", "tts") expectPrefOnly("faster-qwen3-tts", "tts") + expectPrefOnly("supertonic", "tts") expectPrefOnly("sam3-cpp", "detection") }) diff --git a/gallery/index.yaml b/gallery/index.yaml index 446ed0269def..06a37c0cce5a 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3510,6 +3510,78 @@ - filename: kokoro-int8-multi-lang-v1_0.tar.bz2 sha256: 75654a84864be26f345f020f4070c2c019e96dd1b7f9bf6e2ffd59efac6aa5a3 uri: 
https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-int8-multi-lang-v1_0.tar.bz2 +- name: supertonic-3 + url: github:mudler/LocalAI/gallery/supertonic.yaml@master + urls: + - https://github.com/supertone-inc/supertonic + - https://huggingface.co/Supertone/supertonic-3 + description: | + Supertonic multilingual text-to-speech (Supertone/supertonic-3), served through the native supertonic backend via ONNX Runtime. Lightning-fast on-device flow-matching TTS with 44.1 kHz output, 31 languages, and 10 preset voice styles (F1-F5, M1-M5). No espeak-ng dependency. Defaults to voice F1; override per request with the OpenAI `voice` field, and optionally pass `language=` (e.g. en, ko, ja, it; "na" for language-agnostic). + license: mit + icon: https://huggingface.co/Supertone/supertonic-3/resolve/main/img/Supertonic3_HeroImage.png + tags: + - text-to-speech + - tts + - multilingual + - onnx + - supertonic + - flow-matching + - multi-speaker + last_checked: "2026-06-15" + overrides: + known_usecases: + - tts + parameters: + model: supertonic-3/onnx/tts.json + files: + - filename: supertonic-3/onnx/duration_predictor.onnx + sha256: c3eb91414d5ff8a7a239b7fe9e34e7e2bf8a8140d8375ffb14718b1c639325db + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/duration_predictor.onnx + - filename: supertonic-3/onnx/text_encoder.onnx + sha256: c7befd5ea8c3119769e8a6c1486c4edc6a3bc8365c67621c881bbb774b9902ff + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/text_encoder.onnx + - filename: supertonic-3/onnx/vector_estimator.onnx + sha256: 883ac868ea0275ef0e991524dc64f16b3c0376efd7c320af6b53f5b780d7c61c + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/vector_estimator.onnx + - filename: supertonic-3/onnx/vocoder.onnx + sha256: 085de76dd8e8d5836d6ca66826601f615939218f90e519f70ee8a36ed2a4c4ba + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/vocoder.onnx + - filename: supertonic-3/onnx/tts.json 
+ sha256: 42078d3aef1cd43ab43021f3c54f47d2d75ceb4e75f627f118890128b06a0d09 + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/tts.json + - filename: supertonic-3/onnx/unicode_indexer.json + sha256: 9bf7346e43883a81f8645c81224f786d43c5b57f3641f6e7671a7d6c493cb24f + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/onnx/unicode_indexer.json + - filename: supertonic-3/voice_styles/F1.json + sha256: bbdec6ee00231c2c742ad05483df5334cab3b52fda3ba38e6a07059c4563dbc2 + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F1.json + - filename: supertonic-3/voice_styles/F2.json + sha256: 7c722c6a72707b1a77f035d67f0d1351ba187738e06f7683e8c72b1df3477fc6 + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F2.json + - filename: supertonic-3/voice_styles/F3.json + sha256: 12f6ef2573baa2defa1128069cb59f203e3ab67c92af77b42df8a0e3a2f7c6ab + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F3.json + - filename: supertonic-3/voice_styles/F4.json + sha256: c2fa764c1225a76dfc3e2c73e8aa4f70d9ee48793860eb34c295fff01c2e032b + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F4.json + - filename: supertonic-3/voice_styles/F5.json + sha256: 45966e73316415626cf41a7d1c6f3b4c70dbc1ba2bee5c1978ef0ce33244fc8d + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/F5.json + - filename: supertonic-3/voice_styles/M1.json + sha256: e35604687f5d23694b8e91593a93eec0e4eca6c0b02bb8ed69139ab2ea6b0a5b + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M1.json + - filename: supertonic-3/voice_styles/M2.json + sha256: b76cbf62bac707c710cf0ae5aba5e31eea1a6339a9734bfae33ab98499534a50 + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M2.json + - filename: supertonic-3/voice_styles/M3.json + sha256: ea1ac35ccb91b0d7ecad533a2fbd0eec10c91513d8951e3b25fbba99954e159b + uri: 
https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M3.json + - filename: supertonic-3/voice_styles/M4.json + sha256: ca8eefad4fcd989c9379032ff3e50738adc547eeb5e221b82593a6d7b3bac303 + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M4.json + - filename: supertonic-3/voice_styles/M5.json + sha256: dd22b92740314321f8ae11c5e87f8dd60d060f15dd3a632b5adf77f471f77af2 + uri: https://huggingface.co/Supertone/supertonic-3/resolve/main/voice_styles/M5.json - name: voxcpm-1.5 url: github:mudler/LocalAI/gallery/virtual.yaml@master urls: diff --git a/gallery/supertonic.yaml b/gallery/supertonic.yaml new file mode 100644 index 000000000000..d51a0c88206f --- /dev/null +++ b/gallery/supertonic.yaml @@ -0,0 +1,19 @@ +--- +name: "supertonic" + +config_file: | + backend: supertonic + options: + # Generation knobs read by the supertonic backend at TTS time. + # steps = flow-matching denoising steps (quality); speed = rate; + # silence = inter-chunk silence seconds for long inputs. + - supertonic.steps=8 + - supertonic.speed=1.05 + - supertonic.silence=0.3 + # Voice style used when a request omits `voice`. The model ships + # F1-F5 / M1-M5 under voice_styles/; override per request via the + # OpenAI `voice` field. + - supertonic.default_voice=F1 + # Default language tag when a request omits `language`. "na" is the + # model's language-agnostic mode. 
+ - supertonic.default_lang=na diff --git a/go.mod b/go.mod index e0d0c01af756..2735de2653d5 100644 --- a/go.mod +++ b/go.mod @@ -65,6 +65,7 @@ require ( github.com/testcontainers/testcontainers-go/modules/nats v0.42.0 github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0 github.com/timbutler/zxcvbn v1.0.4 + github.com/yalue/onnxruntime_go v1.11.0 go.opentelemetry.io/otel v1.44.0 go.opentelemetry.io/otel/exporters/prometheus v0.66.0 go.opentelemetry.io/otel/metric v1.44.0 @@ -497,7 +498,7 @@ require ( golang.org/x/sync v0.20.0 golang.org/x/sys v0.45.0 // indirect golang.org/x/term v0.43.0 - golang.org/x/text v0.37.0 // indirect + golang.org/x/text v0.37.0 golang.org/x/tools v0.45.0 // indirect golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb // indirect diff --git a/go.sum b/go.sum index 6a0bb516bcbe..45feafef58a6 100644 --- a/go.sum +++ b/go.sum @@ -1377,6 +1377,8 @@ github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavM github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM= github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= +github.com/yalue/onnxruntime_go v1.11.0 h1:aKH4yPIbqfcB3SfnQWq/WxzLelkyolntHnffL3eMBHY= +github.com/yalue/onnxruntime_go v1.11.0/go.mod h1:b4X26A8pekNb1ACJ58wAXgNKeUCGEAQ9dmACut9Sm/4= github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4= github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=