diff --git a/CMakeLists.txt b/CMakeLists.txt
index eb89465..45afbc5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,10 +1,6 @@
 cmake_minimum_required(VERSION 3.14)
 project(s2cpp LANGUAGES C CXX)
 
-if (WIN32)
-    add_definitions(-DWIN32_LEAN_AND_MEAN)
-endif()
-
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -15,6 +11,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 option(S2_VULKAN "Build with Vulkan backend" OFF)
 option(S2_CUDA "Build with CUDA backend" OFF)
 option(S2_METAL "Build with Metal backend" OFF)
+option(S2_BUILD_SHARED_LIBRARIES "Also build S2 as a shared library (s2_dll) and a static library (s2_lib)." OFF)
 
 # ---------------------------------------------------------------------------
 # GGML (git submodule — not modified)
@@ -38,6 +35,7 @@ add_subdirectory(ggml)
 # s2 executable
 # ---------------------------------------------------------------------------
 set(S2_SOURCES
+    src/s2_export_api.cpp
     src/s2_audio.cpp
     src/s2_tokenizer.cpp
     src/s2_sampler.cpp
@@ -46,36 +44,89 @@ set(S2_SOURCES
     src/s2_prompt.cpp
     src/s2_generate.cpp
     src/s2_pipeline.cpp
-    src/s2_server.cpp
     src/main.cpp
 )
 
+if(S2_BUILD_SHARED_LIBRARIES)
 add_executable(s2 ${S2_SOURCES})
+add_library(s2_dll SHARED ${S2_SOURCES})
+add_library(s2_lib STATIC ${S2_SOURCES})
 
 target_include_directories(s2 PRIVATE
     ${CMAKE_CURRENT_SOURCE_DIR}/include
     ${CMAKE_CURRENT_SOURCE_DIR}/third_party
     ${CMAKE_CURRENT_SOURCE_DIR}/ggml/include
-    ${CMAKE_CURRENT_SOURCE_DIR}/ggml/src
+    ${CMAKE_CURRENT_SOURCE_DIR}/ggml/src # for ggml-common.h etc.
+)
+target_include_directories(s2_dll PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/third_party
+    ${CMAKE_CURRENT_SOURCE_DIR}/ggml/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/ggml/src # for ggml-common.h etc.
+)
+target_include_directories(s2_lib PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/third_party
+    ${CMAKE_CURRENT_SOURCE_DIR}/ggml/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/ggml/src # for ggml-common.h etc.
 )
 
 target_link_libraries(s2 PRIVATE ggml)
+target_link_libraries(s2_dll PRIVATE ggml)
+target_link_libraries(s2_lib PRIVATE ggml)
+
+target_compile_definitions(s2_dll PRIVATE S2_LIBRARY)
+target_compile_definitions(s2_lib PRIVATE S2_LIBRARY)
 
 if(S2_VULKAN)
     target_compile_definitions(s2 PRIVATE GGML_USE_VULKAN)
+    target_compile_definitions(s2_dll PRIVATE GGML_USE_VULKAN)
+    target_compile_definitions(s2_lib PRIVATE GGML_USE_VULKAN)
 endif()
 if(S2_CUDA)
     target_compile_definitions(s2 PRIVATE GGML_USE_CUDA)
+    target_compile_definitions(s2_dll PRIVATE GGML_USE_CUDA)
+    target_compile_definitions(s2_lib PRIVATE GGML_USE_CUDA)
 endif()
 
+
 # Platform-specific
 if(UNIX AND NOT APPLE)
     target_link_libraries(s2 PRIVATE pthread m)
+    target_link_libraries(s2_dll PRIVATE pthread m)
+    target_link_libraries(s2_lib PRIVATE pthread m)
+endif()
+
+# Install
+install(TARGETS s2 RUNTIME DESTINATION bin)
+install(TARGETS s2_dll RUNTIME DESTINATION bin LIBRARY DESTINATION lib ARCHIVE DESTINATION lib)
+install(TARGETS s2_lib ARCHIVE DESTINATION lib)
+
+else()
+add_executable(s2 ${S2_SOURCES})
+
+target_include_directories(s2 PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/third_party
+    ${CMAKE_CURRENT_SOURCE_DIR}/ggml/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/ggml/src # for ggml-common.h etc.
+)
+
+target_link_libraries(s2 PRIVATE ggml)
+
+if(S2_VULKAN)
+    target_compile_definitions(s2 PRIVATE GGML_USE_VULKAN)
+endif()
+if(S2_CUDA)
+    target_compile_definitions(s2 PRIVATE GGML_USE_CUDA)
 endif()
 
-if(WIN32)
-    target_link_libraries(s2 PRIVATE ws2_32 wsock32)
+# Platform-specific
+if(UNIX AND NOT APPLE)
+    target_link_libraries(s2 PRIVATE pthread m)
 endif()
 
 # Install
 install(TARGETS s2 RUNTIME DESTINATION bin)
+
+endif()
diff --git a/include/s2_codec.h b/include/s2_codec.h
index a8bb480..c48e480 100644
--- a/include/s2_codec.h
+++ b/include/s2_codec.h
@@ -32,7 +32,7 @@ class AudioCodec {
     // Decode VQ codes to mono float32 audio.
     // codes: (num_codebooks, n_frames) flattened row-major.
     bool decode(const int32_t * codes, int32_t n_frames, int32_t n_threads,
-                std::vector<float> & audio_out);
+                std::vector<float> & audio_out, int32_t* audio_n_frames_out);
 
     int32_t sample_rate() const { return sample_rate_; }
     int32_t hop_length() const { return hop_length_; }
diff --git a/include/s2_config.h b/include/s2_config.h
new file mode 100644
index 0000000..6ec5c8d
--- /dev/null
+++ b/include/s2_config.h
@@ -0,0 +1,25 @@
+#pragma once
+
+// S2_Export marks the functions that make up the public C API.
+#if defined(_WIN32) || defined(__CYGWIN__)
+#ifdef S2_LIBRARY
+#define S2_Export __declspec(dllexport)
+#else
+#define S2_Export
+#endif
+#elif defined(__GNUC__) && __GNUC__ >= 4
+// GCC 4+ and compatible compilers support symbol visibility attributes.
+#define S2_Export __attribute__((visibility("default")))
+#else
+#define S2_Export
+#endif
+
+// Library builds (S2_LIBRARY defined) silence informational console output.
+// The flag must exist on every platform: s2_generate.cpp and s2_model.cpp
+// test it unconditionally, so it cannot live inside the Windows-only branch.
+// `static` gives each translation unit that includes this header its own copy.
+#ifdef S2_LIBRARY
+static bool SuppressNonEssentialVerbosity = true;
+#else
+static bool SuppressNonEssentialVerbosity = false;
+#endif
diff --git a/include/s2_export_api.h b/include/s2_export_api.h
new file mode 100644
index 0000000..4064226
--- /dev/null
+++ b/include/s2_export_api.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "s2_audio.h"
+#include "s2_codec.h"
+#include "s2_generate.h"
+#include "s2_model.h"
+#include "s2_tokenizer.h"
+#include "s2_pipeline.h"
+#include "s2_prompt.h"
+#include "s2_config.h"
+
+extern "C"
+{
+    S2_Export s2::Pipeline* AllocS2Pipeline();
+    S2_Export void ReleaseS2Pipeline(s2::Pipeline* Pipeline);
+    S2_Export void SyncS2TokenizerConfigFromS2Model(s2::SlowARModel* Model, s2::Tokenizer* Tokenizer);
+    S2_Export int InitializeS2Pipeline(s2::Pipeline* Pipeline, s2::Tokenizer* Tokenizer, s2::SlowARModel* Model, s2::AudioCodec* AudioCodec);
+
+    S2_Export s2::GenerateParams* AllocS2GenerateParams();
+    S2_Export void ReleaseS2GenerateParams(s2::GenerateParams* GenerateParams);
+    S2_Export int InitializeS2GenerateParams(s2::GenerateParams* GenerateParams, int32_t max_new_tokens = -1, float temperature = -1, float top_p = -1, int32_t top_k = -1, int32_t min_tokens_before_end = -1, int32_t n_threads = -1, int verbose = -1);
+
+    S2_Export s2::SlowARModel* AllocS2Model();
+    S2_Export void ReleaseS2Model(s2::SlowARModel* Model);
+    S2_Export int InitializeS2Model(s2::SlowARModel* Model, const char* gguf_path, int32_t gpu_device, int32_t backend_type);
+
+    S2_Export s2::Tokenizer* AllocS2Tokenizer();
+    S2_Export void ReleaseS2Tokenizer(s2::Tokenizer* Tokenizer);
+    S2_Export int InitializeS2Tokenizer(s2::Tokenizer* Tokenizer, const char* path);
+
+    S2_Export s2::AudioCodec* AllocS2AudioCodec();
+    S2_Export void ReleaseS2AudioCodec(s2::AudioCodec* AudioCodec);
+    S2_Export int InitializeS2AudioCodec(s2::AudioCodec* AudioCodec, const char* gguf_path, int32_t gpu_device, int32_t backend_type);
+
+    S2_Export std::vector<int32_t>* AllocS2AudioPromptCodes();
+    S2_Export void ReleaseS2AudioPromptCodes(std::vector<int32_t>* AudioPromptCodes);
+    S2_Export int InitializeAudioPromptCodes(s2::Pipeline* Pipeline, int32_t ThreadCount, const char* ReferenceAudioPath, std::vector<int32_t>* AudioPromptCodes, int* TPrompt);
+
+    S2_Export std::vector<float>* AllocS2AudioBuffer(int InitialSize);
+    S2_Export void ReleaseS2AudioBuffer(std::vector<float>* AudioBuffer);
+    S2_Export float* GetS2AudioBufferDataPointer(std::vector<float>* AudioBuffer);
+
+    S2_Export int S2Synthesize(s2::Pipeline* Pipeline, const s2::GenerateParams* GenerateParams, std::vector<float>* AudioBuffer, std::vector<int32_t>* ReferenceAudioPromptCodes, int32_t* ReferenceAudioTPrompt, const char* ReferenceAudioPath, const char* ReferenceAudioTranscript, const char* TextToInfer, const char* OutputAudioPath, int32_t* AudioBufferOutputLength);
+}
\ No newline at end of file
diff --git a/include/s2_pipeline.h b/include/s2_pipeline.h
index 9e1bb31..ed43efd 100644
--- a/include/s2_pipeline.h
+++ b/include/s2_pipeline.h
@@ -36,9 +36,9 @@ class Pipeline {
 
     bool synthesize(const PipelineParams & params);
     bool synthesize_to_memory(const PipelineParams & params, void** ref_audio_buffer, size_t* ref_audio_size, void** wav_buffer, size_t* wav_size);
-    bool synthesize_raw(const PipelineParams & params, AudioData & ref_audio, std::vector<float> & audio_out);
+    bool synthesize_raw(const PipelineParams & params, AudioData & ref_audio, std::vector<float> & audio_out, int32_t* audio_out_length);
 
-private:
+public: // formerly private; s2_export_api.cpp reads and writes these members directly
     Tokenizer tokenizer_;
     SlowARModel model_;
     AudioCodec codec_;
diff --git a/src/s2_codec.cpp b/src/s2_codec.cpp
index 1713631..530866c 100755
--- a/src/s2_codec.cpp
+++ b/src/s2_codec.cpp
@@ -1028,7 +1028,7 @@ bool AudioCodec::encode(const float * audio, int32_t n_samples, int32_t n_thread
 // ---------------------------------------------------------------------------
 
 bool AudioCodec::decode(const int32_t * codes, int32_t n_frames, int32_t n_threads,
-                        std::vector<float> & audio_out) {
+                        std::vector<float> & audio_out, int32_t* audio_n_frames_out) {
     if (n_frames <= 0) return false;
 
     // Step 1: dequantize VQ codes to stage vector (n_frames, quantizer_input_dim)
@@ -1132,10 +1132,12 @@ bool AudioCodec::decode(const int32_t * codes, int32_t n_frames, int32_t n_threa
 
         // audio_t is (1, T) or (C, T) — we expect (1, T), take total elements
         const int32_t n_samples = static_cast<int32_t>(ggml_nelements(audio_t));
-        audio_out.resize(n_samples);
+        if (audio_out.size() < static_cast<size_t>(n_samples)) { audio_out.resize(n_samples); } // Grow only, so a pre-allocated buffer is reused as-is; the number of valid samples is reported through audio_n_frames_out.
ggml_backend_tensor_get(audio_t, audio_out.data(), 0, n_samples * sizeof(float)); ggml_gallocr_free(allocr); ggml_free(ctx); + + *audio_n_frames_out = n_samples; } return true; } diff --git a/src/s2_export_api.cpp b/src/s2_export_api.cpp new file mode 100644 index 0000000..082a7d3 --- /dev/null +++ b/src/s2_export_api.cpp @@ -0,0 +1,202 @@ +#include "../include/s2_export_api.h" + +s2::Pipeline* AllocS2Pipeline() +{ + return new s2::Pipeline(); +} +void ReleaseS2Pipeline(s2::Pipeline* Pipeline) +{ + delete Pipeline; +} +void SyncS2TokenizerConfigFromS2Model(s2::SlowARModel* Model, s2::Tokenizer* Tokenizer) +{ + const s2::ModelHParams & hp = Model->hparams(); + s2::TokenizerConfig & tc = Tokenizer->config(); + if (hp.semantic_begin_id > 0) tc.semantic_begin_id = hp.semantic_begin_id; + if (hp.semantic_end_id > 0) tc.semantic_end_id = hp.semantic_end_id; + if (hp.num_codebooks > 0) tc.num_codebooks = hp.num_codebooks; + if (hp.codebook_size > 0) tc.codebook_size = hp.codebook_size; + if (hp.vocab_size > 0) tc.vocab_size = hp.vocab_size; +} +int InitializeS2Pipeline(s2::Pipeline* Pipeline, s2::Tokenizer* Tokenizer, s2::SlowARModel* Model, s2::AudioCodec* AudioCodec) +{ + if(!Pipeline->initialized_) + { + Pipeline->tokenizer_ = *Tokenizer; + Pipeline->model_ = *Model; + Pipeline->codec_ = *AudioCodec; + Pipeline->initialized_ = true; + return true; + } + return false; +} + +s2::GenerateParams* AllocS2GenerateParams() +{ + return new s2::GenerateParams(); +} +void ReleaseS2GenerateParams(s2::GenerateParams* GenerateParams) +{ + delete GenerateParams; +} +int InitializeS2GenerateParams(s2::GenerateParams* GenerateParams, int32_t max_new_tokens, float temperature, float top_p, int32_t top_k, int32_t min_tokens_before_end, int32_t n_threads, int verbose) +{ + GenerateParams->max_new_tokens = max_new_tokens >= 0 ? max_new_tokens : GenerateParams->max_new_tokens; + GenerateParams->temperature = temperature >= 0 ? 
temperature : GenerateParams->temperature; + GenerateParams->top_p = top_p >= 0 ? top_p : GenerateParams->top_p; + GenerateParams->top_k = top_k >= 0 ? top_k : GenerateParams->top_k; + GenerateParams->min_tokens_before_end = min_tokens_before_end >= 0 ? min_tokens_before_end : GenerateParams->min_tokens_before_end; + GenerateParams->n_threads = n_threads >= 0 ? n_threads : GenerateParams->n_threads; + GenerateParams->verbose = verbose >= 0 ? verbose : GenerateParams->verbose; + return true; +} + +s2::SlowARModel* AllocS2Model() +{ + return new s2::SlowARModel(); +} +void ReleaseS2Model(s2::SlowARModel* Model) +{ + delete Model; +} +int InitializeS2Model(s2::SlowARModel* Model, const char* gguf_path, int32_t gpu_device, int32_t backend_type) +{ + return Model->load(std::string(gguf_path), gpu_device, backend_type); +} + +s2::Tokenizer* AllocS2Tokenizer() +{ + return new s2::Tokenizer(); +} +void ReleaseS2Tokenizer(s2::Tokenizer* Tokenizer) +{ + delete Tokenizer; +} +int InitializeS2Tokenizer(s2::Tokenizer* Tokenizer, const char* path) +{ + return Tokenizer->load(std::string(path)); +} + +s2::AudioCodec* AllocS2AudioCodec() +{ + return new s2::AudioCodec(); +} +void ReleaseS2AudioCodec(s2::AudioCodec* AudioCodec) +{ + delete AudioCodec; +} +int InitializeS2AudioCodec(s2::AudioCodec* AudioCodec, const char* gguf_path, int32_t gpu_device, int32_t backend_type) +{ + return AudioCodec->load(std::string(gguf_path), gpu_device, backend_type); +} + +std::vector* AllocS2AudioPromptCodes() +{ + return new std::vector(); +} +void ReleaseS2AudioPromptCodes(std::vector* AudioPromptCodes) +{ + delete AudioPromptCodes; +} +int InitializeAudioPromptCodes(s2::Pipeline* Pipeline, int32_t ThreadCount, const char* ReferenceAudioPath, std::vector* AudioPromptCodes, int* TPrompt) +{ + int ReturnCode = 1; + if(AudioPromptCodes->size() == 0) + { + if (ReferenceAudioPath != NULL) { + s2::AudioData ref_audio; + if (load_audio(std::string(ReferenceAudioPath), ref_audio, 
Pipeline->codec_.sample_rate())) { + if (!Pipeline->codec_.encode(ref_audio.samples.data(), (int32_t)ref_audio.samples.size(), + ThreadCount, *AudioPromptCodes, *TPrompt)) { + ReturnCode = -1; //Pipeline warning: encode failed, running without reference audio. + AudioPromptCodes->clear(); + *TPrompt = 0; + } + } else { + ReturnCode = -2; //Pipeline warning: load_audio failed, running without reference audio. + } + } + } + return ReturnCode; +} + +std::vector* AllocS2AudioBuffer(int InitialSize) +{ + return InitialSize > 0 ? new std::vector(InitialSize) : new std::vector(); +} +void ReleaseS2AudioBuffer(std::vector* AudioBuffer) +{ + delete AudioBuffer; +} +float* GetS2AudioBufferDataPointer(std::vector* AudioBuffer) +{ + return AudioBuffer->data(); +} + +int S2Synthesize(s2::Pipeline* Pipeline, const s2::GenerateParams* GenerateParams, std::vector* AudioBuffer, std::vector* ReferenceAudioPromptCodes, int32_t* ReferenceAudioTPrompt, const char* ReferenceAudioPath, const char* ReferenceAudioTranscript, const char* TextToInfer, const char* OutputAudioPath, int32_t* AudioBufferOutputLength) +{ + if(Pipeline->initialized_) + { + const int32_t num_codebooks = Pipeline->model_.hparams().num_codebooks; int ReturnCode = true; + + // 1. Audio Prompt Loading + // encode() returns codes in row-major (num_codebooks, T_prompt) format, + // matching the layout expected by build_prompt() (prompt_codes[c*T+t]). + if(ReferenceAudioPromptCodes->size() == 0) + { + if (ReferenceAudioPath != NULL) { + s2::AudioData ref_audio; + if (load_audio(std::string(ReferenceAudioPath), ref_audio, Pipeline->codec_.sample_rate())) { + if (!Pipeline->codec_.encode(ref_audio.samples.data(), (int32_t)ref_audio.samples.size(), + GenerateParams->n_threads, *ReferenceAudioPromptCodes, *ReferenceAudioTPrompt)) { + ReturnCode = -1; //Pipeline warning: encode failed, running without reference audio. 
+ ReferenceAudioPromptCodes->clear(); + *ReferenceAudioTPrompt = 0; + } + } else { + ReturnCode = -2; //Pipeline warning: load_audio failed, running without reference audio. + } + } + } + + // 2. Build Prompt Tensor + // build_prompt expects prompt_codes as (num_codebooks, T_prompt) row-major, + // which is exactly the format produced by encode() above. + s2::PromptTensor prompt = s2::build_prompt( + Pipeline->tokenizer_, std::string(TextToInfer), std::string(ReferenceAudioTranscript), + ReferenceAudioPromptCodes->empty() ? nullptr : ReferenceAudioPromptCodes->data(), + num_codebooks, *ReferenceAudioTPrompt); + + // 3. Setup KV Cache + int32_t max_seq_len = prompt.cols + GenerateParams->max_new_tokens; + if (!Pipeline->model_.init_kv_cache(max_seq_len)) { + return -3; //Pipeline error: init_kv_cache failed. + } + + // 4. Generate + // generate() returns GenerateResult.codes in row-major (num_codebooks, n_frames). + s2::GenerateResult res = s2::generate(Pipeline->model_, Pipeline->tokenizer_.config(), prompt, *GenerateParams); + if (res.n_frames == 0) { + return -4; //Pipeline error: generation produced no frames. + } + + // 5. Decode + // codec_.decode() receives codes in row-major (num_codebooks, n_frames), + // which matches GenerateResult.codes layout. + std::vector* audio_out = AudioBuffer == NULL ? new std::vector() : AudioBuffer; int32_t audio_n_frames_out = 0; + if (!Pipeline->codec_.decode(res.codes.data(), res.n_frames, GenerateParams->n_threads, *audio_out, audio_n_frames_out)) { + return -5; //Pipeline error: decode failed. + } + if(AudioBufferOutputLength != NULL) { *AudioBufferOutputLength = audio_n_frames_out; } + + if(OutputAudioPath != NULL) + { + // 6. Save + if (!s2::save_audio(std::string(OutputAudioPath), *audio_out, Pipeline->codec_.sample_rate())) { + return -6; //Pipeline error: save_audio failed to + (params.output_path). 
+ } + } + return ReturnCode; + } + return false; +} \ No newline at end of file diff --git a/src/s2_generate.cpp b/src/s2_generate.cpp index b96183b..05d7d7a 100755 --- a/src/s2_generate.cpp +++ b/src/s2_generate.cpp @@ -1,3 +1,4 @@ +#include "../include/s2_config.h" #include "../include/s2_generate.h" #include #include @@ -47,7 +48,7 @@ GenerateResult generate( StepResult state; if (params.verbose) { - std::cout << "[Generate] Prefilling " << prompt.cols << " tokens..." << std::endl; + if(!SuppressNonEssentialVerbosity) { std::cout << "[Generate] Prefilling " << prompt.cols << " tokens..." << std::endl; } } if (!model.prefill(prompt_tm, prompt.cols, params.n_threads, state)) { std::cerr << "[Generate] Prefill failed." << std::endl; @@ -93,7 +94,7 @@ GenerateResult generate( const float ras_high_top_p = 0.9f; if (params.verbose) { - std::cout << "[Generate] Generating (max " << params.max_new_tokens << " tokens)..." << std::endl; + if(!SuppressNonEssentialVerbosity) { std::cout << "[Generate] Generating (max " << params.max_new_tokens << " tokens)..." << std::endl; } } int32_t step = 0; @@ -170,7 +171,7 @@ GenerateResult generate( step++; if (params.verbose && step % 50 == 0) { - std::cout << "\r[Generate] " << step << " / " << params.max_new_tokens << " tokens..." << std::flush; + if(!SuppressNonEssentialVerbosity) { std::cout << "\r[Generate] " << step << " / " << params.max_new_tokens << " tokens..." << std::flush; } } // Apply semantic mask and sample next main token @@ -179,8 +180,10 @@ GenerateResult generate( } if (params.verbose) { + if(!SuppressNonEssentialVerbosity) { std::cout << std::endl; std::cout << "[Generate] Done: " << out.n_frames << " frames generated." 
<< std::endl; + } } // Compact codes from (num_cb, max_tokens) stride to (num_cb, n_frames) row-major diff --git a/src/s2_model.cpp b/src/s2_model.cpp index 350716f..51f5c8d 100755 --- a/src/s2_model.cpp +++ b/src/s2_model.cpp @@ -1,3 +1,4 @@ +#include "../include/s2_config.h" #include "../include/s2_model.h" #include #include @@ -101,7 +102,7 @@ bool SlowARModel::load(const std::string & gguf_path, int32_t gpu_device, int32_ if (!backend_ && backend_type == 0) { backend_ = ggml_backend_vk_init(static_cast(gpu_device)); if (!backend_) { - std::cerr << "[Model] Vulkan init failed, falling back to CPU." << std::endl; + if(!SuppressNonEssentialVerbosity) { std::cerr << "[Model] Vulkan init failed, falling back to CPU." << std::endl; } } } #endif @@ -109,12 +110,13 @@ bool SlowARModel::load(const std::string & gguf_path, int32_t gpu_device, int32_ if (!backend_ && backend_type == 1) { backend_ = ggml_backend_cuda_init(static_cast(gpu_device)); if (!backend_) { - std::cerr << "[Model] Cuda init failed, falling back to CPU." << std::endl; + if(!SuppressNonEssentialVerbosity) { std::cerr << "[Model] Cuda init failed, falling back to CPU." << std::endl; } } } #endif - if (!backend_) { - std::cerr << "[Model] NPU not compiled, falling back to CPU." << std::endl; + if (!backend_) + { + if(!SuppressNonEssentialVerbosity) { std::cerr << "[Model] NPU not compiled, falling back to CPU." 
<< std::endl; } } } if (!backend_) { @@ -135,28 +137,28 @@ bool SlowARModel::load(const std::string & gguf_path, int32_t gpu_device, int32_ return false; } - std::cout << "[Model] Reading metadata from " << gguf_path << std::endl; + if(!SuppressNonEssentialVerbosity) { std::cout << "[Model] Reading metadata from " << gguf_path << std::endl; } // Helpers to read GGUF metadata auto get_u32 = [&](const char * key, uint32_t def) -> uint32_t { int id = gguf_find_key(ctx_gguf, key); - if (id < 0) { std::cerr << "[GGUF] missing key: " << key << " (using default " << def << ")\n"; return def; } + if (id < 0) { if(!SuppressNonEssentialVerbosity) { std::cerr << "[GGUF] missing key: " << key << " (using default " << def << ")\n"; } return def; } uint32_t v = gguf_get_val_u32(ctx_gguf, id); - std::cout << "[GGUF] " << key << " = " << v << "\n"; + if(!SuppressNonEssentialVerbosity) { std::cout << "[GGUF] " << key << " = " << v << "\n"; } return v; }; auto get_f32 = [&](const char * key, float def) -> float { int id = gguf_find_key(ctx_gguf, key); - if (id < 0) { std::cerr << "[GGUF] missing key: " << key << " (using default " << def << ")\n"; return def; } + if (id < 0) { if(!SuppressNonEssentialVerbosity) { std::cerr << "[GGUF] missing key: " << key << " (using default " << def << ")\n"; } return def; } float v = gguf_get_val_f32(ctx_gguf, id); - std::cout << "[GGUF] " << key << " = " << v << "\n"; + if(!SuppressNonEssentialVerbosity) { std::cout << "[GGUF] " << key << " = " << v << "\n"; } return v; }; auto get_bool = [&](const char * key, bool def) -> bool { int id = gguf_find_key(ctx_gguf, key); - if (id < 0) { std::cerr << "[GGUF] missing key: " << key << " (using default " << (def?"true":"false") << ")\n"; return def; } + if (id < 0) { if(!SuppressNonEssentialVerbosity) { std::cerr << "[GGUF] missing key: " << key << " (using default " << (def?"true":"false") << ")\n"; } return def; } bool v = gguf_get_val_bool(ctx_gguf, id); - std::cout << "[GGUF] " << key << " = " << 
(v?"true":"false") << "\n"; + if(!SuppressNonEssentialVerbosity) { std::cout << "[GGUF] " << key << " = " << (v?"true":"false") << "\n"; } return v; }; @@ -170,7 +172,7 @@ bool SlowARModel::load(const std::string & gguf_path, int32_t gpu_device, int32_ std::string arch = gguf_get_val_str(ctx_gguf, arch_id); arch_prefix = arch + "."; hparams_.has_fast_decoder = (arch == "fish-speech"); - std::cout << "[Model] Architecture: " << arch << std::endl; + if(!SuppressNonEssentialVerbosity) { std::cout << "[Model] Architecture: " << arch << std::endl; } } } @@ -209,11 +211,13 @@ bool SlowARModel::load(const std::string & gguf_path, int32_t gpu_device, int32_ hparams_.fast_has_project_in = get_bool("fish_speech.fast_project_in", false); } + if(!SuppressNonEssentialVerbosity) { std::cout << "[Model] Layers: " << hparams_.block_count << ", Dim: " << hparams_.embedding_length << ", Vocab: " << hparams_.vocab_size << ", head_count: " << hparams_.head_count << ", has_fast_decoder: " << hparams_.has_fast_decoder << std::endl; + } // --------------------------------------------------------------------------- // Load tensor pointers (metadata only — data loaded below) @@ -341,7 +345,7 @@ bool SlowARModel::load(const std::string & gguf_path, int32_t gpu_device, int32_ } #endif - std::cout << "[Model] Weights loaded. Total tensors: " << n_tensors << std::endl; + if(!SuppressNonEssentialVerbosity) { std::cout << "[Model] Weights loaded. 
Total tensors: " << n_tensors << std::endl; } gguf_free(ctx_gguf); return true; diff --git a/src/s2_pipeline.cpp b/src/s2_pipeline.cpp index 70e015d..b007b74 100644 --- a/src/s2_pipeline.cpp +++ b/src/s2_pipeline.cpp @@ -64,8 +64,8 @@ bool Pipeline::synthesize(const PipelineParams & params) { safe_print_error_ln("Pipeline warning: load_audio failed, running without reference audio."); } } - - if (!this->synthesize_raw(params, ref_audio, audio_out)) { + int32_t AudioOutFrames = 0; + if (!this->synthesize_raw(params, ref_audio, audio_out, &AudioOutFrames)) { safe_print_error_ln("Pipeline error: synthesis failed."); return false; } @@ -105,7 +105,8 @@ bool Pipeline::synthesize_to_memory(const PipelineParams & params, void** ref_au } } - if (!this->synthesize_raw(params, ref_audio, audio_out)) { + int32_t AudioOutFrames = 0; + if (!this->synthesize_raw(params, ref_audio, audio_out, &AudioOutFrames)) { safe_print_error_ln("Pipeline error: synthesis failed."); return false; } @@ -132,7 +133,7 @@ bool Pipeline::synthesize_to_memory(const PipelineParams & params, void** ref_au return true; } -bool Pipeline::synthesize_raw(const PipelineParams & params, AudioData & ref_audio, std::vector& audio_out) { +bool Pipeline::synthesize_raw(const PipelineParams & params, AudioData & ref_audio, std::vector& audio_out, int32_t* audio_out_length) { std::lock_guard lock(synthesize_mutex_); if (!initialized_) { @@ -177,7 +178,7 @@ bool Pipeline::synthesize_raw(const PipelineParams & params, AudioData & ref_aud return false; } - if (!codec_.decode(res.codes.data(), res.n_frames, params.gen.n_threads, audio_out)) { + if (!codec_.decode(res.codes.data(), res.n_frames, params.gen.n_threads, audio_out, audio_out_length)) { safe_print_error_ln("Pipeline error: decode failed."); return false; }