From 696f69cb04cad0b540d5c55a1101a4d9ce3cc4b5 Mon Sep 17 00:00:00 2001 From: Muhammad Ahmed Ghani Date: Wed, 18 Mar 2026 12:30:11 +0000 Subject: [PATCH] Add Gradio demo and support for newer CUDA architectures - Added interactive web demo (gradio_demo.py) for video dubbing - Fixed CUDA compute capability validation for RTX 5090 and newer GPUs - Updated documentation with demo instructions in both English and Chinese - Changed git clone from SSH to HTTPS in READMEs - Added Gradio dependencies to requirements.txt --- README.md | 11 +- README_zh.md | 11 +- funcineforge/models/causal_hifigan.py | 9 +- funcineforge/models/inference_model.py | 9 +- gradio_demo.py | 496 +++++++++++++++++++++++++ requirements.txt | 6 +- 6 files changed, 531 insertions(+), 11 deletions(-) create mode 100644 gradio_demo.py diff --git a/README.md b/README.md index 178fe97..01f2d65 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Fun-CineForge relies on Conda and Python environments. Execute **setup.py** to a ```shell # Conda -git clone git@github.com:FunAudioLLM/FunCineForge.git +git clone https://github.com/FunAudioLLM/FunCineForge.git conda create -n FunCineForge python=3.10 -y && conda activate FunCineForge sudo apt-get install ffmpeg # Initial settings @@ -111,6 +111,15 @@ cd exps bash infer.sh ``` +### Interactive Demo +Want to try it out interactively? Launch the Gradio demo for a web-based interface: + +```shell +python gradio_demo.py +``` + +The demo will start at `http://localhost:7860` where you can upload your own videos and audio references, customize voice prompts, and generate dubbed videos on the fly. + The API for multi-speaker dubbing from raw videos and SRT scripts is under development ... diff --git a/README_zh.md b/README_zh.md index 074275e..3653302 100644 --- a/README_zh.md +++ b/README_zh.md @@ -36,7 +36,7 @@ Fun-CineForge 依赖 Conda 和 Python 环境。执行 **setup.py** 自动安装 ```shell # Conda -git clone git@github.com:FunAudioLLM/FunCineForge.git +git clone https://github.com/FunAudioLLM/FunCineForge.git conda create -n FunCineForge python=3.10 -y && conda activate FunCineForge sudo apt-get install ffmpeg # 初始化设置 @@ -109,6 +109,15 @@ cd exps bash infer.sh ``` +### 交互式演示 +想要交互式体验吗?启动 Gradio 演示程序,使用基于网页的界面: + +```shell +python gradio_demo.py +``` + +演示程序将在 `http://localhost:7860` 启动,您可以上传自己的视频和音频参考,自定义语音提示,即时生成配音视频。 + 从原始视频和 SRT 脚本进行多人配音的 API 调用接口在开发中 ... diff --git a/funcineforge/models/causal_hifigan.py b/funcineforge/models/causal_hifigan.py index 28ac80e..4c64283 100644 --- a/funcineforge/models/causal_hifigan.py +++ b/funcineforge/models/causal_hifigan.py @@ -16,6 +16,7 @@ from torch.nn.utils.parametrize import remove_parametrizations from torch.nn.utils.parametrizations import weight_norm import logging +import soundfile as sf from funcineforge.register import tables from funcineforge.utils.device_funcs import to_device import os @@ -828,9 +829,11 @@ def inference( new_freq=output_sr ) wav_sr = output_sr - torchaudio.save( - os.path.join(wav_out_dir, f"{key[0]}.wav"), recon_speech.cpu(), - sample_rate=wav_sr, encoding='PCM_S', bits_per_sample=16 + sf.write( + os.path.join(wav_out_dir, f"{key[0]}.wav"), + recon_speech.cpu().squeeze(0).numpy(), + samplerate=wav_sr, + subtype='PCM_16' ) return recon_speech \ No newline at end of file diff --git a/funcineforge/models/inference_model.py b/funcineforge/models/inference_model.py index 2149b43..b79c90c 100644 --- a/funcineforge/models/inference_model.py +++ b/funcineforge/models/inference_model.py @@ -4,6 +4,7 @@ import numpy as np import os import torchaudio +import soundfile as sf import time import shutil from funcineforge.register import tables @@ -70,9 +71,11 @@ def inference( wav_out_dir = os.path.join(output_dir, "wav") os.makedirs(wav_out_dir, exist_ok=True) output_wav_path = os.path.join(wav_out_dir, f"{key[0]}.wav") - torchaudio.save( - output_wav_path, wav.cpu(), - sample_rate=self.sample_rate, encoding='PCM_S', bits_per_sample=16 + sf.write( + output_wav_path, + wav.cpu().squeeze(0).numpy(), + samplerate=self.sample_rate, + subtype='PCM_16' ) silent_video_path = data_in[0]["video"] diff --git a/gradio_demo.py b/gradio_demo.py new file mode 100644 index 0000000..281c7db --- /dev/null +++ b/gradio_demo.py @@ -0,0 +1,496 @@ +#!/usr/bin/env python3 +""" +FunCineForge Gradio Demo +Run from the repo root: python gradio_demo.py +""" + +import os, sys, json, pickle, tempfile, uuid, shutil, logging +import numpy as np +import cv2 +import gradio as gr + +# ── make sure the repo root is on the path ───────────────────────────────── +ROOT = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, ROOT) +EXPS_DIR = os.path.join(ROOT, "exps") + +# ── checkpoint paths (relative to exps/) ─────────────────────────────────── +LM_CKPT = "funcineforge_zh_en/llm/ds-model.pt.best/mp_rank_00_model_states.pt" +FM_CKPT = "funcineforge_zh_en/flow/ds-model.pt.best/mp_rank_00_model_states.pt" +VOC_CKPT = "funcineforge_zh_en/vocoder/ds-model.pt.best/avg_5_removewn.pt" +FACE_ONNX = os.path.join(ROOT, "speaker_diarization/pretrained_models/face_recog_ir101.onnx") +DECODE_CONF = os.path.join(EXPS_DIR, "decode_conf") + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("FunCineForge-Demo") + +# ── lazy model loading ────────────────────────────────────────────────────── +_pipeline = None + +def get_pipeline(): + global _pipeline + if _pipeline is not None: + return _pipeline + + import torch + from funcineforge import AutoModel + from funcineforge.models.utils import dtype_map + + os.chdir(EXPS_DIR) + + def _load(exp_dir, model_name, ckpt_path, device="cuda:0"): + return AutoModel( + model=os.path.join(exp_dir, model_name), + init_param=ckpt_path, + output_dir=None, + device=device, + ) + + log.info("Loading LM model …") + lm_exp_dir, lm_model_name, _, _ = LM_CKPT.rsplit("/", 3) + lm_model = _load(lm_exp_dir, lm_model_name, LM_CKPT) + lm_model.model.to(dtype_map["fp32"]) + + log.info("Loading FM model …") + fm_exp_dir, fm_model_name, _, _ = FM_CKPT.rsplit("/", 3) + fm_model = _load(fm_exp_dir, fm_model_name, FM_CKPT) + fm_model.model.to(dtype_map["fp32"]) + + log.info("Loading Vocoder …") + voc_exp_dir, voc_model_name, _, _ = VOC_CKPT.rsplit("/", 3) + voc_model = _load(voc_exp_dir, voc_model_name, VOC_CKPT) + voc_model.model.to(dtype_map["fp32"]) + + log.info("Building inference model …") + from funcineforge.register import tables + + infer_model = AutoModel( + model="FunCineForgeInferModel", + model_conf={}, + output_dir=None, + device="cuda:0", + lm_model=lm_model, + fm_model=fm_model, + voc_model=voc_model, + tokenizer=None, + # decode params + sampling="ras", + lm_use_prompt=True, + fm_use_prompt=True, + use_llm_cache=True, + max_length=1500, + min_length=50, + llm_dtype="fp32", + fm_dtype="fp32", + voc_dtype="fp32", + batch_size=1, + xvec_model="funcineforge_zh_en/camplus.onnx", + dataset_conf={ + "load_meta_data_key": "text,clue,face,dialogue,vocal,video", + "sos": 6561, "eos": 6562, + "turn_of_speech": 6563, "fill_token": 6564, + "ignore_id": -100, + "startofclue_token": 151646, "endofclue_token": 151647, + "frame_shift": 25, "timebook_size": 1500, + "pangbai": 1500, "dubai": 1501, "duihua": 1502, "duoren": 1503, + "male": 1504, "female": 1505, + "child": 1506, "youth": 1507, "adult": 1508, + "middle": 1509, "elderly": 1510, + "speaker_id_start": 1511, + }, + index_ds="FunCineForgeDS", + disable_pbar=True, + random_seed=0, + ) + + _pipeline = infer_model + log.info("Pipeline ready ✓") + return _pipeline + + +# ── face embedding extraction using the ONNX model ───────────────────────── +_face_ort = None + +def get_face_ort(): + global _face_ort + if _face_ort is None: + import onnxruntime + opts = onnxruntime.SessionOptions() + opts.intra_op_num_threads = 4 + opts.inter_op_num_threads = 4 + _face_ort = onnxruntime.InferenceSession( + FACE_ONNX, sess_options=opts, providers=["CPUExecutionProvider"] + ) + return _face_ort + + +def preprocess_face(img_bgr: np.ndarray) -> np.ndarray: + """Resize & normalise a face crop for the IR101 face-rec model (112×112).""" + face = cv2.resize(img_bgr, (112, 112)) + face = cv2.cvtColor(face, cv2.COLOR_BGR2RGB).astype(np.float32) + face = (face - 127.5) / 127.5 + face = face.transpose(2, 0, 1)[np.newaxis] # (1, 3, 112, 112) + return face + + +def extract_face_embeddings_from_video(video_path: str, every_n_frames: int = 5): + """ + Sample frames from a video every `every_n_frames` frames, + detect the dominant face bounding box (using simple haar or centre-crop), + and compute 512-d embeddings with the ONNX face-rec model. + + Returns a dict with keys: embeddings, faceI, frameI (matching the .pkl format). + """ + ort = get_face_ort() + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {video_path}") + + face_cascade = cv2.CascadeClassifier( + cv2.data.haarcascades + "haarcascade_frontalface_default.xml" + ) + + embeddings, faceI, frameI = [], [], [] + speech_token_idx = 0 # maps to speech-token index (25 tokens/sec → one token per 40 ms) + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + + frame_idx = 0 + while True: + ret, frame = cap.read() + if not ret: + break + if frame_idx % every_n_frames == 0: + # Map video-frame idx → speech-token idx + token_idx = int(frame_idx / fps * 25) # 25 tokens/s + + # Try face detection; fall back to centre crop + gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + faces = face_cascade.detectMultiScale(gray, 1.1, 5, minSize=(40, 40)) + if len(faces) > 0: + x, y, w, h = sorted(faces, key=lambda f: f[2] * f[3], reverse=True)[0] + crop = frame[y: y + h, x: x + w] + else: + # centre-crop fallback + h, w = frame.shape[:2] + side = min(h, w) + y0, x0 = (h - side) // 2, (w - side) // 2 + crop = frame[y0: y0 + side, x0: x0 + side] + + inp = preprocess_face(crop) + emb = ort.run(None, {ort.get_inputs()[0].name: inp})[0][0] # (512,) + embeddings.append(emb.astype(np.float32)) + faceI.append(token_idx) + frameI.append(token_idx) + frame_idx += 1 + + cap.release() + return {"embeddings": embeddings, "faceI": np.array(faceI), "frameI": np.array(frameI), + "face": [], "face_bbox": [], "lip": [], "lip_bbox": []} + + +# ── inference helper ──────────────────────────────────────────────────────── +def run_inference( + text: str, + clue: str, + vocal_path: str, + video_path: str, + gender: str, + age: str, + speech_type: str, + duration: float, + output_dir: str, + utt_id: str, +): + """Build the data dict the pipeline expects and run inference.""" + from funcineforge.register import tables + + # ── face embeddings ───────────────────────────────────────────────────── + log.info("Extracting face embeddings …") + face_pkl_path = os.path.join(output_dir, f"{utt_id}.pkl") + face_data = extract_face_embeddings_from_video(video_path) + with open(face_pkl_path, "wb") as f: + pickle.dump(face_data, f) + + # ── dialogue metadata ─────────────────────────────────────────────────── + dialogue = [{"start": 0.0, "duration": duration, "spk": "1", + "gender": gender.lower(), "age": age.lower()}] + + # ── dataset conf (must match decode.yaml) ────────────────────────────── + TIMEBOOK_SIZE = 1500 + TYPE_MAP = {"monologue": TIMEBOOK_SIZE + 1, "dialogue": TIMEBOOK_SIZE + 2, + "narration": TIMEBOOK_SIZE, "multi-speaker": TIMEBOOK_SIZE + 3} + type_id = TYPE_MAP.get(speech_type, TIMEBOOK_SIZE + 1) + + GENDER_MAP = {"male": TIMEBOOK_SIZE + 4, "female": TIMEBOOK_SIZE + 5} + AGE_MAP = {"child": TIMEBOOK_SIZE + 6, "teenager": TIMEBOOK_SIZE + 7, + "adult": TIMEBOOK_SIZE + 8, "middle-aged": TIMEBOOK_SIZE + 9, + "elderly": TIMEBOOK_SIZE + 10} + SPEAKER_ID_START = TIMEBOOK_SIZE + 11 + FRAME_SHIFT = 25 + + starts = np.array([d["start"] for d in dialogue]) + durations= np.array([d["duration"] for d in dialogue]) + speakers = np.array([int(d["spk"]) for d in dialogue]) + start_idxs = (starts * FRAME_SHIFT + 1).astype(np.int64) + end_idxs = ((starts + durations) * FRAME_SHIFT + 1).astype(np.int64) + spk_ids = (SPEAKER_ID_START + speakers - 1).astype(np.int64) + gender_ids = [GENDER_MAP.get(d["gender"], -100) for d in dialogue] + age_ids = [AGE_MAP.get(d["age"], -100) for d in dialogue] + + n = len(dialogue) + timespk_ids = np.full(n * 5, -100, dtype=np.int64) + timespk_ids[0::5] = start_idxs + timespk_ids[1::5] = spk_ids + timespk_ids[2::5] = gender_ids + timespk_ids[3::5] = age_ids + timespk_ids[4::5] = end_idxs + + # ── build data dict ───────────────────────────────────────────────────── + data = { + "utt": utt_id, + "text": text, + "clue": clue, + "vocal": vocal_path, + "video": video_path, + "face": face_pkl_path, + "type_id": type_id, + "timespk_ids": timespk_ids, + "speech_len": int(duration * FRAME_SHIFT), + "source_len": int(duration * FRAME_SHIFT) * 2 + 200, + } + + # ── run the pipeline ──────────────────────────────────────────────────── + pipeline = get_pipeline() + + class _SingleDS: + def __len__(self): return 1 + def __getitem__(self, _): return data + + pipeline.kwargs["output_dir"] = output_dir + pipeline.inference(input=_SingleDS(), input_len=1) + + wav_path = os.path.join(output_dir, "wav", f"{utt_id}.wav") + mp4_path = os.path.join(output_dir, "mp4", f"{utt_id}.mp4") + return (wav_path if os.path.exists(wav_path) else None, + mp4_path if os.path.exists(mp4_path) else None) + + +# ── Gradio callback ───────────────────────────────────────────────────────── +def infer_gradio( + text, clue, vocal_file, video_file, + gender, age, speech_type, + duration, progress=gr.Progress(track_tqdm=True) +): + if not text.strip(): + raise gr.Error("Please enter the script text.") + if vocal_file is None: + raise gr.Error("Please upload a reference audio file.") + if video_file is None: + raise gr.Error("Please upload a video file.") + + # Get the actual file paths (Gradio passes temp file paths) + vocal_path = vocal_file if isinstance(vocal_file, str) else vocal_file.name + video_path = video_file if isinstance(video_file, str) else video_file.name + + # Create a unique temp output dir + run_id = str(uuid.uuid4())[:8] + out_dir = os.path.join(tempfile.gettempdir(), f"fcf_{run_id}") + os.makedirs(out_dir, exist_ok=True) + + try: + progress(0.1, desc="Loading models …") + get_pipeline() # ensure models are loaded + + progress(0.3, desc="Extracting face embeddings …") + wav_out, mp4_out = run_inference( + text=text.strip(), + clue=clue.strip() if clue.strip() else "A speaker delivers the dialogue naturally.", + vocal_path=vocal_path, + video_path=video_path, + gender=gender, + age=age, + speech_type=speech_type, + duration=float(duration), + output_dir=out_dir, + utt_id=f"demo_{run_id}", + ) + progress(1.0, desc="Done!") + + if wav_out is None: + raise gr.Error("Inference completed but no output audio was found.") + + # Copy outputs to stable temp paths so Gradio can serve them + final_wav = os.path.join(tempfile.gettempdir(), f"fcf_{run_id}_out.wav") + shutil.copy2(wav_out, final_wav) + + final_mp4 = None + if mp4_out and os.path.exists(mp4_out): + final_mp4 = os.path.join(tempfile.gettempdir(), f"fcf_{run_id}_out.mp4") + shutil.copy2(mp4_out, final_mp4) + + return final_wav, final_mp4, "✅ Generation complete!" + + except Exception as e: + log.exception("Inference failed") + shutil.rmtree(out_dir, ignore_errors=True) + raise gr.Error(f"Inference failed: {e}") + + +# ── UI ────────────────────────────────────────────────────────────────────── +TITLE = "Video Dubbing Demo" +DESCRIPTION = """ +This demo showcases **FunCineForge**, a cinematic speech synthesis system. + +It uses an a pipeline with LLM for prosody modeling, Flow-Matching for acoustic modeling, and a high-fidelity Vocoder. By leveraging both text and visual cues (from the reference video's face embeddings), it generates character-accurate, emotionally charged speech perfectly suited for film and video dubbing. +""" + +CUSTOM_CSS = """ + #generate-btn { background: linear-gradient(135deg, #7c3aed, #4f46e5); color: white; font-size: 1.1rem; } + #status-box { font-size: 0.9rem; color: #6b7280; } + .gr-panel { border-radius: 12px !important; } + footer { display: none !important; } +""" +DEMO_THEME = gr.themes.Ocean() + +with gr.Blocks(title="FunCineForge Demo") as demo: + gr.Markdown(f"# {TITLE}\n{DESCRIPTION}") + + with gr.Row(): + # ── LEFT: inputs ──────────────────────────────────────────────────── + with gr.Column(scale=3): + gr.Markdown("### Script & Prompt") + text_input = gr.Textbox( + label="Script Text", + placeholder="Enter the dialogue or narration text here …", + lines=3, + ) + clue_input = gr.Textbox( + label="Voice / Emotion Clue", + placeholder=( + "E.g., 'A calm, 40-year-old male speaker with a deep voice, " + "delivering a professional and thoughtful statement.'" + ), + lines=3, + ) + + gr.Markdown("### Reference Files") + with gr.Row(): + vocal_input = gr.Audio( + label="Reference Audio (WAV — same character)", + type="filepath", + ) + video_input = gr.Video( + label="Reference Video (MP4 — scene to dub)", + ) + + gr.Markdown("### Speaker Metadata") + with gr.Row(): + gender_input = gr.Radio( + ["male", "female"], + value="male", + label="Gender", + ) + age_input = gr.Dropdown( + choices=["child", "teenager", "adult", "middle-aged", "elderly"], + value="adult", + label="Age Group", + ) + type_input = gr.Dropdown( + choices=["monologue", "dialogue", "narration", "multi-speaker"], + value="monologue", + label="Scene Type", + ) + duration_input = gr.Slider( + minimum=1.0, maximum=30.0, value=5.0, step=0.5, + label="Approximate Speech Duration (seconds)", + info="Estimate of the expected audio length", + ) + + generate_btn = gr.Button("Generate Speech", elem_id="generate-btn", variant="primary") + + # ── RIGHT: outputs ─────────────────────────────────────────────────── + with gr.Column(scale=2): + gr.Markdown("### Generated Output") + audio_output = gr.Audio( + label="Generated Speech", + type="filepath", + interactive=False, + ) + video_output = gr.Video( + label="Dubbed Video (if video was provided)", + interactive=False, + ) + status_output = gr.Textbox( + label="Status", + elem_id="status-box", + interactive=False, + ) + + # ── examples ───────────────────────────────────────────────────────────── + def get_example_path(rel_path): + p = os.path.join(EXPS_DIR, rel_path) + return p if os.path.exists(p) else None + + examples = [ + [ + "Every closet on a Carnival cruise ship. To make the numbers work, I needed a lot of cedar, fast and cheap.", + "A single middle-aged male speaker describes a business or construction requirement with a practical and matter-of-fact tone. His voice is deep and slightly gravelly, maintaining a professional and informative demeanor.", + get_example_path("data/clipped/en_monologue_1.wav"), + get_example_path("data/clipped/en_monologue_1.mp4"), + "male", + "middle-aged", + "monologue", + 5.74, + ], + [ + "Oh my God. Do you remember that bottle of wine we put aside the night Haley was born?", + "An adult female speaker expresses a sense of sudden realization and excitement. Her tone is bright and nostalgic as she recalls a significant event from the past. The overall emotion is one of pleasant surprise and anticipation.", + get_example_path("data/clipped/en_monologue_2.wav"), + get_example_path("data/clipped/en_monologue_2.mp4"), + "female", + "adult", + "monologue", + 4.06, + ], + [ + "I was just letting you know that if you were having any problems, you could come to me with them.", + "A single adult female speaker delivers a supportive and reassuring message. Her tone is friendly and caring, with a hint of advice, and her emotions fluctuate greatly. She speaks clearly and at a moderate pace, offering assistance to the listener.", + get_example_path("data/clipped/en_monologue_3.wav"), + get_example_path("data/clipped/en_monologue_3.mp4"), + "female", + "adult", + "monologue", + 4.94, + ], + ] + + # Filter out examples where files are missing + valid_examples = [ex for ex in examples if ex[2] and ex[3]] + + if valid_examples: + gr.Examples( + examples=valid_examples, + inputs=[text_input, clue_input, vocal_input, video_input, + gender_input, age_input, type_input, duration_input], + label="📂 Try an example from the demo dataset", + ) + + generate_btn.click( + fn=infer_gradio, + inputs=[ + text_input, clue_input, vocal_input, video_input, + gender_input, age_input, type_input, duration_input, + ], + outputs=[audio_output, video_output, status_output], + ) + + +if __name__ == "__main__": + demo.launch( + server_name="0.0.0.0", + server_port=7860, + share=True, + inbrowser=False, + theme=DEMO_THEME, + css=CUSTOM_CSS, + ) diff --git a/requirements.txt b/requirements.txt index 1567847..f1831e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -55,9 +55,9 @@ simplejson soundfile==0.13.1 starlette==0.47.2 tensorboardX==2.6.4 -torch==2.4.1 -torchaudio==2.4.1 -torchvision==0.19.1 +torch==2.9.1 +torchaudio==2.9.1 +torchvision==0.24.1 tqdm==4.67.1 transformers==4.57.0 x_transformers==2.16.2