From 3b5823f1e1453473049c47533e57f3ce205ea598 Mon Sep 17 00:00:00 2001 From: brunombo Date: Tue, 20 Jan 2026 12:57:57 +0100 Subject: [PATCH 1/9] =?UTF-8?q?Cr=C3=A9=C3=A9=20=C3=A0=20l'aide=20de=20Col?= =?UTF-8?q?ab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- long_TTS_xtts_v2.ipynb | 1077 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1077 insertions(+) create mode 100644 long_TTS_xtts_v2.ipynb diff --git a/long_TTS_xtts_v2.ipynb b/long_TTS_xtts_v2.ipynb new file mode 100644 index 000000000000..bf33b24e9584 --- /dev/null +++ b/long_TTS_xtts_v2.ipynb @@ -0,0 +1,1077 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4", + "authorship_tag": "ABX9TyMCKUnQgq5oD3rCdcOV+4wa", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "s8NfbT3sw2-z" + } + }, + { + "cell_type": "code", + "source": [ + "text_to_speech_to_synthetise= \"Ce document présente les Manifold-Constrained Hyper-Connections (mHC), une architecture novatrice conçue par DeepSeek-AI pour stabiliser l'entraînement des grands modèles de langage. Bien que les Hyper-Connections (HC) classiques améliorent les performances en élargissant le flux résiduel, leur nature non contrainte provoque souvent une instabilité numérique et des problèmes de divergence du signal. Pour remédier à cela, les auteurs utilisent l'algorithme de Sinkhorn-Knopp afin de projeter les connexions sur une variété de matrices doublement stochastiques, préservant ainsi la propriété de mappage d'identité. Cette approche garantit une propagation saine du signal tout en optimisant l'efficacité matérielle grâce à la fusion de noyaux et à des stratégies de mémorisation sélective. Les résultats expérimentaux démontrent que mHC surpasse les méthodes existantes en termes de scalabilité et de capacités de raisonnement sur divers tests de référence. En intégrant ces contraintes géométriques rigoureuses, le cadre mHC offre une solution robuste pour l'évolution des architectures neuronales à grande échelle.\"\n", + "\n", + "voice_gender = 'female_fr'\n", + "# ['female_fr', 'male_fr']" + ], + "metadata": { + "id": "XYDOUW523oJP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jIKtDA5hweJP" + }, + "outputs": [], + "source": [ + "# Installation des dépendances\n", + "!pip install -q scipy noisereduce\n", + "!pip install -q numpy==2.0.2\n", + "\n", + "# Installation du fork maintenu (supporte Python 3.12+)\n", + "!pip install -q coqui-tts\n", + "!pip install -q torchcodec" + ] + }, + { + "cell_type": "code", + "source": [ + "! 
pip install torchcodec" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nXsHJUGXH7_4", + "outputId": "da0c3b45-bc9a-490a-8597-bbcf7e2eaf25" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: torchcodec in /usr/local/lib/python3.12/dist-packages (0.9.1)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# -*- coding: utf-8 -*-\n", + "\"\"\"\n", + "TTS XTTS v2 - Version Long Audio (> 1 heure)\n", + "=============================================\n", + "\n", + "Module de synthèse vocale haute qualité utilisant Coqui XTTS v2.\n", + "Optimisé pour la génération d'audio longs avec:\n", + "- enable_text_splitting=True pour découpage automatique\n", + "- Chunking intelligent par paragraphes pour textes très longs\n", + "- Concaténation audio avec crossfade\n", + "- Barre de progression et estimation temps restant\n", + "- Gestion mémoire optimisée\n", + "- Correction du bug d'argument 'language' sur l'API synthesizer\n", + "\n", + "Auteur: Bruno\n", + "Date: Janvier 2025\n", + "Correction: Gemini\n", + "\"\"\"\n", + "\n", + "# ==============================================================================\n", + "# IMPORTS\n", + "# ==============================================================================\n", + "\n", + "from __future__ import annotations\n", + "\n", + "import os\n", + "import re\n", + "import gc\n", + "import wave\n", + "import time\n", + "import hashlib\n", + "import warnings\n", + "from pathlib import Path\n", + "from typing import Optional, Union, List, Callable\n", + "from dataclasses import dataclass\n", + "from enum import Enum\n", + "\n", + "import numpy as np\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", + "\n", + "# ==============================================================================\n", + "# INSTALLATION (Colab)\n", + "# ==============================================================================\n", + "\n", + "def install_dependencies():\n", + " \"\"\"Installe les dépendances si nécessaire (Colab).\"\"\"\n", + " import subprocess\n", + " import sys\n", + "\n", + " packages = [\n", + " (\"scipy\", \"scipy\"),\n", + " (\"noisereduce\", \"noisereduce\"),\n", + " (\"TTS\", \"coqui-tts\"),\n", + " ]\n", + "\n", + " for module, package in packages:\n", + " try:\n", + " __import__(module)\n", + " except ImportError:\n", + " print(f\"📦 Installation de {package}...\")\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", package])\n", + "\n", + " # numpy compatible\n", + " try:\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"numpy==2.0.2\"])\n", + " except:\n", + " pass\n", + "\n", + "# ==============================================================================\n", + "# CONFIGURATION\n", + "# ==============================================================================\n", + "\n", + "@dataclass\n", + "class TTSConfig:\n", + " \"\"\"Configuration globale du module TTS.\"\"\"\n", + " MODEL_NAME: str = \"tts_models/multilingual/multi-dataset/xtts_v2\"\n", + " SAMPLE_RATE: int = 24000\n", + " DEFAULT_LANGUAGE: str = \"fr\"\n", + " GDRIVE_FOLDER: str = \"/content/drive/MyDrive/TTS_Output\"\n", + "\n", + " # Configuration pour audio longs\n", + " MAX_CHARS_PER_CHUNK: int = 500 # Caractères max par chunk pour textes très longs\n", + " CROSSFADE_DURATION: float = 0.05 # Durée du crossfade en secondes\n", + " 
ENABLE_TEXT_SPLITTING: bool = True # Activer le split natif XTTS\n", + "\n", + " PRESET_VOICES: dict = None\n", + "\n", + " def __post_init__(self):\n", + " self.PRESET_VOICES = {\n", + " \"female_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/female.wav\",\n", + " \"male_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/male.wav\",\n", + " }\n", + "\n", + "Config = TTSConfig()\n", + "\n", + "# ==============================================================================\n", + "# DEVICE MANAGEMENT\n", + "# ==============================================================================\n", + "\n", + "_device = None\n", + "_device_name = \"cpu\"\n", + "\n", + "def detect_device():\n", + " \"\"\"Détecte le meilleur device disponible.\"\"\"\n", + " global _device, _device_name\n", + " import torch\n", + "\n", + " # Essayer TPU\n", + " try:\n", + " import torch_xla.core.xla_model as xm\n", + " _device = xm.xla_device()\n", + " _device_name = \"tpu\"\n", + " print(f\"⚙️ Device: TPU\")\n", + " return\n", + " except:\n", + " pass\n", + "\n", + " # Essayer CUDA\n", + " if torch.cuda.is_available():\n", + " _device = torch.device(\"cuda\")\n", + " _device_name = f\"cuda ({torch.cuda.get_device_name(0)})\"\n", + " print(f\"⚙️ Device: {_device_name}\")\n", + " return\n", + "\n", + " # Fallback CPU\n", + " _device = torch.device(\"cpu\")\n", + " _device_name = \"cpu\"\n", + " print(f\"⚙️ Device: CPU\")\n", + "\n", + "# ==============================================================================\n", + "# TEXT SPLITTING UTILITIES\n", + "# ==============================================================================\n", + "\n", + "class TextSplitter:\n", + " \"\"\"\n", + " Utilitaire pour découper intelligemment les textes longs.\n", + " Préserve la cohérence des phrases et paragraphes.\n", + " \"\"\"\n", + "\n", + " @staticmethod\n", + " def estimate_audio_duration(text: str, chars_per_second: float = 15.0) -> float:\n", + " \"\"\"\n", + " Estime la durée audio pour un texte donné.\n", + " \"\"\"\n", + " return len(text) / chars_per_second\n", + "\n", + " @staticmethod\n", + " def split_into_sentences(text: str) -> List[str]:\n", + " \"\"\"Découpe le texte en phrases.\"\"\"\n", + " # Pattern pour fin de phrase\n", + " pattern = r'(?<=[.!?])\\s+'\n", + " sentences = re.split(pattern, text)\n", + " return [s.strip() for s in sentences if s.strip()]\n", + "\n", + " @staticmethod\n", + " def split_into_paragraphs(text: str) -> List[str]:\n", + " \"\"\"Découpe le texte en paragraphes.\"\"\"\n", + " paragraphs = re.split(r'\\n\\s*\\n', text)\n", + " return [p.strip() for p in paragraphs if p.strip()]\n", + "\n", + " @classmethod\n", + " def split_for_long_audio(\n", + " cls,\n", + " text: str,\n", + " max_chars: int = 500,\n", + " preserve_sentences: bool = True\n", + " ) -> List[str]:\n", + " \"\"\"\n", + " Découpe un texte long en chunks optimaux pour la synthèse.\n", + " \"\"\"\n", + " # Si texte court, retourner tel quel\n", + " if len(text) <= max_chars:\n", + " return [text]\n", + "\n", + " chunks = []\n", + "\n", + " if preserve_sentences:\n", + " sentences = cls.split_into_sentences(text)\n", + " current_chunk = \"\"\n", + "\n", + " for sentence in sentences:\n", + " # Si la phrase seule dépasse max_chars, la découper\n", + " if len(sentence) > max_chars:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = \"\"\n", + " # Découper la phrase longue par mots\n", + " words = sentence.split()\n", + " sub_chunk = 
\"\"\n", + " for word in words:\n", + " if len(sub_chunk) + len(word) + 1 <= max_chars:\n", + " sub_chunk += \" \" + word if sub_chunk else word\n", + " else:\n", + " if sub_chunk:\n", + " chunks.append(sub_chunk.strip())\n", + " sub_chunk = word\n", + " if sub_chunk:\n", + " current_chunk = sub_chunk\n", + " elif len(current_chunk) + len(sentence) + 1 <= max_chars:\n", + " current_chunk += \" \" + sentence if current_chunk else sentence\n", + " else:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = sentence\n", + "\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " else:\n", + " # Découpage simple par caractères\n", + " for i in range(0, len(text), max_chars):\n", + " chunks.append(text[i:i + max_chars])\n", + "\n", + " return chunks\n", + "\n", + "\n", + "# ==============================================================================\n", + "# AUDIO PROCESSING\n", + "# ==============================================================================\n", + "\n", + "class AudioProcessor:\n", + " \"\"\"Processeur audio pour post-traitement et concaténation.\"\"\"\n", + "\n", + " @staticmethod\n", + " def normalize(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:\n", + " \"\"\"Normalise l'audio au niveau cible.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target_linear = 10 ** (target_db / 20)\n", + " audio = audio * (target_linear / peak)\n", + "\n", + " return np.clip(audio, -1.0, 1.0)\n", + "\n", + " @staticmethod\n", + " def crossfade(\n", + " audio1: np.ndarray,\n", + " audio2: np.ndarray,\n", + " sample_rate: int,\n", + " duration: float = 0.05\n", + " ) -> np.ndarray:\n", + " \"\"\"\n", + " Concatène deux segments audio avec crossfade.\n", + " \"\"\"\n", + " # Convertir en float si nécessaire\n", + " if audio1.dtype == np.int16:\n", + " audio1 = audio1.astype(np.float32) / 32768.0\n", + " if audio2.dtype == np.int16:\n", + " audio2 = audio2.astype(np.float32) / 32768.0\n", + "\n", + " fade_samples = int(sample_rate * duration)\n", + "\n", + " # Si audio trop court pour crossfade, concaténer simplement\n", + " if len(audio1) < fade_samples or len(audio2) < fade_samples:\n", + " return np.concatenate([audio1, audio2])\n", + "\n", + " # Créer les courbes de fade\n", + " fade_out = np.linspace(1.0, 0.0, fade_samples)\n", + " fade_in = np.linspace(0.0, 1.0, fade_samples)\n", + "\n", + " # Appliquer le crossfade\n", + " audio1_end = audio1[-fade_samples:] * fade_out\n", + " audio2_start = audio2[:fade_samples] * fade_in\n", + "\n", + " # Assembler\n", + " result = np.concatenate([\n", + " audio1[:-fade_samples],\n", + " audio1_end + audio2_start,\n", + " audio2[fade_samples:]\n", + " ])\n", + "\n", + " return result\n", + "\n", + " @classmethod\n", + " def concatenate_chunks(\n", + " cls,\n", + " audio_chunks: List[np.ndarray],\n", + " sample_rate: int,\n", + " crossfade_duration: float = 0.05\n", + " ) -> np.ndarray:\n", + " \"\"\"\n", + " Concatène plusieurs chunks audio avec crossfade.\n", + " \"\"\"\n", + " if not audio_chunks:\n", + " return np.array([], dtype=np.float32)\n", + "\n", + " if len(audio_chunks) == 1:\n", + " audio = audio_chunks[0]\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " return audio\n", + "\n", + " result = audio_chunks[0]\n", + " if result.dtype == np.int16:\n", + " result = result.astype(np.float32) / 32768.0\n", + 
"\n", + " for chunk in audio_chunks[1:]:\n", + " result = cls.crossfade(result, chunk, sample_rate, crossfade_duration)\n", + "\n", + " return result\n", + "\n", + " @staticmethod\n", + " def enhance(\n", + " audio: np.ndarray,\n", + " sample_rate: int,\n", + " normalize: bool = True,\n", + " warmth: bool = True\n", + " ) -> np.ndarray:\n", + " \"\"\"Améliore la qualité audio.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " if warmth:\n", + " try:\n", + " from scipy import signal\n", + " nyquist = sample_rate / 2\n", + " cutoff = min(300, nyquist * 0.9) / nyquist\n", + " b, a = signal.butter(2, cutoff, btype='low')\n", + " bass = signal.filtfilt(b, a, audio)\n", + " audio = audio + 0.15 * bass\n", + " except ImportError:\n", + " pass\n", + "\n", + " if normalize:\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target = 10 ** (-3.0 / 20)\n", + " audio = audio * (target / peak)\n", + "\n", + " audio = np.clip(audio, -1.0, 1.0)\n", + " return audio\n", + "\n", + "\n", + "# ==============================================================================\n", + "# PROGRESS TRACKER\n", + "# ==============================================================================\n", + "\n", + "class ProgressTracker:\n", + " \"\"\"Suivi de progression avec estimation du temps restant.\"\"\"\n", + "\n", + " def __init__(self, total: int, description: str = \"\"):\n", + " self.total = total\n", + " self.current = 0\n", + " self.description = description\n", + " self.start_time = time.time()\n", + " self.chunk_times = []\n", + "\n", + " def update(self, chunk_duration: float = None):\n", + " \"\"\"Met à jour la progression.\"\"\"\n", + " self.current += 1\n", + " if chunk_duration:\n", + " self.chunk_times.append(chunk_duration)\n", + " self._display()\n", + "\n", + " def _display(self):\n", + " \"\"\"Affiche la barre de progression.\"\"\"\n", + " elapsed = time.time() - self.start_time\n", + " percent = (self.current / self.total) * 100\n", + "\n", + " # Estimation temps restant\n", + " if self.chunk_times:\n", + " avg_time = np.mean(self.chunk_times)\n", + " remaining = avg_time * (self.total - self.current)\n", + " eta_str = self._format_time(remaining)\n", + " else:\n", + " eta_str = \"...\"\n", + "\n", + " # Barre de progression\n", + " bar_length = 30\n", + " filled = int(bar_length * self.current / self.total)\n", + " bar = \"█\" * filled + \"░\" * (bar_length - filled)\n", + "\n", + " elapsed_str = self._format_time(elapsed)\n", + "\n", + " print(f\"\\r{self.description} [{bar}] {self.current}/{self.total} \"\n", + " f\"({percent:.1f}%) | Temps: {elapsed_str} | ETA: {eta_str}\", end=\"\")\n", + "\n", + " if self.current >= self.total:\n", + " print() # Nouvelle ligne à la fin\n", + "\n", + " @staticmethod\n", + " def _format_time(seconds: float) -> str:\n", + " \"\"\"Formate un temps en secondes en HH:MM:SS.\"\"\"\n", + " hours = int(seconds // 3600)\n", + " minutes = int((seconds % 3600) // 60)\n", + " secs = int(seconds % 60)\n", + "\n", + " if hours > 0:\n", + " return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n", + " return f\"{minutes:02d}:{secs:02d}\"\n", + "\n", + "\n", + "# ==============================================================================\n", + "# TTS ENGINE\n", + "# ==============================================================================\n", + "\n", + "_tts_model = None\n", + "_voices_cache = {}\n", + "os.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n", + "\n", + "def get_model():\n", + " \"\"\"Charge le 
modèle XTTS v2 avec cache.\"\"\"\n", + " global _tts_model\n", + "\n", + " if _tts_model is None:\n", + " print(\"🔄 Chargement du modèle XTTS v2...\")\n", + " from TTS.api import TTS\n", + "\n", + " _tts_model = TTS(Config.MODEL_NAME)\n", + "\n", + " if _device is not None and _device_name.startswith(\"cuda\"):\n", + " _tts_model = _tts_model.to(_device)\n", + "\n", + " print(\"✓ Modèle chargé\")\n", + "\n", + " return _tts_model\n", + "\n", + "\n", + "def get_voice_path(voice: str) -> str:\n", + " \"\"\"Obtient le chemin vers un fichier de voix.\"\"\"\n", + " global _voices_cache\n", + " import urllib.request\n", + "\n", + " if voice in _voices_cache:\n", + " return _voices_cache[voice]\n", + "\n", + " if os.path.isfile(voice):\n", + " _voices_cache[voice] = voice\n", + " return voice\n", + "\n", + " if voice in Config.PRESET_VOICES:\n", + " url = Config.PRESET_VOICES[voice]\n", + " path = f\"/tmp/{voice}.wav\"\n", + "\n", + " if not os.path.exists(path):\n", + " print(f\"📥 Téléchargement de la voix '{voice}'...\")\n", + " urllib.request.urlretrieve(url, path)\n", + "\n", + " _voices_cache[voice] = path\n", + " return path\n", + "\n", + " raise FileNotFoundError(f\"Voix '{voice}' non trouvée\")\n", + "\n", + "\n", + "# ==============================================================================\n", + "# MAIN SYNTHESIS FUNCTIONS\n", + "# ==============================================================================\n", + "\n", + "def synthesize_chunk(\n", + " text: str,\n", + " voice_path: str,\n", + " language: str = \"fr\",\n", + " enable_text_splitting: bool = True\n", + ") -> np.ndarray:\n", + " \"\"\"\n", + " Synthétise un chunk de texte en audio via l'inférence directe (Low-Level).\n", + " Bypass total du SpeakerManager pour éviter le bug FileNotFoundError .pth\n", + " \"\"\"\n", + " model_wrapper = get_model()\n", + "\n", + " # 1. Accès \"chirurgical\" au modèle interne XTTS\n", + " # C'est lui qui fait le travail, sans la couche de gestion de fichiers buggée\n", + " if hasattr(model_wrapper, 'synthesizer'):\n", + " xtts_model = model_wrapper.synthesizer.tts_model\n", + " else:\n", + " # Cas rare ou structure différente, on tente l'accès direct\n", + " xtts_model = model_wrapper.tts_model\n", + "\n", + " # 2. Calcul manuel des latents (Empreinte vocale)\n", + " # On transforme le fichier WAV en vecteurs mathématiques\n", + " try:\n", + " gpt_cond_latent, speaker_embedding = xtts_model.get_conditioning_latents(\n", + " audio_path=[voice_path],\n", + " gpt_cond_len=30,\n", + " max_ref_length=60\n", + " )\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur calcul latents: {e}\")\n", + " raise e\n", + "\n", + " # 3. 
Inférence directe\n", + " # On appelle la fonction de génération pure, sans passer par tts()\n", + " try:\n", + " out = xtts_model.inference(\n", + " text=text,\n", + " language=language,\n", + " gpt_cond_latent=gpt_cond_latent,\n", + " speaker_embedding=speaker_embedding,\n", + " temperature=0.7, # Paramètre standard pour la créativité\n", + " length_penalty=1.0, # Pénalité de longueur\n", + " repetition_penalty=2.0, # Évite les bégaiements\n", + " top_k=50,\n", + " top_p=0.8,\n", + " enable_text_splitting=enable_text_splitting\n", + " )\n", + "\n", + " # Le résultat est généralement dans un dictionnaire sous la clé 'wav'\n", + " if isinstance(out, dict) and 'wav' in out:\n", + " wav = out['wav']\n", + " else:\n", + " wav = out\n", + "\n", + " # S'assurer que c'est bien un numpy array sur CPU\n", + " if hasattr(wav, 'cpu'):\n", + " wav = wav.cpu().numpy()\n", + " if isinstance(wav, list):\n", + " wav = np.array(wav, dtype=np.float32)\n", + "\n", + " return wav\n", + "\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de l'inférence directe : {e}\")\n", + " raise e\n", + "\n", + "\n", + "def text_to_speech_long(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " max_chars_per_chunk: int = None,\n", + " show_progress: bool = True,\n", + " enable_text_splitting: bool = True\n", + ") -> dict:\n", + " \"\"\"\n", + " Génère un fichier audio long (> 1 heure) à partir de texte.\n", + " \"\"\"\n", + " import torch\n", + "\n", + " # Configuration\n", + " max_chars = max_chars_per_chunk or Config.MAX_CHARS_PER_CHUNK\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Estimation initiale\n", + " estimated_duration = TextSplitter.estimate_audio_duration(text)\n", + " print(f\"\\n📝 Texte: {len(text):,} caractères\")\n", + " print(f\"⏱️ Durée estimée: {ProgressTracker._format_time(estimated_duration)}\")\n", + "\n", + " # Découper le texte\n", + " chunks = TextSplitter.split_for_long_audio(text, max_chars=max_chars)\n", + " print(f\"📦 Chunks: {len(chunks)}\")\n", + "\n", + " # Initialiser la progression\n", + " progress = None\n", + " if show_progress:\n", + " progress = ProgressTracker(len(chunks), \"🎙️ Synthèse\")\n", + "\n", + " # Générer l'audio chunk par chunk\n", + " audio_chunks = []\n", + "\n", + " for i, chunk in enumerate(chunks):\n", + " chunk_start = time.time()\n", + "\n", + " try:\n", + " wav = synthesize_chunk(\n", + " text=chunk,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting\n", + " )\n", + " audio_chunks.append(wav)\n", + "\n", + " except Exception as e:\n", + " print(f\"\\n⚠️ Erreur chunk {i+1}: {e}\")\n", + " # Continuer avec les autres chunks\n", + " continue\n", + "\n", + " # Libérer la mémoire GPU périodiquement\n", + " if _device_name.startswith(\"cuda\") and (i + 1) % 10 == 0:\n", + " torch.cuda.empty_cache()\n", + "\n", + " chunk_duration = time.time() - chunk_start\n", + " if progress:\n", + " progress.update(chunk_duration)\n", + "\n", + " if not audio_chunks:\n", + " raise RuntimeError(\"Aucun audio généré\")\n", + "\n", + " print(\"\\n🔗 Concaténation des chunks...\")\n", + "\n", + " # Concaténer avec crossfade\n", + " final_audio = AudioProcessor.concatenate_chunks(\n", + " audio_chunks,\n", + " Config.SAMPLE_RATE,\n", + " Config.CROSSFADE_DURATION\n", + " )\n", + "\n", + " # Libérer les chunks de la 
mémoire\n", + " del audio_chunks\n", + " gc.collect()\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " print(\"✨ Post-traitement...\")\n", + " final_audio = AudioProcessor.enhance(\n", + " final_audio,\n", + " Config.SAMPLE_RATE,\n", + " normalize=True,\n", + " warmth=True\n", + " )\n", + " else:\n", + " final_audio = AudioProcessor.normalize(final_audio)\n", + "\n", + " # Convertir en int16\n", + " final_audio = (final_audio * 32767).astype(np.int16)\n", + "\n", + " # Générer le nom de fichier\n", + " if output_path is None:\n", + " h = hashlib.md5(text[:100].encode()).hexdigest()[:8]\n", + " output_path = f\"tts_long_{voice}_{h}.wav\"\n", + "\n", + " # Dossier de sortie\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarder\n", + " print(f\"💾 Sauvegarde: {final_path}\")\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(final_audio.tobytes())\n", + "\n", + " # Calculer la durée réelle\n", + " duration = len(final_audio) / Config.SAMPLE_RATE\n", + "\n", + " print(f\"\\n✅ Audio généré avec succès!\")\n", + " print(f\" 📁 Fichier: {final_path}\")\n", + " print(f\" ⏱️ Durée: {ProgressTracker._format_time(duration)}\")\n", + " print(f\" 📦 Chunks: {len(chunks)}\")\n", + " print(f\" 🎤 Voix: {voice}\")\n", + "\n", + " return {\n", + " 'path': str(final_path),\n", + " 'sample_rate': Config.SAMPLE_RATE,\n", + " 'duration_seconds': duration,\n", + " 'duration_formatted': ProgressTracker._format_time(duration),\n", + " 'audio_data': final_audio,\n", + " 'voice': voice,\n", + " 'language': language,\n", + " 'device': _device_name,\n", + " 'chunks_count': len(chunks),\n", + " 'text_length': len(text)\n", + " }\n", + "\n", + "\n", + "def text_to_speech(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " enable_text_splitting: bool = True\n", + ") -> dict:\n", + " \"\"\"\n", + " Génère un fichier audio à partir de texte avec XTTS v2.\n", + " \"\"\"\n", + " # Basculer automatiquement vers la version long pour textes > 10000 chars\n", + " if len(text) > 10000:\n", + " print(\"📢 Texte long détecté - utilisation de text_to_speech_long()\")\n", + " return text_to_speech_long(\n", + " text=text,\n", + " voice=voice,\n", + " language=language,\n", + " output_path=output_path,\n", + " enhance=enhance,\n", + " use_gdrive=use_gdrive,\n", + " gdrive_folder=gdrive_folder,\n", + " enable_text_splitting=enable_text_splitting\n", + " )\n", + "\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Générer l'audio avec enable_text_splitting\n", + " wav = synthesize_chunk(\n", + " text=text,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting\n", + " )\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " audio = AudioProcessor.enhance(wav, Config.SAMPLE_RATE)\n", + " else:\n", + " audio = AudioProcessor.normalize(wav)\n", + "\n", + " audio = (audio * 32767).astype(np.int16)\n", + "\n", + " # Nom de fichier\n", + 
" if output_path is None:\n", + " h = hashlib.md5(text.encode()).hexdigest()[:8]\n", + " output_path = f\"tts_{voice}_{h}.wav\"\n", + "\n", + " # Dossier de sortie\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarder\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(audio.tobytes())\n", + "\n", + " duration = len(audio) / Config.SAMPLE_RATE\n", + "\n", + " print(f\"✓ Audio généré: {final_path}\")\n", + " print(f\" Durée: {duration:.2f}s | Voix: {voice}\")\n", + "\n", + " return {\n", + " 'path': str(final_path),\n", + " 'sample_rate': Config.SAMPLE_RATE,\n", + " 'duration_seconds': duration,\n", + " 'audio_data': audio,\n", + " 'voice': voice,\n", + " 'language': language,\n", + " 'device': _device_name\n", + " }\n", + "\n", + "\n", + "# ==============================================================================\n", + "# UTILITIES\n", + "# ==============================================================================\n", + "\n", + "def preview_audio(result: dict) -> None:\n", + " \"\"\"Prévisualise l'audio dans le notebook.\"\"\"\n", + " from IPython.display import Audio, display\n", + "\n", + " audio = result['audio_data']\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " display(Audio(audio, rate=result['sample_rate']))\n", + "\n", + "\n", + "def list_voices() -> list:\n", + " \"\"\"Liste les voix disponibles.\"\"\"\n", + " return list(Config.PRESET_VOICES.keys())\n", + "\n", + "\n", + "def list_languages() -> list:\n", + " \"\"\"Liste les langues supportées.\"\"\"\n", + " return [\"en\", \"es\", \"fr\", \"de\", \"it\", \"pt\", \"pl\", \"tr\",\n", + " \"ru\", \"nl\", \"cs\", \"ar\", \"zh-cn\", \"ja\", \"hu\", \"ko\", \"hi\"]\n", + "\n", + "\n", + "def clear_cache():\n", + " \"\"\"Libère la mémoire.\"\"\"\n", + " global _tts_model\n", + " import torch\n", + "\n", + " _tts_model = None\n", + " gc.collect()\n", + "\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + "\n", + " print(\"✓ Cache vidé\")\n", + "\n", + "\n", + "def estimate_duration(text: str) -> dict:\n", + " \"\"\"\n", + " Estime la durée audio pour un texte.\n", + " \"\"\"\n", + " duration = TextSplitter.estimate_audio_duration(text)\n", + " chunks = len(TextSplitter.split_for_long_audio(text))\n", + "\n", + " return {\n", + " 'chars': len(text),\n", + " 'estimated_seconds': duration,\n", + " 'estimated_formatted': ProgressTracker._format_time(duration),\n", + " 'chunks_estimate': chunks\n", + " }\n", + "\n", + "\n", + "# ==============================================================================\n", + "# ALIASES\n", + "# ==============================================================================\n", + "\n", + "tts = text_to_speech\n", + "tts_long = text_to_speech_long\n", + "\n", + "\n", + "# ==============================================================================\n", + "# INITIALIZATION\n", + "# ==============================================================================\n", + "\n", + "def init():\n", + " \"\"\"Initialise le module.\"\"\"\n", + " detect_device()\n", + " print(\"✅ Module XTTS v2 Long Audio chargé\")\n", + " print(f\" Device: 
{_device_name}\")\n", + " print(f\" Voix: {list_voices()}\")\n", + " print(f\" enable_text_splitting: activé par défaut\")\n", + "\n", + "\n", + "# Auto-init\n", + "if __name__ != \"__main__\":\n", + " try:\n", + " detect_device()\n", + " except:\n", + " pass\n", + "\n", + "\n", + "# ==============================================================================\n", + "# EXAMPLE USAGE\n", + "# ==============================================================================\n", + "\n", + "if __name__ == \"__main__\":\n", + " # Installation si nécessaire\n", + " install_dependencies()\n", + "\n", + " # Initialisation\n", + " init()\n", + "\n", + " # Exemple avec texte court\n", + " print(\"\\n\" + \"=\"*60)\n", + " print(\"EXEMPLE 1: Texte court\")\n", + " print(\"=\"*60)\n", + "\n", + " short_text = \"\"\"\n", + " Ce document présente les Manifold-Constrained Hyper-Connections,\n", + " une architecture novatrice conçue par DeepSeek-AI pour stabiliser\n", + " l'entraînement des grands modèles de langage.\n", + " \"\"\"\n", + "\n", + " result = text_to_speech(\n", + " text=short_text.strip(),\n", + " voice=\"female_fr\",\n", + " enhance=True\n", + " )\n", + "\n", + " print(f\"\\nRésultat: {result['duration_seconds']:.2f}s\")\n", + "\n", + " # Exemple avec texte long (simulé)\n", + " print(\"\\n\" + \"=\"*60)\n", + " print(\"EXEMPLE 2: Estimation pour texte long\")\n", + " print(\"=\"*60)\n", + "\n", + " # Simuler un texte de ~1 heure (environ 54000 caractères)\n", + " long_text = short_text.strip() * 300 # ~54000 chars ≈ 1 heure\n", + "\n", + " estimation = estimate_duration(long_text)\n", + " print(f\"\\nEstimation pour {estimation['chars']:,} caractères:\")\n", + " print(f\" Durée: {estimation['estimated_formatted']}\")\n", + " print(f\" Chunks: {estimation['chunks_estimate']}\")\n", + "\n", + " # Pour générer réellement:\n", + " # result = text_to_speech_long(long_text, voice=\"female_fr\", show_progress=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1FQhKQ1IE4iX", + "outputId": "f0b2eb8b-e071-40b0-e342-1395217fe769" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "⚙️ Device: cuda (Tesla T4)\n", + "✅ Module XTTS v2 Long Audio chargé\n", + " Device: cuda (Tesla T4)\n", + " Voix: ['female_fr', 'male_fr']\n", + " enable_text_splitting: activé par défaut\n", + "\n", + "============================================================\n", + "EXEMPLE 1: Texte court\n", + "============================================================\n", + "🔄 Chargement du modèle XTTS v2...\n", + "✓ Modèle chargé\n", + "✓ Audio généré: tts_female_fr_151473ed.wav\n", + " Durée: 9.79s | Voix: female_fr\n", + "\n", + "Résultat: 9.79s\n", + "\n", + "============================================================\n", + "EXEMPLE 2: Estimation pour texte long\n", + "============================================================\n", + "\n", + "Estimation pour 55,200 caractères:\n", + " Durée: 01:01:20\n", + " Chunks: 108\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Lire le fichier\n", + "with open(\"mon_texte_long.txt\", \"r\", encoding=\"utf-8\") as f:\n", + " texte_complet = f.read()\n", + "\n", + "# Lancer la génération\n", + "text_to_speech_long(\n", + " text=texte_complet,\n", + " voice=\"female_fr\",\n", + " language=\"fr\"\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BRhVnXgSE7Yd", + "outputId": "ce14c47a-70f3-4158-954b-9f5a415724f9" + }, + 
"execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "📝 Texte: 6,715 caractères\n", + "⏱️ Durée estimée: 07:27\n", + "📦 Chunks: 16\n", + "🎙️ Synthèse [██████████████████████████████] 16/16 (100.0%) | Temps: 03:10 | ETA: 00:00\n", + "\n", + "🔗 Concaténation des chunks...\n", + "💾 Sauvegarde: tts_long_female_fr_8aba435b.wav\n", + "\n", + "✅ Audio généré avec succès!\n", + " 📁 Fichier: tts_long_female_fr_8aba435b.wav\n", + " ⏱️ Durée: 06:37\n", + " 📦 Chunks: 16\n", + " 🎤 Voix: female_fr\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'path': 'tts_long_female_fr_8aba435b.wav',\n", + " 'sample_rate': 24000,\n", + " 'duration_seconds': 397.586,\n", + " 'duration_formatted': '06:37',\n", + " 'audio_data': array([28, 21, 31, ..., 3, 7, 1], dtype=int16),\n", + " 'voice': 'female_fr',\n", + " 'language': 'fr',\n", + " 'device': 'cuda (Tesla T4)',\n", + " 'chunks_count': 16,\n", + " 'text_length': 6715}" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "KaWW0-DIMy7R" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From e34108a4bf472238481183b30284683601058e91 Mon Sep 17 00:00:00 2001 From: brunombo Date: Tue, 20 Jan 2026 13:00:50 +0100 Subject: [PATCH 2/9] Delete long_TTS_xtts_v2.ipynb --- long_TTS_xtts_v2.ipynb | 1077 ---------------------------------------- 1 file changed, 1077 deletions(-) delete mode 100644 long_TTS_xtts_v2.ipynb diff --git a/long_TTS_xtts_v2.ipynb b/long_TTS_xtts_v2.ipynb deleted file mode 100644 index bf33b24e9584..000000000000 --- a/long_TTS_xtts_v2.ipynb +++ /dev/null @@ -1,1077 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "gpuType": "T4", - "authorship_tag": "ABX9TyMCKUnQgq5oD3rCdcOV+4wa", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [], - "metadata": { - "id": "s8NfbT3sw2-z" - } - }, - { - "cell_type": "code", - "source": [ - "text_to_speech_to_synthetise= \"Ce document présente les Manifold-Constrained Hyper-Connections (mHC), une architecture novatrice conçue par DeepSeek-AI pour stabiliser l'entraînement des grands modèles de langage. Bien que les Hyper-Connections (HC) classiques améliorent les performances en élargissant le flux résiduel, leur nature non contrainte provoque souvent une instabilité numérique et des problèmes de divergence du signal. Pour remédier à cela, les auteurs utilisent l'algorithme de Sinkhorn-Knopp afin de projeter les connexions sur une variété de matrices doublement stochastiques, préservant ainsi la propriété de mappage d'identité. Cette approche garantit une propagation saine du signal tout en optimisant l'efficacité matérielle grâce à la fusion de noyaux et à des stratégies de mémorisation sélective. Les résultats expérimentaux démontrent que mHC surpasse les méthodes existantes en termes de scalabilité et de capacités de raisonnement sur divers tests de référence. 
" if output_path is None:\n", - " h = hashlib.md5(text.encode()).hexdigest()[:8]\n", - " output_path = f\"tts_{voice}_{h}.wav\"\n", - "\n", - " # Dossier de sortie\n", - " if use_gdrive:\n", - " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", - " folder.mkdir(parents=True, exist_ok=True)\n", - " final_path = folder / Path(output_path).name\n", - " else:\n", - " final_path = Path(output_path)\n", - "\n", - " # Sauvegarder\n", - " with wave.open(str(final_path), \"wb\") as wav_file:\n", - " wav_file.setnchannels(1)\n", - " wav_file.setsampwidth(2)\n", - " wav_file.setframerate(Config.SAMPLE_RATE)\n", - " wav_file.writeframes(audio.tobytes())\n", - "\n", - " duration = len(audio) / Config.SAMPLE_RATE\n", - "\n", - " print(f\"✓ Audio généré: {final_path}\")\n", - " print(f\" Durée: {duration:.2f}s | Voix: {voice}\")\n", - "\n", - " return {\n", - " 'path': str(final_path),\n", - " 'sample_rate': Config.SAMPLE_RATE,\n", - " 'duration_seconds': duration,\n", - " 'audio_data': audio,\n", - " 'voice': voice,\n", - " 'language': language,\n", - " 'device': _device_name\n", - " }\n", - "\n", - "\n", - "# ==============================================================================\n", - "# UTILITIES\n", - "# ==============================================================================\n", - "\n", - "def preview_audio(result: dict) -> None:\n", - " \"\"\"Prévisualise l'audio dans le notebook.\"\"\"\n", - " from IPython.display import Audio, display\n", - "\n", - " audio = result['audio_data']\n", - " if audio.dtype == np.int16:\n", - " audio = audio.astype(np.float32) / 32768.0\n", - "\n", - " display(Audio(audio, rate=result['sample_rate']))\n", - "\n", - "\n", - "def list_voices() -> list:\n", - " \"\"\"Liste les voix disponibles.\"\"\"\n", - " return list(Config.PRESET_VOICES.keys())\n", - "\n", - "\n", - "def list_languages() -> list:\n", - " \"\"\"Liste les langues supportées.\"\"\"\n", - " return [\"en\", \"es\", \"fr\", \"de\", \"it\", \"pt\", \"pl\", \"tr\",\n", - " \"ru\", \"nl\", \"cs\", \"ar\", \"zh-cn\", \"ja\", \"hu\", \"ko\", \"hi\"]\n", - "\n", - "\n", - "def clear_cache():\n", - " \"\"\"Libère la mémoire.\"\"\"\n", - " global _tts_model\n", - " import torch\n", - "\n", - " _tts_model = None\n", - " gc.collect()\n", - "\n", - " if _device_name.startswith(\"cuda\"):\n", - " torch.cuda.empty_cache()\n", - "\n", - " print(\"✓ Cache vidé\")\n", - "\n", - "\n", - "def estimate_duration(text: str) -> dict:\n", - " \"\"\"\n", - " Estime la durée audio pour un texte.\n", - " \"\"\"\n", - " duration = TextSplitter.estimate_audio_duration(text)\n", - " chunks = len(TextSplitter.split_for_long_audio(text))\n", - "\n", - " return {\n", - " 'chars': len(text),\n", - " 'estimated_seconds': duration,\n", - " 'estimated_formatted': ProgressTracker._format_time(duration),\n", - " 'chunks_estimate': chunks\n", - " }\n", - "\n", - "\n", - "# ==============================================================================\n", - "# ALIASES\n", - "# ==============================================================================\n", - "\n", - "tts = text_to_speech\n", - "tts_long = text_to_speech_long\n", - "\n", - "\n", - "# ==============================================================================\n", - "# INITIALIZATION\n", - "# ==============================================================================\n", - "\n", - "def init():\n", - " \"\"\"Initialise le module.\"\"\"\n", - " detect_device()\n", - " print(\"✅ Module XTTS v2 Long Audio chargé\")\n", - " print(f\" Device: 
{_device_name}\")\n", - " print(f\" Voix: {list_voices()}\")\n", - " print(f\" enable_text_splitting: activé par défaut\")\n", - "\n", - "\n", - "# Auto-init\n", - "if __name__ != \"__main__\":\n", - " try:\n", - " detect_device()\n", - " except:\n", - " pass\n", - "\n", - "\n", - "# ==============================================================================\n", - "# EXAMPLE USAGE\n", - "# ==============================================================================\n", - "\n", - "if __name__ == \"__main__\":\n", - " # Installation si nécessaire\n", - " install_dependencies()\n", - "\n", - " # Initialisation\n", - " init()\n", - "\n", - " # Exemple avec texte court\n", - " print(\"\\n\" + \"=\"*60)\n", - " print(\"EXEMPLE 1: Texte court\")\n", - " print(\"=\"*60)\n", - "\n", - " short_text = \"\"\"\n", - " Ce document présente les Manifold-Constrained Hyper-Connections,\n", - " une architecture novatrice conçue par DeepSeek-AI pour stabiliser\n", - " l'entraînement des grands modèles de langage.\n", - " \"\"\"\n", - "\n", - " result = text_to_speech(\n", - " text=short_text.strip(),\n", - " voice=\"female_fr\",\n", - " enhance=True\n", - " )\n", - "\n", - " print(f\"\\nRésultat: {result['duration_seconds']:.2f}s\")\n", - "\n", - " # Exemple avec texte long (simulé)\n", - " print(\"\\n\" + \"=\"*60)\n", - " print(\"EXEMPLE 2: Estimation pour texte long\")\n", - " print(\"=\"*60)\n", - "\n", - " # Simuler un texte de ~1 heure (environ 54000 caractères)\n", - " long_text = short_text.strip() * 300 # ~54000 chars ≈ 1 heure\n", - "\n", - " estimation = estimate_duration(long_text)\n", - " print(f\"\\nEstimation pour {estimation['chars']:,} caractères:\")\n", - " print(f\" Durée: {estimation['estimated_formatted']}\")\n", - " print(f\" Chunks: {estimation['chunks_estimate']}\")\n", - "\n", - " # Pour générer réellement:\n", - " # result = text_to_speech_long(long_text, voice=\"female_fr\", show_progress=True)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1FQhKQ1IE4iX", - "outputId": "f0b2eb8b-e071-40b0-e342-1395217fe769" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "⚙️ Device: cuda (Tesla T4)\n", - "✅ Module XTTS v2 Long Audio chargé\n", - " Device: cuda (Tesla T4)\n", - " Voix: ['female_fr', 'male_fr']\n", - " enable_text_splitting: activé par défaut\n", - "\n", - "============================================================\n", - "EXEMPLE 1: Texte court\n", - "============================================================\n", - "🔄 Chargement du modèle XTTS v2...\n", - "✓ Modèle chargé\n", - "✓ Audio généré: tts_female_fr_151473ed.wav\n", - " Durée: 9.79s | Voix: female_fr\n", - "\n", - "Résultat: 9.79s\n", - "\n", - "============================================================\n", - "EXEMPLE 2: Estimation pour texte long\n", - "============================================================\n", - "\n", - "Estimation pour 55,200 caractères:\n", - " Durée: 01:01:20\n", - " Chunks: 108\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "# Lire le fichier\n", - "with open(\"mon_texte_long.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " texte_complet = f.read()\n", - "\n", - "# Lancer la génération\n", - "text_to_speech_long(\n", - " text=texte_complet,\n", - " voice=\"female_fr\",\n", - " language=\"fr\"\n", - ")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "BRhVnXgSE7Yd", - "outputId": "ce14c47a-70f3-4158-954b-9f5a415724f9" - }, - 
"execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "📝 Texte: 6,715 caractères\n", - "⏱️ Durée estimée: 07:27\n", - "📦 Chunks: 16\n", - "🎙️ Synthèse [██████████████████████████████] 16/16 (100.0%) | Temps: 03:10 | ETA: 00:00\n", - "\n", - "🔗 Concaténation des chunks...\n", - "💾 Sauvegarde: tts_long_female_fr_8aba435b.wav\n", - "\n", - "✅ Audio généré avec succès!\n", - " 📁 Fichier: tts_long_female_fr_8aba435b.wav\n", - " ⏱️ Durée: 06:37\n", - " 📦 Chunks: 16\n", - " 🎤 Voix: female_fr\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'path': 'tts_long_female_fr_8aba435b.wav',\n", - " 'sample_rate': 24000,\n", - " 'duration_seconds': 397.586,\n", - " 'duration_formatted': '06:37',\n", - " 'audio_data': array([28, 21, 31, ..., 3, 7, 1], dtype=int16),\n", - " 'voice': 'female_fr',\n", - " 'language': 'fr',\n", - " 'device': 'cuda (Tesla T4)',\n", - " 'chunks_count': 16,\n", - " 'text_length': 6715}" - ] - }, - "metadata": {}, - "execution_count": 8 - } - ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "KaWW0-DIMy7R" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file From 83b59d147d5a498b77f149672ed17341fb1b7399 Mon Sep 17 00:00:00 2001 From: brunombo Date: Thu, 29 Jan 2026 08:36:19 +0100 Subject: [PATCH 3/9] =?UTF-8?q?Cr=C3=A9=C3=A9=20=C3=A0=20l'aide=20de=20Col?= =?UTF-8?q?ab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- long_TTS_xtts_v3.ipynb | 1085 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1085 insertions(+) create mode 100644 long_TTS_xtts_v3.ipynb diff --git a/long_TTS_xtts_v3.ipynb b/long_TTS_xtts_v3.ipynb new file mode 100644 index 000000000000..75f8cbcc8a44 --- /dev/null +++ b/long_TTS_xtts_v3.ipynb @@ -0,0 +1,1085 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4", + "authorship_tag": "ABX9TyPEqfSBvulICbte1mdWmCvM", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "s8NfbT3sw2-z" + } + }, + { + "cell_type": "code", + "source": [ + "PROMPT = \"Ce document présente les Manifold-Constrained Hyper-Connections (mHC), une architecture novatrice conçue par DeepSeek-AI pour stabiliser l'entraînement des grands modèles de langage. Bien que les Hyper-Connections (HC) classiques améliorent les performances en élargissant le flux résiduel, leur nature non contrainte provoque souvent une instabilité numérique et des problèmes de divergence du signal. Pour remédier à cela, les auteurs utilisent l'algorithme de Sinkhorn-Knopp afin de projeter les connexions sur une variété de matrices doublement stochastiques, préservant ainsi la propriété de mappage d'identité. Cette approche garantit une propagation saine du signal tout en optimisant l'efficacité matérielle grâce à la fusion de noyaux et à des stratégies de mémorisation sélective. Les résultats expérimentaux démontrent que mHC surpasse les méthodes existantes en termes de scalabilité et de capacités de raisonnement sur divers tests de référence. 
En intégrant ces contraintes géométriques rigoureuses, le cadre mHC offre une solution robuste pour l'évolution des architectures neuronales à grande échelle.\"\n", + "\n", + "voice_gender = 'female_fr'\n", + "# ['female_fr', 'male_fr']" + ], + "metadata": { + "id": "XYDOUW523oJP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jIKtDA5hweJP", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "acd264d8-f686-43ec-98f2-d8c267422276" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m862.8/862.8 kB\u001b[0m \u001b[31m28.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m345.1/345.1 kB\u001b[0m \u001b[31m31.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.2/56.2 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m997.3/997.3 kB\u001b[0m \u001b[31m59.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m648.4/648.4 kB\u001b[0m \u001b[31m53.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.5/163.5 kB\u001b[0m \u001b[31m16.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for docopt (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m41.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "# Installation des dépendances\n", + "!pip install -q scipy noisereduce\n", + "\n", + "# Installation du fork maintenu (supporte Python 3.12+)\n", + "!pip install -q coqui-tts\n", + "!pip install -q torchcodec\n", + "\n", + "\n", + "# Installation des dépendances\n", + "!pip install -q scipy noisereduce\n", + "!pip install -q numpy==2.0.2\n", + "\n", + "# Installation de soundfile pour le chargement audio (évite le bug torchcodec)\n", + "!pip install -q soundfile\n", + "\n", + "# Installation du fork maintenu (supporte Python 3.12+)\n", + "!pip install -q coqui-tts\n", + "\n", + "# Note: torchcodec n'est plus nécessaire - on utilise soundfile comme backend" + ] + }, + { + "cell_type": "code", + "source": [ + "# -*- coding: utf-8 -*-\n", + "\"\"\"\n", + "TTS XTTS v2 - Version Long Audio (> 1 heure)\n", + "=============================================\n", + "\n", + "Module de synthèse vocale haute qualité utilisant Coqui XTTS v2.\n", + "Optimisé pour la génération d'audio longs avec:\n", + "- enable_text_splitting=True pour découpage automatique\n", + "- Chunking intelligent par paragraphes pour textes très longs\n", + "- Concaténation audio avec crossfade\n", + "- Barre de progression et estimation temps restant\n", + "- Gestion mémoire optimisée\n", + "- Correction du bug d'argument 'language' sur l'API synthesizer\n", + "\n", + "Auteur: Bruno\n", + "Date: Janvier 2025\n", + "Correction: Gemini\n", + "\"\"\"\n", + "\n", + "# ==============================================================================\n", + "# IMPORTS\n", + "# ==============================================================================\n", + "\n", + "from __future__ import annotations\n", + "\n", + "import os\n", + "import re\n", + "import gc\n", + "import wave\n", + "import time\n", + "import hashlib\n", + "import warnings\n", + "from pathlib import Path\n", + "from typing import Optional, Union, List, Callable\n", + "from dataclasses import dataclass\n", + "from enum import Enum\n", + "\n", + "import numpy as np\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", + "\n", + "# ==============================================================================\n", + "# INSTALLATION (Colab)\n", + "# ==============================================================================\n", + "\n", + "def install_dependencies():\n", + " \"\"\"Installe les dépendances si nécessaire (Colab).\"\"\"\n", + " import subprocess\n", + " import sys\n", + "\n", + " # Installer FFmpeg pour torchcodec\n", + " try:\n", + " print(\"📦 Installation de FFmpeg...\")\n", + " subprocess.check_call([\"apt-get\", \"update\", \"-qq\"])\n", + " subprocess.check_call([\"apt-get\", \"install\", \"-qq\", \"ffmpeg\"])\n", + " print(\"✓ FFmpeg installé\")\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de l'installation de FFmpeg: {e}\")\n", + "\n", + " packages = [\n", + " (\"scipy\", \"scipy\"),\n", + " (\"noisereduce\", \"noisereduce\"),\n", + " (\"TTS\", \"coqui-tts\"),\n", + " ]\n", + "\n", + " for module, package in packages:\n", + " try:\n", + " __import__(module)\n", + " except ImportError:\n", + " print(f\"📦 Installation de {package}...\")\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", package])\n", + "\n", + " # numpy compatible\n", + " # 
The previous attempt to install a specific numpy version was causing compatibility issues.\n", + "    # Removing this line to allow torchcodec and other libraries to install a compatible numpy version.\n", + "    # try:\n", + "    #     subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"numpy==2.0.2\"])\n", + "    # except:\n", + "    #     pass\n", + "\n", + "# ==============================================================================\n", + "# CONFIGURATION\n", + "# ==============================================================================\n", + "\n", + "@dataclass\n", + "class TTSConfig:\n", + "    \"\"\"Configuration globale du module TTS.\"\"\"\n", + "    MODEL_NAME: str = \"tts_models/multilingual/multi-dataset/xtts_v2\"\n", + "    SAMPLE_RATE: int = 24000\n", + "    DEFAULT_LANGUAGE: str = \"fr\"\n", + "    GDRIVE_FOLDER: str = \"/content/drive/MyDrive/TTS_Output\"\n", + "\n", + "    # Configuration pour audio longs\n", + "    MAX_CHARS_PER_CHUNK: int = 500 # Caractères max par chunk pour textes très longs\n", + "    CROSSFADE_DURATION: float = 0.05 # Durée du crossfade en secondes\n", + "    ENABLE_TEXT_SPLITTING: bool = True # Activer le split natif XTTS\n", + "\n", + "    PRESET_VOICES: dict = None\n", + "\n", + "    def __post_init__(self):\n", + "        self.PRESET_VOICES = {\n", + "            \"female_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/female.wav\",\n", + "            \"male_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/male.wav\",\n", + "        }\n", + "\n", + "Config = TTSConfig()\n", + "\n", + "# ==============================================================================\n", + "# DEVICE MANAGEMENT\n", + "# ==============================================================================\n", + "\n", + "_device = None\n", + "_device_name = \"cpu\"\n", + "\n", + "def detect_device():\n", + "    \"\"\"Détecte le meilleur device disponible.\"\"\"\n", + "    global _device, _device_name\n", + "    import torch\n", + "\n", + "    # Essayer TPU\n", + "    try:\n", + "        import torch_xla.core.xla_model as xm\n", + "        _device = xm.xla_device()\n", + "        _device_name = \"tpu\"\n", + "        print(f\"⚙️ Device: TPU\")\n", + "        return\n", + "    except:\n", + "        pass\n", + "\n", + "    # Essayer CUDA\n", + "    if torch.cuda.is_available():\n", + "        _device = torch.device(\"cuda\")\n", + "        _device_name = f\"cuda ({torch.cuda.get_device_name(0)})\"\n", + "        print(f\"⚙️ Device: {_device_name}\")\n", + "        return\n", + "\n", + "    # Fallback CPU\n", + "    _device = torch.device(\"cpu\")\n", + "    _device_name = \"cpu\"\n", + "    print(f\"⚙️ Device: CPU\")\n", + "\n", + "# ==============================================================================\n", + "# TEXT SPLITTING UTILITIES\n", + "# ==============================================================================\n", + "\n", + "class TextSplitter:\n", + "    \"\"\"\n", + "    Utilitaire pour découper intelligemment les textes longs.\n", + "    Préserve la cohérence des phrases et paragraphes.\n", + "    \"\"\"\n", + "\n", + "    @staticmethod\n", + "    def estimate_audio_duration(text: str, chars_per_second: float = 15.0) -> float:\n", + "        \"\"\"\n", + "        Estime la durée audio pour un texte donné.\n", + "        \"\"\"\n", + "        return len(text) / chars_per_second\n", + "\n", + "    @staticmethod\n", + "    def split_into_sentences(text: str) -> List[str]:\n", + "        \"\"\"Découpe le texte en phrases.\"\"\"\n", + "        # Pattern pour fin de phrase\n", + "        pattern = r'(?<=[.!?])\\s+'\n", + "        sentences = re.split(pattern, text)\n", + "        return [s.strip() for s in sentences if s.strip()]\n", + "\n", + "    
@staticmethod\n", + " def split_into_paragraphs(text: str) -> List[str]:\n", + " \"\"\"Découpe le texte en paragraphes.\"\"\"\n", + " paragraphs = re.split(r'\\n\\s*\\n', text)\n", + " return [p.strip() for p in paragraphs if p.strip()]\n", + "\n", + " @classmethod\n", + " def split_for_long_audio(\n", + " cls,\n", + " text: str,\n", + " max_chars: int = 500,\n", + " preserve_sentences: bool = True\n", + " ) -> List[str]:\n", + " \"\"\"\n", + " Découpe un texte long en chunks optimaux pour la synthèse.\n", + " \"\"\"\n", + " # Si texte court, retourner tel quel\n", + " if len(text) <= max_chars:\n", + " return [text]\n", + "\n", + " chunks = []\n", + "\n", + " if preserve_sentences:\n", + " sentences = cls.split_into_sentences(text)\n", + " current_chunk = \"\"\n", + "\n", + " for sentence in sentences:\n", + " # Si la phrase seule dépasse max_chars, la découper\n", + " if len(sentence) > max_chars:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = \"\"\n", + " # Découper la phrase longue par mots\n", + " words = sentence.split()\n", + " sub_chunk = \"\"\n", + " for word in words:\n", + " if len(sub_chunk) + len(word) + 1 <= max_chars:\n", + " sub_chunk += \" \" + word if sub_chunk else word\n", + " else:\n", + " if sub_chunk:\n", + " chunks.append(sub_chunk.strip())\n", + " sub_chunk = word\n", + " if sub_chunk:\n", + " current_chunk = sub_chunk\n", + " elif len(current_chunk) + len(sentence) + 1 <= max_chars:\n", + " current_chunk += \" \" + sentence if current_chunk else sentence\n", + " else:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = sentence\n", + "\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " else:\n", + " # Découpage simple par caractères\n", + " for i in range(0, len(text), max_chars):\n", + " chunks.append(text[i:i + max_chars])\n", + "\n", + " return chunks\n", + "\n", + "\n", + "# ==============================================================================\n", + "# AUDIO PROCESSING\n", + "# ==============================================================================\n", + "\n", + "class AudioProcessor:\n", + " \"\"\"Processeur audio pour post-traitement et concaténation.\"\"\"\n", + "\n", + " @staticmethod\n", + " def normalize(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:\n", + " \"\"\"Normalise l'audio au niveau cible.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target_linear = 10 ** (target_db / 20)\n", + " audio = audio * (target_linear / peak)\n", + "\n", + " return np.clip(audio, -1.0, 1.0)\n", + "\n", + " @staticmethod\n", + " def crossfade(\n", + " audio1: np.ndarray,\n", + " audio2: np.ndarray,\n", + " sample_rate: int,\n", + " duration: float = 0.05\n", + " ) -> np.ndarray:\n", + " \"\"\"\n", + " Concatène deux segments audio avec crossfade.\n", + " \"\"\"\n", + " # Convertir en float si nécessaire\n", + " if audio1.dtype == np.int16:\n", + " audio1 = audio1.astype(np.float32) / 32768.0\n", + " if audio2.dtype == np.int16:\n", + " audio2 = audio2.astype(np.float32) / 32768.0\n", + "\n", + " fade_samples = int(sample_rate * duration)\n", + "\n", + " # Si audio trop court pour crossfade, concaténer simplement\n", + " if len(audio1) < fade_samples or len(audio2) < fade_samples:\n", + " return np.concatenate([audio1, audio2])\n", + "\n", + " # Créer les courbes de fade\n", + " fade_out = np.linspace(1.0, 
0.0, fade_samples)\n", + " fade_in = np.linspace(0.0, 1.0, fade_samples)\n", + "\n", + " # Appliquer le crossfade\n", + " audio1_end = audio1[-fade_samples:] * fade_out\n", + " audio2_start = audio2[:fade_samples] * fade_in\n", + "\n", + " # Assembler\n", + " result = np.concatenate([\n", + " audio1[:-fade_samples],\n", + " audio1_end + audio2_start,\n", + " audio2[fade_samples:]\n", + " ])\n", + "\n", + " return result\n", + "\n", + " @classmethod\n", + " def concatenate_chunks(\n", + " cls,\n", + " audio_chunks: List[np.ndarray],\n", + " sample_rate: int,\n", + " crossfade_duration: float = 0.05\n", + " ) -> np.ndarray:\n", + " \"\"\"\n", + " Concatène plusieurs chunks audio avec crossfade.\n", + " \"\"\"\n", + " if not audio_chunks:\n", + " return np.array([], dtype=np.float32)\n", + "\n", + " if len(audio_chunks) == 1:\n", + " audio = audio_chunks[0]\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " return audio\n", + "\n", + " result = audio_chunks[0]\n", + " if result.dtype == np.int16:\n", + " result = result.astype(np.float32) / 32768.0\n", + "\n", + " for chunk in audio_chunks[1:]:\n", + " result = cls.crossfade(result, chunk, sample_rate, crossfade_duration)\n", + "\n", + " return result\n", + "\n", + " @staticmethod\n", + " def enhance(\n", + " audio: np.ndarray,\n", + " sample_rate: int,\n", + " normalize: bool = True,\n", + " warmth: bool = True\n", + " ) -> np.ndarray:\n", + " \"\"\"Améliore la qualité audio.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " if warmth:\n", + " try:\n", + " from scipy import signal\n", + " nyquist = sample_rate / 2\n", + " cutoff = min(300, nyquist * 0.9) / nyquist\n", + " b, a = signal.butter(2, cutoff, btype='low')\n", + " bass = signal.filtfilt(b, a, audio)\n", + " audio = audio + 0.15 * bass\n", + " except ImportError:\n", + " pass\n", + "\n", + " if normalize:\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target = 10 ** (-3.0 / 20)\n", + " audio = audio * (target / peak)\n", + "\n", + " audio = np.clip(audio, -1.0, 1.0)\n", + " return audio\n", + "\n", + "\n", + "# ==============================================================================\n", + "# PROGRESS TRACKER\n", + "# ==============================================================================\n", + "\n", + "class ProgressTracker:\n", + " \"\"\"Suivi de progression avec estimation du temps restant.\"\"\"\n", + "\n", + " def __init__(self, total: int, description: str = \"\"):\n", + " self.total = total\n", + " self.current = 0\n", + " self.description = description\n", + " self.start_time = time.time()\n", + " self.chunk_times = []\n", + "\n", + " def update(self, chunk_duration: float = None):\n", + " \"\"\"Met à jour la progression.\"\"\"\n", + " self.current += 1\n", + " if chunk_duration:\n", + " self.chunk_times.append(chunk_duration)\n", + " self._display()\n", + "\n", + " def _display(self):\n", + " \"\"\"Affiche la barre de progression.\"\"\"\n", + " elapsed = time.time() - self.start_time\n", + " percent = (self.current / self.total) * 100\n", + "\n", + " # Estimation temps restant\n", + " if self.chunk_times:\n", + " avg_time = np.mean(self.chunk_times)\n", + " remaining = avg_time * (self.total - self.current)\n", + " eta_str = self._format_time(remaining)\n", + " else:\n", + " eta_str = \"...\"\n", + "\n", + " # Barre de progression\n", + " bar_length = 30\n", + " filled = int(bar_length * self.current / self.total)\n", + " bar = \"█\" * filled 
+ \"░\" * (bar_length - filled)\n", + "\n", + " elapsed_str = self._format_time(elapsed)\n", + "\n", + " print(f\"\\r{self.description} [{bar}] {self.current}/{self.total} \"\n", + " f\"({percent:.1f}%) | Temps: {elapsed_str} | ETA: {eta_str}\", end=\"\")\n", + "\n", + " if self.current >= self.total:\n", + " print() # Nouvelle ligne à la fin\n", + "\n", + " @staticmethod\n", + " def _format_time(seconds: float) -> str:\n", + " \"\"\"Formate un temps en secondes en HH:MM:SS.\"\"\"\n", + " hours = int(seconds // 3600)\n", + " minutes = int((seconds % 3600) // 60)\n", + " secs = int(seconds % 60)\n", + "\n", + " if hours > 0:\n", + " return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n", + " return f\"{minutes:02d}:{secs:02d}\"\n", + "\n", + "\n", + "# ==============================================================================\n", + "# TTS ENGINE\n", + "# ==============================================================================\n", + "\n", + "_tts_model = None\n", + "_voices_cache = {}\n", + "os.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n", + "\n", + "def get_model():\n", + " \"\"\"Charge le modèle XTTS v2 avec cache.\"\"\"\n", + " global _tts_model\n", + "\n", + " if _tts_model is None:\n", + " print(\"🔄 Chargement du modèle XTTS v2...\")\n", + " from TTS.api import TTS\n", + "\n", + " _tts_model = TTS(Config.MODEL_NAME)\n", + "\n", + " if _device is not None and _device_name.startswith(\"cuda\"):\n", + " _tts_model = _tts_model.to(_device)\n", + "\n", + " print(\"✓ Modèle chargé\")\n", + "\n", + " return _tts_model\n", + "\n", + "\n", + "def get_voice_path(voice: str) -> str:\n", + " \"\"\"Obtient le chemin vers un fichier de voix.\"\"\"\n", + " global _voices_cache\n", + " import urllib.request\n", + "\n", + " if voice in _voices_cache:\n", + " return _voices_cache[voice]\n", + "\n", + " if os.path.isfile(voice):\n", + " _voices_cache[voice] = voice\n", + " return voice\n", + "\n", + " if voice in Config.PRESET_VOICES:\n", + " url = Config.PRESET_VOICES[voice]\n", + " path = f\"/tmp/{voice}.wav\"\n", + "\n", + " if not os.path.exists(path):\n", + " print(f\"📥 Téléchargement de la voix '{voice}'...\")\n", + " urllib.request.urlretrieve(url, path)\n", + "\n", + " _voices_cache[voice] = path\n", + " return path\n", + "\n", + " raise FileNotFoundError(f\"Voix '{voice}' non trouvée\")\n", + "\n", + "\n", + "# ==============================================================================\n", + "# MAIN SYNTHESIS FUNCTIONS\n", + "# ==============================================================================\n", + "\n", + "def synthesize_chunk(\n", + " text: str,\n", + " voice_path: str,\n", + " language: str = \"fr\",\n", + " enable_text_splitting: bool = True\n", + ") -> np.ndarray:\n", + " \"\"\"\n", + " Synthétise un chunk de texte en audio via l'inférence directe (Low-Level).\n", + " Bypass total du SpeakerManager pour éviter le bug FileNotFoundError .pth\n", + " \"\"\"\n", + " model_wrapper = get_model()\n", + "\n", + " # 1. Accès \"chirurgical\" au modèle interne XTTS\n", + " # C'est lui qui fait le travail, sans la couche de gestion de fichiers buggée\n", + " if hasattr(model_wrapper, 'synthesizer'):\n", + " xtts_model = model_wrapper.synthesizer.tts_model\n", + " else:\n", + " # Cas rare ou structure différente, on tente l'accès direct\n", + " xtts_model = model_wrapper.tts_model\n", + "\n", + " # 2. 
Calcul manuel des latents (Empreinte vocale)\n", + " # On transforme le fichier WAV en vecteurs mathématiques\n", + " try:\n", + " gpt_cond_latent, speaker_embedding = xtts_model.get_conditioning_latents(\n", + " audio_path=[voice_path],\n", + " gpt_cond_len=30,\n", + " max_ref_length=60\n", + " )\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur calcul latents: {e}\")\n", + " raise e\n", + "\n", + " # 3. Inférence directe\n", + " # On appelle la fonction de génération pure, sans passer par tts()\n", + " try:\n", + " out = xtts_model.inference(\n", + " text=text,\n", + " language=language,\n", + " gpt_cond_latent=gpt_cond_latent,\n", + " speaker_embedding=speaker_embedding,\n", + " temperature=0.7, # Paramètre standard pour la créativité\n", + " length_penalty=1.0, # Pénalité de longueur\n", + " repetition_penalty=2.0, # Évite les bégaiements\n", + " top_k=50,\n", + " top_p=0.8,\n", + " enable_text_splitting=enable_text_splitting\n", + " )\n", + "\n", + " # Le résultat est généralement dans un dictionnaire sous la clé 'wav'\n", + " if isinstance(out, dict) and 'wav' in out:\n", + " wav = out['wav']\n", + " else:\n", + " wav = out\n", + "\n", + " # S'assurer que c'est bien un numpy array sur CPU\n", + " if hasattr(wav, 'cpu'):\n", + " wav = wav.cpu().numpy()\n", + " if isinstance(wav, list):\n", + " wav = np.array(wav, dtype=np.float32)\n", + "\n", + " return wav\n", + "\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de l'inférence directe : {e}\")\n", + " raise e\n", + "\n", + "\n", + "def text_to_speech_long(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " max_chars_per_chunk: int = None,\n", + " show_progress: bool = True,\n", + " enable_text_splitting: bool = True\n", + ") -> dict:\n", + " \"\"\"\n", + " Génère un fichier audio long (> 1 heure) à partir de texte.\n", + " \"\"\"\n", + " import torch\n", + "\n", + " # Configuration\n", + " max_chars = max_chars_per_chunk or Config.MAX_CHARS_PER_CHUNK\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Estimation initiale\n", + " estimated_duration = TextSplitter.estimate_audio_duration(text)\n", + " print(f\"\\n📝 Texte: {len(text):,} caractères\")\n", + " print(f\"⏱️ Durée estimée: {ProgressTracker._format_time(estimated_duration)}\")\n", + "\n", + " # Découper le texte\n", + " chunks = TextSplitter.split_for_long_audio(text, max_chars=max_chars)\n", + " print(f\"📦 Chunks: {len(chunks)}\")\n", + "\n", + " # Initialiser la progression\n", + " progress = None\n", + " if show_progress:\n", + " progress = ProgressTracker(len(chunks), \"🎙️ Synthèse\")\n", + "\n", + " # Générer l'audio chunk par chunk\n", + " audio_chunks = []\n", + "\n", + " for i, chunk in enumerate(chunks):\n", + " chunk_start = time.time()\n", + "\n", + " try:\n", + " wav = synthesize_chunk(\n", + " text=chunk,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting\n", + " )\n", + " audio_chunks.append(wav)\n", + "\n", + " except Exception as e:\n", + " print(f\"\\n⚠️ Erreur chunk {i+1}: {e}\")\n", + " # Continuer avec les autres chunks\n", + " continue\n", + "\n", + " # Libérer la mémoire GPU périodiquement\n", + " if _device_name.startswith(\"cuda\") and (i + 1) % 10 == 0:\n", + " torch.cuda.empty_cache()\n", + "\n", + " chunk_duration = time.time() - chunk_start\n", + " if progress:\n", + " 
progress.update(chunk_duration)\n", + "\n", + " if not audio_chunks:\n", + " raise RuntimeError(\"Aucun audio généré\")\n", + "\n", + " print(\"\\n🔗 Concaténation des chunks...\")\n", + "\n", + " # Concaténer avec crossfade\n", + " final_audio = AudioProcessor.concatenate_chunks(\n", + " audio_chunks,\n", + " Config.SAMPLE_RATE,\n", + " Config.CROSSFADE_DURATION\n", + " )\n", + "\n", + " # Libérer les chunks de la mémoire\n", + " del audio_chunks\n", + " gc.collect()\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " print(\"✨ Post-traitement...\")\n", + " final_audio = AudioProcessor.enhance(\n", + " final_audio,\n", + " Config.SAMPLE_RATE,\n", + " normalize=True,\n", + " warmth=True\n", + " )\n", + " else:\n", + " final_audio = AudioProcessor.normalize(final_audio)\n", + "\n", + " # Convertir en int16\n", + " final_audio = (final_audio * 32767).astype(np.int16)\n", + "\n", + " # Générer le nom de fichier\n", + " if output_path is None:\n", + " h = hashlib.md5(text[:100].encode()).hexdigest()[:8]\n", + " output_path = f\"tts_long_{voice}_{h}.wav\"\n", + "\n", + " # Dossier de sortie\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarder\n", + " print(f\"💾 Sauvegarde: {final_path}\")\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(final_audio.tobytes())\n", + "\n", + " # Calculer la durée réelle\n", + " duration = len(final_audio) / Config.SAMPLE_RATE\n", + "\n", + " print(f\"\\n✅ Audio généré avec succès!\")\n", + " print(f\" 📁 Fichier: {final_path}\")\n", + " print(f\" ⏱️ Durée: {ProgressTracker._format_time(duration)}\")\n", + " print(f\" 📦 Chunks: {len(chunks)}\")\n", + " print(f\" 🎤 Voix: {voice}\")\n", + "\n", + " return {\n", + " 'path': str(final_path),\n", + " 'sample_rate': Config.SAMPLE_RATE,\n", + " 'duration_seconds': duration,\n", + " 'duration_formatted': ProgressTracker._format_time(duration),\n", + " 'audio_data': final_audio,\n", + " 'voice': voice,\n", + " 'language': language,\n", + " 'device': _device_name,\n", + " 'chunks_count': len(chunks),\n", + " 'text_length': len(text)\n", + " }\n", + "\n", + "\n", + "def text_to_speech(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " enable_text_splitting: bool = True\n", + ") -> dict:\n", + " \"\"\"\n", + " Génère un fichier audio à partir de texte avec XTTS v2.\n", + " \"\"\"\n", + " # Basculer automatiquement vers la version long pour textes > 10000 chars\n", + " if len(text) > 10000:\n", + " print(\"📢 Texte long détecté - utilisation de text_to_speech_long()\")\n", + " return text_to_speech_long(\n", + " text=text,\n", + " voice=voice,\n", + " language=language,\n", + " output_path=output_path,\n", + " enhance=enhance,\n", + " use_gdrive=use_gdrive,\n", + " gdrive_folder=gdrive_folder,\n", + " enable_text_splitting=enable_text_splitting\n", + " )\n", + "\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Générer l'audio avec enable_text_splitting\n", + " wav = 
synthesize_chunk(\n", + " text=text,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting\n", + " )\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " audio = AudioProcessor.enhance(wav, Config.SAMPLE_RATE)\n", + " else:\n", + " audio = AudioProcessor.normalize(wav)\n", + "\n", + " audio = (audio * 32767).astype(np.int16)\n", + "\n", + " # Nom de fichier\n", + " if output_path is None:\n", + " h = hashlib.md5(text.encode()).hexdigest()[:8]\n", + " output_path = f\"tts_{voice}_{h}.wav\"\n", + "\n", + " # Dossier de sortie\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarder\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(audio.tobytes())\n", + "\n", + " duration = len(audio) / Config.SAMPLE_RATE\n", + "\n", + " print(f\"✓ Audio généré: {final_path}\")\n", + " print(f\" Durée: {duration:.2f}s | Voix: {voice}\")\n", + "\n", + " return {\n", + " 'path': str(final_path),\n", + " 'sample_rate': Config.SAMPLE_RATE,\n", + " 'duration_seconds': duration,\n", + " 'audio_data': audio,\n", + " 'voice': voice,\n", + " 'language': language,\n", + " 'device': _device_name\n", + " }\n", + "\n", + "\n", + "# ==============================================================================\n", + "# UTILITIES\n", + "# ==============================================================================\n", + "\n", + "def preview_audio(result: dict) -> None:\n", + " \"\"\"Prévisualise l'audio dans le notebook.\"\"\"\n", + " from IPython.display import Audio, display\n", + "\n", + " audio = result['audio_data']\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " display(Audio(audio, rate=result['sample_rate']))\n", + "\n", + "\n", + "def list_voices() -> list:\n", + " \"\"\"Liste les voix disponibles.\"\"\"\n", + " return list(Config.PRESET_VOICES.keys())\n", + "\n", + "\n", + "def list_languages() -> list:\n", + " \"\"\"Liste les langues supportées.\"\"\"\n", + " return [\"en\", \"es\", \"fr\", \"de\", \"it\", \"pt\", \"pl\", \"tr\",\n", + " \"ru\", \"nl\", \"cs\", \"ar\", \"zh-cn\", \"ja\", \"hu\", \"ko\", \"hi\"]\n", + "\n", + "\n", + "def clear_cache():\n", + " \"\"\"Libère la mémoire.\"\"\"\n", + " global _tts_model\n", + " import torch\n", + "\n", + " _tts_model = None\n", + " gc.collect()\n", + "\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + "\n", + " print(\"✓ Cache vidé\")\n", + "\n", + "\n", + "def estimate_duration(text: str) -> dict:\n", + " \"\"\"\n", + " Estime la durée audio pour un texte.\n", + " \"\"\"\n", + " duration = TextSplitter.estimate_audio_duration(text)\n", + " chunks = len(TextSplitter.split_for_long_audio(text))\n", + "\n", + " return {\n", + " 'chars': len(text),\n", + " 'estimated_seconds': duration,\n", + " 'estimated_formatted': ProgressTracker._format_time(duration),\n", + " 'chunks_estimate': chunks\n", + " }\n", + "\n", + "\n", + "# ==============================================================================\n", + "# ALIASES\n", + "# ==============================================================================\n", + "\n", + "tts = text_to_speech\n", + 
"tts_long = text_to_speech_long\n", + "\n", + "\n", + "# ==============================================================================\n", + "# INITIALIZATION\n", + "# ==============================================================================\n", + "\n", + "def init():\n", + " \"\"\"Initialise le module.\"\"\"\n", + " detect_device()\n", + " print(\"✅ Module XTTS v2 Long Audio chargé\")\n", + " print(f\" Device: {_device_name}\")\n", + " print(f\" Voix: {list_voices()}\")\n", + " print(f\" enable_text_splitting: activé par défaut\")\n", + "\n", + "\n", + "# Auto-init\n", + "if __name__ != \"__main__\":\n", + " try:\n", + " detect_device()\n", + " except:\n", + " pass\n", + "\n", + "\n", + "# ==============================================================================\n", + "# EXAMPLE USAGE\n", + "# ==============================================================================\n", + "\n", + "if __name__ == \"__main__\":\n", + " # Installation si nécessaire\n", + " install_dependencies()\n", + "\n", + " # Initialisation\n", + " init()\n", + "\n", + " # Exemple avec texte court\n", + " print(\"\\n\" + \"=\"*60)\n", + " print(\"EXEMPLE 1: Texte court\")\n", + " print(\"=\"*60)\n", + "\n", + " short_text = \"\"\"\n", + " Ce document présente les Manifold-Constrained Hyper-Connections,\n", + " une architecture novatrice conçue par DeepSeek-AI pour stabiliser\n", + " l'entraînement des grands modèles de langage.\n", + " \"\"\"\n", + "\n", + " result = text_to_speech(\n", + " text=short_text.strip(),\n", + " voice=\"female_fr\",\n", + " enhance=True\n", + " )\n", + "\n", + " print(f\"\\nRésultat: {result['duration_seconds']:.2f}s\")\n", + "\n", + " # Exemple avec texte long (simulé)\n", + " print(\"\\n\" + \"=\"*60)\n", + " print(\"EXEMPLE 2: Estimation pour texte long\")\n", + " print(\"=\"*60)\n", + "\n", + " # Simuler un texte de ~1 heure (environ 54000 caractères)\n", + " long_text = short_text.strip() * 300 # ~54000 chars ≈ 1 heure\n", + "\n", + " estimation = estimate_duration(long_text)\n", + " print(f\"\\nEstimation pour {estimation['chars']:,} caractères:\")\n", + " print(f\" Durée: {estimation['estimated_formatted']}\")\n", + " print(f\" Chunks: {estimation['chunks_estimate']}\")\n", + "\n", + " # Pour générer réellement:\n", + " # result = text_to_speech_long(long_text, voice=\"female_fr\", show_progress=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1FQhKQ1IE4iX", + "outputId": "597f5af5-fd18-4f0d-f6dd-3d4f12297e19" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "📦 Installation de FFmpeg...\n", + "✓ FFmpeg installé\n", + "⚙️ Device: cuda (Tesla T4\n", + "✅ Module XTTS v2 Long Audio chargé\n", + " Device: cuda (Tesla T4\n", + " Voix: ['female_fr', 'male_fr']\n", + " enable_text_splitting: activé par défaut\n", + "\n", + "============================================================\n", + "EXEMPLE 1: Texte court\n", + "============================================================\n", + "📥 Téléchargement de la voix 'female_fr'...\n", + "🔄 Chargement du modèle XTTS v2...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 1.87G/1.87G [00:32<00:00, 57.5MiB/s]\n", + "4.37kiB [00:00, 6.57MiB/s]\n", + "361kiB [00:00, 55.6MiB/s]\n", + "100%|██████████| 32.0/32.0 [00:00<00:00, 74.2kiB/s]\n", + "100%|██████████| 7.75M/7.75M [00:00<00:00, 17.6MiB/s]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "stop" + ], + "metadata": { 
+ "id": "Naxv1wHEp6NQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "ulSK6K1op63B" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Lire le fichier\n", + "with open(\"mon_texte_long.txt\", \"r\", encoding=\"utf-8\") as f:\n", + " texte_complet = f.read()\n", + "\n", + "# Lancer la génération\n", + "text_to_speech_long(\n", + " text=texte_complet,\n", + " voice=\"female_fr\",\n", + " language=\"fr\"\n", + ")" + ], + "metadata": { + "id": "BRhVnXgSE7Yd" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "4o0EdnBHp7la" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "KaWW0-DIMy7R" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From 1dbd077a297a9fb68cddf4b4e3ebacf12bf2d5d8 Mon Sep 17 00:00:00 2001 From: brunombo Date: Mon, 2 Feb 2026 12:58:46 +0100 Subject: [PATCH 4/9] =?UTF-8?q?Cr=C3=A9=C3=A9=20=C3=A0=20l'aide=20de=20Col?= =?UTF-8?q?ab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- long_TTS_xtts_v3.ipynb | 446 +++++++++++++++++++++++++++-------------- 1 file changed, 301 insertions(+), 145 deletions(-) diff --git a/long_TTS_xtts_v3.ipynb b/long_TTS_xtts_v3.ipynb index 75f8cbcc8a44..1c050d7c06b0 100644 --- a/long_TTS_xtts_v3.ipynb +++ b/long_TTS_xtts_v3.ipynb @@ -1,22 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "gpuType": "T4", - "authorship_tag": "ABX9TyPEqfSBvulICbte1mdWmCvM", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" - }, "cells": [ { "cell_type": "markdown", @@ -30,61 +12,38 @@ }, { "cell_type": "markdown", - "source": [], "metadata": { "id": "s8NfbT3sw2-z" - } + }, + "source": [] }, { "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "XYDOUW523oJP" + }, + "outputs": [], "source": [ "PROMPT = \"Ce document présente les Manifold-Constrained Hyper-Connections (mHC), une architecture novatrice conçue par DeepSeek-AI pour stabiliser l'entraînement des grands modèles de langage. Bien que les Hyper-Connections (HC) classiques améliorent les performances en élargissant le flux résiduel, leur nature non contrainte provoque souvent une instabilité numérique et des problèmes de divergence du signal. Pour remédier à cela, les auteurs utilisent l'algorithme de Sinkhorn-Knopp afin de projeter les connexions sur une variété de matrices doublement stochastiques, préservant ainsi la propriété de mappage d'identité. Cette approche garantit une propagation saine du signal tout en optimisant l'efficacité matérielle grâce à la fusion de noyaux et à des stratégies de mémorisation sélective. Les résultats expérimentaux démontrent que mHC surpasse les méthodes existantes en termes de scalabilité et de capacités de raisonnement sur divers tests de référence. 
En intégrant ces contraintes géométriques rigoureuses, le cadre mHC offre une solution robuste pour l'évolution des architectures neuronales à grande échelle.\"\n", "\n", "voice_gender = 'female_fr'\n", "# ['female_fr', 'male_fr']" - ], - "metadata": { - "id": "XYDOUW523oJP" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { - "id": "jIKtDA5hweJP", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "acd264d8-f686-43ec-98f2-d8c267422276" + "id": "jIKtDA5hweJP" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m862.8/862.8 kB\u001b[0m \u001b[31m28.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m345.1/345.1 kB\u001b[0m \u001b[31m31.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.2/56.2 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m997.3/997.3 kB\u001b[0m \u001b[31m59.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m648.4/648.4 kB\u001b[0m \u001b[31m53.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.5/163.5 kB\u001b[0m \u001b[31m16.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for docopt (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m41.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h" - ] - } - ], + "outputs": [], "source": [ "# Installation des dépendances\n", "!pip install -q scipy noisereduce\n", "\n", "# Installation du fork maintenu (supporte Python 3.12+)\n", "!pip install -q coqui-tts\n", - "!pip install -q torchcodec\n", "\n", "\n", "# Installation des dépendances\n", @@ -102,32 +61,34 @@ }, { "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1FQhKQ1IE4iX", + "outputId": "5216e20f-aaf1-46e7-f518-4d37a9ebc66e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "📦 Installation de FFmpeg...\n", + "✓ FFmpeg installé\n", + "⚙️ Device: cuda (Tesla T4\n", + "✅ Module XTTS v2 Long Audio chargé\n", + " Device: cuda (Tesla T4\n", + " Voix: ['female_fr', 'male_fr']\n", + " enable_text_splitting: activé par défaut\n", + "💡 torchaudio backend is expected to use 'soundfile' as torchcodec is no longer installed.\n", + "\n", + "============================================================\n", + "EXEMPLE 1: Texte court\n" + ] + } + ], "source": [ - "# -*- coding: utf-8 -*-\n", - "\"\"\"\n", - "TTS XTTS v2 - Version Long Audio (> 1 heure)\n", - "=============================================\n", - "\n", - "Module de synthèse vocale haute qualité utilisant Coqui XTTS v2.\n", - "Optimisé pour la génération d'audio longs avec:\n", - "- enable_text_splitting=True pour découpage automatique\n", - "- Chunking intelligent par paragraphes pour textes très longs\n", - "- Concaténation audio avec crossfade\n", - "- Barre de progression et estimation temps restant\n", - "- Gestion mémoire optimisée\n", - "- Correction du bug d'argument 'language' sur l'API synthesizer\n", - "\n", - "Auteur: Bruno\n", - "Date: Janvier 2025\n", - "Correction: Gemini\n", - "\"\"\"\n", - "\n", - "# ==============================================================================\n", - "# IMPORTS\n", - "# ==============================================================================\n", - "\n", - "from __future__ import annotations\n", - "\n", "import os\n", "import re\n", "import gc\n", @@ -926,6 +887,17 @@ " print(f\" Device: {_device_name}\")\n", " print(f\" Voix: {list_voices()}\")\n", " print(f\" enable_text_splitting: activé par défaut\")\n", + " # Add this line to explicitly set torchaudio backend\n", + " try:\n", + " import torchaudio\n", + " # This line is intentionally commented out as set_audio_backend is not available in all torchaudio versions.\n", + " # The `soundfile` library should be picked up automatically if torchcodec is not installed.\n", + " # torchaudio.set_audio_backend(\"soundfile\")\n", + " print(\"💡 torchaudio backend is expected to use 'soundfile' as torchcodec is no longer installed.\")\n", + " except ImportError:\n", + " print(\"⚠️ torchaudio not found.\")\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de la configuration de torchaudio: {e}\")\n", "\n", "\n", "# Auto-init\n", @@ -950,75 +922,191 @@ " # Exemple avec texte court\n", " print(\"\\n\" + \"=\"*60)\n", " print(\"EXEMPLE 1: Texte court\")\n", - " print(\"=\"*60)\n", - "\n", - " short_text = \"\"\"\n", - " Ce document présente les Manifold-Constrained Hyper-Connections,\n", - " une architecture novatrice conçue par DeepSeek-AI pour stabiliser\n", - " l'entraînement des grands modèles de 
langage.\n", - " \"\"\"\n", + "" + ] + }, + { + "cell_type": "code", + "source": [ "\n", - " result = text_to_speech(\n", - " text=short_text.strip(),\n", - " voice=\"female_fr\",\n", - " enhance=True\n", - " )\n", + "text_to_speech_to_synthetise= \"Ce document présente les Manifold-Constrained Hyper-Connections (mHC), une architecture novatrice conçue par DeepSeek-AI pour stabiliser l'entraînement des grands modèles de langage. Bien que les Hyper-Connections (HC) classiques améliorent les performances en élargissant le flux résiduel, leur nature non contrainte provoque souvent une instabilité numérique et des problèmes de divergence du signal. Pour remédier à cela, les auteurs utilisent l'algorithme de Sinkhorn-Knopp afin de projeter les connexions sur une variété de matrices doublement stochastiques, préservant ainsi la propriété de mappage d'identité. Cette approche garantit une propagation saine du signal tout en optimisant l'efficacité matérielle grâce à la fusion de noyaux et à des stratégies de mémorisation sélective. Les résultats expérimentaux démontrent que mHC surpasse les méthodes existantes en termes de scalabilité et de capacités de raisonnement sur divers tests de référence. En intégrant ces contraintes géométriques rigoureuses, le cadre mHC offre une solution robuste pour l'évolution des architectures neuronales à grande échelle.\"\n", "\n", - " print(f\"\\nRésultat: {result['duration_seconds']:.2f}s\")\n", + "voice_gender = 'female_fr'\n", + "# ['female_fr', 'male_fr']" + ], + "metadata": { + "id": "FREsMU-QLEc4" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# -----------------------------------------------------------------------------\n", + "# CELLULE 3: Exemples d'utilisation\n", + "# -----------------------------------------------------------------------------\n", "\n", - " # Exemple avec texte long (simulé)\n", - " print(\"\\n\" + \"=\"*60)\n", - " print(\"EXEMPLE 2: Estimation pour texte long\")\n", - " print(\"=\"*60)\n", + "# Montage Google Drive (optionnel)\n", + "# mount_gdrive()\n", "\n", - " # Simuler un texte de ~1 heure (environ 54000 caractères)\n", - " long_text = short_text.strip() * 300 # ~54000 chars ≈ 1 heure\n", + "# Liste des voix disponibles\n", + "print(\"Voix disponibles:\", list_voices())\n", "\n", - " estimation = estimate_duration(long_text)\n", - " print(f\"\\nEstimation pour {estimation['chars']:,} caractères:\")\n", - " print(f\" Durée: {estimation['estimated_formatted']}\")\n", - " print(f\" Chunks: {estimation['chunks_estimate']}\")\n", + "# Génération simple\n", + "result = text_to_speech(text_to_speech_to_synthetise,voice=voice_gender)\n", "\n", - " # Pour générer réellement:\n", - " # result = text_to_speech_long(long_text, voice=\"female_fr\", show_progress=True)" + "# Prévisualisation\n", + "preview_audio(result)" ], "metadata": { "colab": { - "base_uri": "https://localhost:8080/" + "base_uri": "https://localhost:8080/", + "height": 1000 }, - "id": "1FQhKQ1IE4iX", - "outputId": "597f5af5-fd18-4f0d-f6dd-3d4f12297e19" + "id": "2Any3vzyK8zF", + "outputId": "5c0af666-6e37-4d50-9a96-5e7cecd8fb9d" }, - "execution_count": null, + "execution_count": 9, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "📦 Installation de FFmpeg...\n", - "✓ FFmpeg installé\n", - "⚙️ Device: cuda (Tesla T4\n", - "✅ Module XTTS v2 Long Audio chargé\n", - " Device: cuda (Tesla T4\n", - " Voix: ['female_fr', 'male_fr']\n", - " enable_text_splitting: activé par défaut\n", + "Voix disponibles: 
['female_fr', 'male_fr']\n", + "🔄 Chargement du modèle XTTS v2...\n", + "✓ Modèle chargé\n", + "⚠️ Erreur calcul latents: Could not load libtorchcodec. Likely causes:\n", + " 1. FFmpeg is not properly installed in your environment. We support\n", + " versions 4, 5, 6, 7, and 8, and we attempt to load libtorchcodec\n", + " for each of those versions. Errors for versions not installed on\n", + " your system are expected; only the error for your installed FFmpeg\n", + " version is relevant. On Windows, ensure you've installed the\n", + " \"full-shared\" version which ships DLLs.\n", + " 2. The PyTorch version (2.9.0+cu126) is not compatible with\n", + " this version of TorchCodec. Refer to the version compatibility\n", + " table:\n", + " https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.\n", + " 3. Another runtime dependency; see exceptions below.\n", "\n", - "============================================================\n", - "EXEMPLE 1: Texte court\n", - "============================================================\n", - "📥 Téléchargement de la voix 'female_fr'...\n", - "🔄 Chargement du modèle XTTS v2...\n" + " The following exceptions were raised as we tried to load libtorchcodec:\n", + " \n", + "[start of libtorchcodec loading traceback]\n", + "FFmpeg version 8:\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n", + " ctypes.CDLL(path)\n", + " File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n", + " self._handle = _dlopen(self._name, mode)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "OSError: libavutil.so.60: cannot open shared object file: No such file or directory\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n", + " torch.ops.load_library(core_library_path)\n", + " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n", + " raise OSError(f\"Could not load this library: {path}\") from e\n", + "OSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core8.so\n", + "\n", + "FFmpeg version 7:\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n", + " ctypes.CDLL(path)\n", + " File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n", + " self._handle = _dlopen(self._name, mode)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "OSError: libavutil.so.59: cannot open shared object file: No such file or directory\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n", + " torch.ops.load_library(core_library_path)\n", + " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n", + " raise OSError(f\"Could not load this library: {path}\") from e\n", + "OSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core7.so\n", + "\n", + "FFmpeg version 6:\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in 
load_library\n", + " ctypes.CDLL(path)\n", + " File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n", + " self._handle = _dlopen(self._name, mode)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "OSError: libavutil.so.58: cannot open shared object file: No such file or directory\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n", + " torch.ops.load_library(core_library_path)\n", + " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n", + " raise OSError(f\"Could not load this library: {path}\") from e\n", + "OSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core6.so\n", + "\n", + "FFmpeg version 5:\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n", + " ctypes.CDLL(path)\n", + " File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n", + " self._handle = _dlopen(self._name, mode)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "OSError: libavutil.so.57: cannot open shared object file: No such file or directory\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n", + " torch.ops.load_library(core_library_path)\n", + " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n", + " raise OSError(f\"Could not load this library: {path}\") from e\n", + "OSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core5.so\n", + "\n", + "FFmpeg version 4:\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n", + " ctypes.CDLL(path)\n", + " File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n", + " self._handle = _dlopen(self._name, mode)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "OSError: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core4.so: undefined symbol: _ZN3c1013MessageLogger6streamB5cxx11Ev\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n", + " torch.ops.load_library(core_library_path)\n", + " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n", + " raise OSError(f\"Could not load this library: {path}\") from e\n", + "OSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core4.so\n", + "[end of libtorchcodec loading traceback].\n" ] }, { - "output_type": "stream", - "name": "stderr", - "text": [ - "100%|██████████| 1.87G/1.87G [00:32<00:00, 57.5MiB/s]\n", - "4.37kiB [00:00, 6.57MiB/s]\n", - "361kiB [00:00, 55.6MiB/s]\n", - "100%|██████████| 32.0/32.0 [00:00<00:00, 74.2kiB/s]\n", - "100%|██████████| 7.75M/7.75M [00:00<00:00, 17.6MiB/s]\n" + "output_type": "error", + "ename": "RuntimeError", + "evalue": "Could not load libtorchcodec. Likely causes:\n 1. 
FFmpeg is not properly installed in your environment. We support\n versions 4, 5, 6, 7, and 8, and we attempt to load libtorchcodec\n for each of those versions. Errors for versions not installed on\n your system are expected; only the error for your installed FFmpeg\n version is relevant. On Windows, ensure you've installed the\n \"full-shared\" version which ships DLLs.\n 2. The PyTorch version (2.9.0+cu126) is not compatible with\n this version of TorchCodec. Refer to the version compatibility\n table:\n https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.\n 3. Another runtime dependency; see exceptions below.\n\n The following exceptions were raised as we tried to load libtorchcodec:\n \n[start of libtorchcodec loading traceback]\nFFmpeg version 8:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.60: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core8.so\n\nFFmpeg version 7:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.59: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core7.so\n\nFFmpeg version 6:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.58: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: 
/usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core6.so\n\nFFmpeg version 5:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.57: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core5.so\n\nFFmpeg version 4:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core4.so: undefined symbol: _ZN3c1013MessageLogger6streamB5cxx11Ev\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core4.so\n[end of libtorchcodec loading traceback].", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipython-input-1408888851.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;31m# Génération simple\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtext_to_speech\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext_to_speech_to_synthetise\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mvoice\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvoice_gender\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# Prévisualisation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/tmp/ipython-input-3651143534.py\u001b[0m in \u001b[0;36mtext_to_speech\u001b[0;34m(text, voice, language, output_path, enhance, use_gdrive, gdrive_folder, enable_text_splitting)\u001b[0m\n\u001b[1;32m 672\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 673\u001b[0m \u001b[0;31m# Générer l'audio avec enable_text_splitting\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 674\u001b[0;31m wav = synthesize_chunk(\n\u001b[0m\u001b[1;32m 
675\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 676\u001b[0m \u001b[0mvoice_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvoice_path\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/tmp/ipython-input-3651143534.py\u001b[0m in \u001b[0;36msynthesize_chunk\u001b[0;34m(text, voice_path, language, enable_text_splitting)\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 466\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"⚠️ Erreur calcul latents: {e}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 467\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 468\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;31m# 3. Inférence directe\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/tmp/ipython-input-3651143534.py\u001b[0m in \u001b[0;36msynthesize_chunk\u001b[0;34m(text, voice_path, language, enable_text_splitting)\u001b[0m\n\u001b[1;32m 458\u001b[0m \u001b[0;31m# On transforme le fichier WAV en vecteurs mathématiques\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 459\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 460\u001b[0;31m gpt_cond_latent, speaker_embedding = xtts_model.get_conditioning_latents(\n\u001b[0m\u001b[1;32m 461\u001b[0m \u001b[0maudio_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mvoice_path\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 462\u001b[0m \u001b[0mgpt_cond_len\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py\u001b[0m in \u001b[0;36mdecorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdecorate_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mctx_factory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 120\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 121\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 122\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdecorate_context\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/TTS/tts/models/xtts.py\u001b[0m in \u001b[0;36mget_conditioning_latents\u001b[0;34m(self, audio_path, max_ref_length, gpt_cond_len, 
gpt_cond_chunk_len, librosa_trim_db, sound_norm_refs, load_sr)\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[0mspeaker_embedding\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mfile_path\u001b[0m \u001b[0;32min\u001b[0m \u001b[0maudio_paths\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 362\u001b[0;31m \u001b[0maudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_audio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mload_sr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 363\u001b[0m \u001b[0maudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maudio\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mload_sr\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mmax_ref_length\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msound_norm_refs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/TTS/tts/models/xtts.py\u001b[0m in \u001b[0;36mload_audio\u001b[0;34m(audiopath, sampling_rate)\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;31m# torchaudio should chose proper backend to load audio depending on platform\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m \u001b[0maudio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlsr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorchaudio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maudiopath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 85\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0;31m# stereo to mono if needed\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchaudio/__init__.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size, backend)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0mby\u001b[0m \u001b[0mTorchCodec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \"\"\"\n\u001b[0;32m---> 86\u001b[0;31m return load_with_torchcodec(\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0muri\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0mframe_offset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mframe_offset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchaudio/_torchcodec.py\u001b[0m in \u001b[0;36mload_with_torchcodec\u001b[0;34m(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size, backend)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;31m# Import torchcodec here to provide clear error if not available\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 81\u001b[0m 
\u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 82\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtorchcodec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecoders\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAudioDecoder\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 83\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mImportError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m raise ImportError(\n", + "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;31m# but that results in circular import.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0m_frame\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAudioSamples\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mFrame\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mFrameBatch\u001b[0m \u001b[0;31m# usort:skip # noqa\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdecoders\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoders\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msamplers\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransforms\u001b[0m \u001b[0;31m# noqa\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/decoders/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# LICENSE file in the root directory of this source tree.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_core\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAudioStreamMetadata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mVideoStreamMetadata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0m_audio_decoder\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAudioDecoder\u001b[0m \u001b[0;31m# noqa\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0m_decoder_utils\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mset_cuda_backend\u001b[0m \u001b[0;31m# noqa\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/_core/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m from ._metadata import (\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mAudioStreamMetadata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mContainerMetadata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/_core/_metadata.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m from torchcodec._core.ops import (\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0m_get_container_json_metadata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0m_get_stream_json_metadata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mexpose_ffmpeg_dlls\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 109\u001b[0;31m \u001b[0mffmpeg_major_version\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcore_library_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_torchcodec_shared_libraries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 110\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\u001b[0m in \u001b[0;36mload_torchcodec_shared_libraries\u001b[0;34m()\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"[end of libtorchcodec loading traceback].\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m )\n\u001b[0;32m---> 76\u001b[0;31m raise RuntimeError(\n\u001b[0m\u001b[1;32m 77\u001b[0m f\"\"\"Could not load libtorchcodec. Likely causes:\n\u001b[1;32m 78\u001b[0m \u001b[0;36m1.\u001b[0m \u001b[0mFFmpeg\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mproperly\u001b[0m \u001b[0minstalled\u001b[0m \u001b[0;32min\u001b[0m \u001b[0myour\u001b[0m \u001b[0menvironment\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mWe\u001b[0m \u001b[0msupport\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: Could not load libtorchcodec. Likely causes:\n 1. FFmpeg is not properly installed in your environment. We support\n versions 4, 5, 6, 7, and 8, and we attempt to load libtorchcodec\n for each of those versions. Errors for versions not installed on\n your system are expected; only the error for your installed FFmpeg\n version is relevant. On Windows, ensure you've installed the\n \"full-shared\" version which ships DLLs.\n 2. The PyTorch version (2.9.0+cu126) is not compatible with\n this version of TorchCodec. Refer to the version compatibility\n table:\n https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.\n 3. 
Another runtime dependency; see exceptions below.\n\n The following exceptions were raised as we tried to load libtorchcodec:\n \n[start of libtorchcodec loading traceback]\nFFmpeg version 8:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.60: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core8.so\n\nFFmpeg version 7:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.59: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core7.so\n\nFFmpeg version 6:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.58: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core6.so\n\nFFmpeg version 5:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.57: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File 
\"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core5.so\n\nFFmpeg version 4:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core4.so: undefined symbol: _ZN3c1013MessageLogger6streamB5cxx11Ev\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core4.so\n[end of libtorchcodec loading traceback]." ] } ] @@ -1026,25 +1114,80 @@ { "cell_type": "code", "source": [ - "stop" + "\n", + "# -----------------------------------------------------------------------------\n", + "# CELLULE 3: Exemples d'utilisation\n", + "# -----------------------------------------------------------------------------\n", + "\n", + "# Montage Google Drive (optionnel)\n", + "# mount_gdrive()\n", + "\n", + "# Liste des voix disponibles\n", + "print(\"Voix disponibles:\", list_voices())\n", + "\n", + "# Génération simple\n", + "result = text_to_speech(text_to_speech_to_synthetise,voice=voice_gender)\n", + "\n", + "# Prévisualisation\n", + "preview_audio(result)" ], "metadata": { - "id": "Naxv1wHEp6NQ" + "colab": { + "base_uri": "https://localhost:8080/", + "height": 228 + }, + "id": "2565nagRK0eb", + "outputId": "caca5354-0b04-44f3-e42a-23376259b47b" }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Voix disponibles: ['female_fr', 'male_fr']\n" + ] + }, + { + "output_type": "error", + "ename": "NameError", + "evalue": "name 'text_to_speech_to_synthetise' is not defined", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipython-input-2666739373.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;31m# Génération simple\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtext_to_speech\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext_to_speech_to_synthetise\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mvoice\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvoice_gender\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# 
Prévisualisation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'text_to_speech_to_synthetise' is not defined" + ] + } + ] + }, + { + "cell_type": "code", "execution_count": null, - "outputs": [] + "metadata": { + "id": "Naxv1wHEp6NQ" + }, + "outputs": [], + "source": [ + "stop" + ] }, { "cell_type": "code", - "source": [], + "execution_count": null, "metadata": { "id": "ulSK6K1op63B" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BRhVnXgSE7Yd" + }, + "outputs": [], "source": [ "# Lire le fichier\n", "with open(\"mon_texte_long.txt\", \"r\", encoding=\"utf-8\") as f:\n", @@ -1056,30 +1199,43 @@ " voice=\"female_fr\",\n", " language=\"fr\"\n", ")" - ], - "metadata": { - "id": "BRhVnXgSE7Yd" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [], + "execution_count": null, "metadata": { "id": "4o0EdnBHp7la" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] }, { "cell_type": "code", - "source": [], + "execution_count": null, "metadata": { "id": "KaWW0-DIMy7R" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "authorship_tag": "ABX9TyMWr0Dv6vrMdJNJnihMF5Pg", + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" } - ] + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file From a2f9d1a206fc4d9beb34c0c1008933938bbdf349 Mon Sep 17 00:00:00 2001 From: brunombo Date: Mon, 2 Feb 2026 16:09:53 +0100 Subject: [PATCH 5/9] =?UTF-8?q?Cr=C3=A9=C3=A9=20=C3=A0=20l'aide=20de=20Col?= =?UTF-8?q?ab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- long_TTS_xtts_V6.ipynb | 1371 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1371 insertions(+) create mode 100644 long_TTS_xtts_V6.ipynb diff --git a/long_TTS_xtts_V6.ipynb b/long_TTS_xtts_V6.ipynb new file mode 100644 index 000000000000..2a5fe44be33a --- /dev/null +++ b/long_TTS_xtts_V6.ipynb @@ -0,0 +1,1371 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "header" + }, + "source": [ + "# TTS XTTS v2 - Long Audio Generator v4\n", + "\n", + "**Version 4.0** - Compatible PyTorch 2.9+ (Colab 2026)\n", + "\n", + "Fonctionnalites:\n", + "- Generation audio longue duree (> 1 heure)\n", + "- Fix torchcodec/torchaudio pour PyTorch 2.9+\n", + "- Chunking intelligent par paragraphes\n", + "- Crossfade entre chunks\n", + "- Barre de progression avec ETA\n", + "- Support Google Drive\n", + "\n", + "**Auteur:** Bruno | **Corrections:** Gemini, Claude" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "install_deps", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "55cd944b-c5fa-498c-fb32-78d151e5787e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/1.8 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m 
\u001b[31m81.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for docopt (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "torch = 2.9.0+cu126 -> installation de torchcodec==0.9.* via https://download.pytorch.org/whl/cu126\n", + "torchcodec detecte: True\n", + "torchcodec version: 0.9.1+cu126\n" + ] + } + ], + "source": [ + "# Installation des dependances\n", + "# --------------------------------------------------------------\n", + "# Remarque importante (PyTorch>=2.9) :\n", + "# - Coqui TTS exige la bibliotheque `torchcodec` pour l'I/O audio. (cf. message d'erreur)\n", + "# - La version de torchcodec doit etre compatible avec votre version de torch.\n", + "#\n", + "# Sources (documentation officielle) :\n", + "# - coqui-tts: installer torch, torchaudio et (seulement pour torch>=2.9) torchcodec.\n", + "# - torchcodec: table de compatibilite torch <-> torchcodec + note CUDA/CPU.\n", + "\n", + "!pip install -q -U pip\n", + "!pip install -q numpy==2.0.2 scipy soundfile noisereduce\n", + "!pip install -q -U coqui-tts\n", + "\n", + "# Installer torchcodec dans une version compatible avec torch (et CUDA si detecte)\n", + "import sys, subprocess, re\n", + "\n", + "try:\n", + " import torch\n", + "except Exception as e:\n", + " raise RuntimeError(\n", + " \"PyTorch (torch) n'est pas importable. Installez d'abord torch/torchaudio, \"\n", + " \"puis relancez cette cellule.\"\n", + " ) from e\n", + "\n", + "def _torch_major_minor(ver: str) -> str:\n", + " base = ver.split(\"+\")[0]\n", + " parts = base.split(\".\")\n", + " return \".\".join(parts[:2]) if len(parts) >= 2 else base\n", + "\n", + "torch_ver = torch.__version__\n", + "mm = _torch_major_minor(torch_ver)\n", + "\n", + "# Mapping base sur la table de compatibilite officielle torchcodec.\n", + "if mm == \"2.10\":\n", + " torchcodec_spec = \"torchcodec==0.10.*\"\n", + "elif mm == \"2.9\":\n", + " torchcodec_spec = \"torchcodec==0.9.*\"\n", + "elif mm == \"2.8\":\n", + " torchcodec_spec = \"torchcodec==0.7.*\"\n", + "else:\n", + " torchcodec_spec = \"torchcodec\"\n", + "\n", + "# Si votre torch est un build CUDA (ex: 2.9.0+cu126), on tente d'installer torchcodec\n", + "# depuis l'index PyTorch correspondant. Sinon, on installe la version CPU depuis PyPI.\n", + "index_url = None\n", + "if \"+\" in torch_ver:\n", + " build = torch_ver.split(\"+\", 1)[1]\n", + " if build.startswith(\"cu\"):\n", + " index_url = f\"https://download.pytorch.org/whl/{build}\"\n", + "\n", + "print(f\"torch = {torch_ver} -> installation de {torchcodec_spec}\" + (f\" via {index_url}\" if index_url else \" (CPU PyPI)\"))\n", + "\n", + "def _pip_install_torchcodec():\n", + " cmd = [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-U\", torchcodec_spec]\n", + " if index_url:\n", + " cmd += [\"--index-url\", index_url]\n", + " subprocess.check_call(cmd)\n", + "\n", + "try:\n", + " _pip_install_torchcodec()\n", + "except Exception as e:\n", + " # Fallback : essayer sans index_url (CPU PyPI).\n", + " if index_url:\n", + " print(f\"⚠️ Echec avec l'index PyTorch ({index_url}). 
Tentative CPU via PyPI…\")\n", + " index_url = None\n", + " _pip_install_torchcodec()\n", + " else:\n", + " raise\n", + "\n", + "# Verification (metadonnees pip)\n", + "import importlib.util, importlib.metadata\n", + "print(\"torchcodec detecte:\", importlib.util.find_spec(\"torchcodec\") is not None)\n", + "print(\"torchcodec version:\", importlib.metadata.version(\"torchcodec\"))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "main_module", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "f17c6fea-cd4e-438a-f628-070f6bfde7b3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "✓ Patch torchaudio applique (backend: soundfile)\n", + "⚙️ Device: cuda (Tesla T4)\n", + "\n", + "============================================================\n", + "TTS XTTS v2 - Long Audio Generator v4\n", + "Compatible PyTorch 2.9+ (fix torchcodec)\n", + "============================================================\n", + "Voix disponibles: ['female_fr', 'male_fr']\n" + ] + } + ], + "source": [ + "# -*- coding: utf-8 -*-\n", + "\"\"\"\n", + "TTS XTTS v2 - Version Long Audio v4\n", + "====================================\n", + "\n", + "Module de synthese vocale haute qualite utilisant Coqui XTTS v2.\n", + "Compatible avec PyTorch 2.9+ (fix torchcodec/torchaudio).\n", + "\n", + "Auteur: Bruno\n", + "Date: Janvier 2026\n", + "Corrections: Gemini, Claude\n", + "\"\"\"\n", + "\n", + "# ==============================================================================\n", + "# IMPORTS STANDARDS (APRES LE FIX)\n", + "# ==============================================================================\n", + "\n", + "import os\n", + "import re\n", + "import gc\n", + "import wave\n", + "import time\n", + "import hashlib\n", + "import warnings\n", + "import inspect\n", + "from pathlib import Path\n", + "from typing import Optional, List\n", + "from dataclasses import dataclass\n", + "import numpy as np\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", + "\n", + "# ==============================================================================\n", + "# TORCHAUDIO FIX - Backend soundfile\n", + "# ==============================================================================\n", + "\n", + "def _patch_torchaudio():\n", + " \"\"\"\n", + " Patch torchaudio.load pour utiliser le backend soundfile au lieu de torchcodec.\n", + " Resout l'erreur: \"Could not load libtorchcodec\" sur Colab avec PyTorch 2.9+.\n", + " \"\"\"\n", + " try:\n", + " import torchaudio\n", + "\n", + " # Verifier si deja patche\n", + " if hasattr(torchaudio, '_original_load_patched'):\n", + " return\n", + "\n", + " # Sauvegarder la fonction originale\n", + " _original_load = torchaudio.load\n", + "\n", + " def _patched_load(filepath, *args, **kwargs):\n", + " \"\"\"\n", + " Version patchee de torchaudio.load qui utilise soundfile comme backend.\n", + " \"\"\"\n", + " # Forcer le backend soundfile si non specifie\n", + " if 'backend' not in kwargs:\n", + " kwargs['backend'] = 'soundfile'\n", + "\n", + " try:\n", + " return _original_load(filepath, *args, **kwargs)\n", + " except Exception as e:\n", + " # Si soundfile echoue, essayer sans specifier de backend\n", + " if 'backend' in kwargs:\n", + " del kwargs['backend']\n", + " try:\n", + " return _original_load(filepath, *args, **kwargs)\n", + " except:\n", + " pass\n", + " raise e\n", + "\n", + " # Appliquer le patch\n", + " torchaudio.load = _patched_load\n", + " 
torchaudio._original_load_patched = True\n", + " print(\"✓ Patch torchaudio applique (backend: soundfile)\")\n", + "\n", + " except ImportError:\n", + " pass\n", + " except Exception as e:\n", + " print(f\"⚠️ Impossible de patcher torchaudio: {e}\")\n", + "\n", + "# Appliquer le patch torchaudio\n", + "_patch_torchaudio()\n", + "\n", + "# ==============================================================================\n", + "# CONFIGURATION\n", + "# ==============================================================================\n", + "\n", + "@dataclass\n", + "class TTSConfig:\n", + " \"\"\"Configuration globale du module TTS.\"\"\"\n", + " MODEL_NAME: str = \"tts_models/multilingual/multi-dataset/xtts_v2\"\n", + " SAMPLE_RATE: int = 24000\n", + " DEFAULT_LANGUAGE: str = \"fr\"\n", + " GDRIVE_FOLDER: str = \"/content/drive/MyDrive/TTS_Output\"\n", + " MAX_CHARS_PER_CHUNK: int = 500\n", + " CROSSFADE_DURATION: float = 0.05\n", + " ENABLE_TEXT_SPLITTING: bool = True\n", + " PRESET_VOICES: dict = None\n", + "\n", + " def __post_init__(self):\n", + " self.PRESET_VOICES = {\n", + " \"female_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/female.wav\",\n", + " \"male_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/male.wav\",\n", + " }\n", + "\n", + "Config = TTSConfig()\n", + "\n", + "# ==============================================================================\n", + "# DEVICE MANAGEMENT\n", + "# ==============================================================================\n", + "\n", + "_device = None\n", + "_device_name = \"cpu\"\n", + "\n", + "def detect_device():\n", + " \"\"\"Detecte le meilleur device disponible.\"\"\"\n", + " global _device, _device_name\n", + " import torch\n", + "\n", + " # Essayer TPU\n", + " try:\n", + " import torch_xla.core.xla_model as xm\n", + " _device = xm.xla_device()\n", + " _device_name = \"tpu\"\n", + " print(f\"⚙️ Device: TPU\")\n", + " return\n", + " except:\n", + " pass\n", + "\n", + " # Essayer CUDA\n", + " if torch.cuda.is_available():\n", + " _device = torch.device(\"cuda\")\n", + " _device_name = f\"cuda ({torch.cuda.get_device_name(0)})\"\n", + " print(f\"⚙️ Device: {_device_name}\")\n", + " return\n", + "\n", + " # Fallback CPU\n", + " _device = torch.device(\"cpu\")\n", + " _device_name = \"cpu\"\n", + " print(f\"⚙️ Device: CPU\")\n", + "\n", + "# ==============================================================================\n", + "# TEXT SPLITTING UTILITIES\n", + "# ==============================================================================\n", + "\n", + "class TextSplitter:\n", + " \"\"\"Utilitaire pour decouper intelligemment les textes longs.\"\"\"\n", + "\n", + " @staticmethod\n", + " def estimate_audio_duration(text: str, chars_per_second: float = 15.0) -> float:\n", + " \"\"\"Estime la duree audio en secondes.\"\"\"\n", + " return len(text) / chars_per_second\n", + "\n", + " @staticmethod\n", + " def split_into_sentences(text: str) -> List[str]:\n", + " \"\"\"Decoupe le texte en phrases.\"\"\"\n", + " pattern = r'(?<=[.!?])\\s+'\n", + " sentences = re.split(pattern, text)\n", + " return [s.strip() for s in sentences if s.strip()]\n", + "\n", + " @staticmethod\n", + " def split_into_paragraphs(text: str) -> List[str]:\n", + " \"\"\"Decoupe le texte en paragraphes.\"\"\"\n", + " paragraphs = re.split(r'\\n\\s*\\n', text)\n", + " return [p.strip() for p in paragraphs if p.strip()]\n", + "\n", + " @classmethod\n", + " def split_for_long_audio(cls, text: str, max_chars: int = 500, 
preserve_sentences: bool = True) -> List[str]:\n", + " \"\"\"Decoupe le texte pour generation audio longue.\"\"\"\n", + " if len(text) <= max_chars:\n", + " return [text]\n", + "\n", + " chunks = []\n", + " if preserve_sentences:\n", + " sentences = cls.split_into_sentences(text)\n", + " current_chunk = \"\"\n", + "\n", + " for sentence in sentences:\n", + " if len(sentence) > max_chars:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = \"\"\n", + " # Decouper la phrase trop longue par mots\n", + " words = sentence.split()\n", + " sub_chunk = \"\"\n", + " for word in words:\n", + " if len(sub_chunk) + len(word) + 1 <= max_chars:\n", + " sub_chunk += \" \" + word if sub_chunk else word\n", + " else:\n", + " if sub_chunk:\n", + " chunks.append(sub_chunk.strip())\n", + " sub_chunk = word\n", + " if sub_chunk:\n", + " current_chunk = sub_chunk\n", + " elif len(current_chunk) + len(sentence) + 1 <= max_chars:\n", + " current_chunk += \" \" + sentence if current_chunk else sentence\n", + " else:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = sentence\n", + "\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " else:\n", + " for i in range(0, len(text), max_chars):\n", + " chunks.append(text[i:i + max_chars])\n", + "\n", + " return chunks\n", + "\n", + "# ==============================================================================\n", + "# AUDIO PROCESSING\n", + "# ==============================================================================\n", + "\n", + "class AudioProcessor:\n", + " \"\"\"Processeur audio pour post-traitement et concatenation.\"\"\"\n", + "\n", + " @staticmethod\n", + " def normalize(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:\n", + " \"\"\"Normalise l'audio au niveau cible.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target_linear = 10 ** (target_db / 20)\n", + " audio = audio * (target_linear / peak)\n", + " return np.clip(audio, -1.0, 1.0)\n", + "\n", + " @staticmethod\n", + " def crossfade(audio1: np.ndarray, audio2: np.ndarray, sample_rate: int, duration: float = 0.05) -> np.ndarray:\n", + " \"\"\"Concatene deux segments audio avec crossfade.\"\"\"\n", + " if audio1.dtype == np.int16:\n", + " audio1 = audio1.astype(np.float32) / 32768.0\n", + " if audio2.dtype == np.int16:\n", + " audio2 = audio2.astype(np.float32) / 32768.0\n", + "\n", + " fade_samples = int(sample_rate * duration)\n", + " if len(audio1) < fade_samples or len(audio2) < fade_samples:\n", + " return np.concatenate([audio1, audio2])\n", + "\n", + " fade_out = np.linspace(1.0, 0.0, fade_samples)\n", + " fade_in = np.linspace(0.0, 1.0, fade_samples)\n", + " audio1_end = audio1[-fade_samples:] * fade_out\n", + " audio2_start = audio2[:fade_samples] * fade_in\n", + "\n", + " return np.concatenate([audio1[:-fade_samples], audio1_end + audio2_start, audio2[fade_samples:]])\n", + "\n", + " @classmethod\n", + " def concatenate_chunks(cls, audio_chunks: List[np.ndarray], sample_rate: int, crossfade_duration: float = 0.05) -> np.ndarray:\n", + " \"\"\"Concatene plusieurs chunks audio avec crossfade.\"\"\"\n", + " if not audio_chunks:\n", + " return np.array([], dtype=np.float32)\n", + " if len(audio_chunks) == 1:\n", + " audio = audio_chunks[0]\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " return audio\n", + "\n", + " 
result = audio_chunks[0]\n", + " if result.dtype == np.int16:\n", + " result = result.astype(np.float32) / 32768.0\n", + " for chunk in audio_chunks[1:]:\n", + " result = cls.crossfade(result, chunk, sample_rate, crossfade_duration)\n", + " return result\n", + "\n", + " @staticmethod\n", + " def enhance(audio: np.ndarray, sample_rate: int, normalize: bool = True, warmth: bool = True) -> np.ndarray:\n", + " \"\"\"Ameliore l'audio avec normalisation et warmth.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " # Ajouter de la chaleur (boost basses frequences)\n", + " if warmth:\n", + " try:\n", + " from scipy import signal\n", + " nyquist = sample_rate / 2\n", + " cutoff = min(300, nyquist * 0.9) / nyquist\n", + " b, a = signal.butter(2, cutoff, btype='low')\n", + " bass = signal.filtfilt(b, a, audio)\n", + " audio = audio + 0.15 * bass\n", + " except ImportError:\n", + " pass\n", + "\n", + " # Normaliser\n", + " if normalize:\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target = 10 ** (-3.0 / 20)\n", + " audio = audio * (target / peak)\n", + "\n", + " return np.clip(audio, -1.0, 1.0)\n", + "\n", + "# ==============================================================================\n", + "# PROGRESS TRACKER\n", + "# ==============================================================================\n", + "\n", + "class ProgressTracker:\n", + " \"\"\"Suivi de progression avec estimation du temps restant.\"\"\"\n", + "\n", + " def __init__(self, total: int, description: str = \"\"):\n", + " self.total = total\n", + " self.current = 0\n", + " self.description = description\n", + " self.start_time = time.time()\n", + " self.chunk_times = []\n", + "\n", + " def update(self, chunk_duration: float = None):\n", + " \"\"\"Met a jour la progression.\"\"\"\n", + " self.current += 1\n", + " if chunk_duration:\n", + " self.chunk_times.append(chunk_duration)\n", + " self._display()\n", + "\n", + " def _display(self):\n", + " \"\"\"Affiche la barre de progression.\"\"\"\n", + " elapsed = time.time() - self.start_time\n", + " percent = (self.current / self.total) * 100\n", + "\n", + " if self.chunk_times:\n", + " avg_time = np.mean(self.chunk_times)\n", + " remaining = avg_time * (self.total - self.current)\n", + " eta_str = self._format_time(remaining)\n", + " else:\n", + " eta_str = \"...\"\n", + "\n", + " bar_length = 30\n", + " filled = int(bar_length * self.current / self.total)\n", + " bar = \"█\" * filled + \"░\" * (bar_length - filled)\n", + " elapsed_str = self._format_time(elapsed)\n", + "\n", + " print(f\"\\r{self.description} [{bar}] {self.current}/{self.total} \"\n", + " f\"({percent:.1f}%) | Temps: {elapsed_str} | ETA: {eta_str}\", end=\"\")\n", + "\n", + " if self.current >= self.total:\n", + " print()\n", + "\n", + " @staticmethod\n", + " def _format_time(seconds: float) -> str:\n", + " \"\"\"Formate les secondes en HH:MM:SS.\"\"\"\n", + " hours = int(seconds // 3600)\n", + " minutes = int((seconds % 3600) // 60)\n", + " secs = int(seconds % 60)\n", + " if hours > 0:\n", + " return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n", + " return f\"{minutes:02d}:{secs:02d}\"\n", + "\n", + "# ==============================================================================\n", + "# TTS ENGINE\n", + "# ==============================================================================\n", + "\n", + "_tts_model = None\n", + "_voices_cache = {}\n", + "os.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n", + "\n", + "def get_model():\n", + " 
\"\"\"Charge le modele XTTS v2 avec cache.\"\"\"\n", + " global _tts_model\n", + "\n", + " if _tts_model is None:\n", + " print(\"🔄 Chargement du modele XTTS v2...\")\n", + "\n", + " from TTS.api import TTS\n", + " _tts_model = TTS(Config.MODEL_NAME)\n", + "\n", + " # Deplacement sur GPU (selon la version, .to() peut etre sur le wrapper ou sur le sous-modele)\n", + " if _device is not None and _device_name.startswith(\"cuda\"):\n", + " try:\n", + " if hasattr(_tts_model, \"to\"):\n", + " _tts_model = _tts_model.to(_device)\n", + " elif hasattr(_tts_model, \"tts_model\") and hasattr(_tts_model.tts_model, \"to\"):\n", + " _tts_model.tts_model = _tts_model.tts_model.to(_device)\n", + " except Exception as e:\n", + " print(f\"⚠️ Impossible de deplacer le modele sur CUDA: {e}\")\n", + "\n", + " print(\"✓ Modele charge\")\n", + "\n", + " return _tts_model\n", + "\n", + "\n", + "def get_voice_path(voice: str) -> str:\n", + " \"\"\"Obtient le chemin vers un fichier de voix.\"\"\"\n", + " global _voices_cache\n", + " import urllib.request\n", + "\n", + " if voice in _voices_cache:\n", + " return _voices_cache[voice]\n", + "\n", + " if os.path.isfile(voice):\n", + " _voices_cache[voice] = voice\n", + " return voice\n", + "\n", + " if voice in Config.PRESET_VOICES:\n", + " url = Config.PRESET_VOICES[voice]\n", + " path = f\"/tmp/{voice}.wav\"\n", + " if not os.path.exists(path):\n", + " print(f\"📥 Telechargement de la voix '{voice}'...\")\n", + " urllib.request.urlretrieve(url, path)\n", + " _voices_cache[voice] = path\n", + " return path\n", + "\n", + " raise FileNotFoundError(f\"Voix '{voice}' non trouvee\")\n", + "\n", + "# ==============================================================================\n", + "# MAIN SYNTHESIS FUNCTIONS\n", + "# ==============================================================================\n", + "\n", + "def _filter_kwargs(fn, kwargs: dict) -> dict:\n", + " \"\"\"Garde uniquement les kwargs acceptes par fn (compatibilite entre versions).\"\"\"\n", + " try:\n", + " sig = inspect.signature(fn)\n", + " return {k: v for k, v in kwargs.items() if k in sig.parameters}\n", + " except (TypeError, ValueError):\n", + " # Signature indisponible (ex: fonction C++) -> on ne filtre pas\n", + " return kwargs\n", + "\n", + "\n", + "def _get_conditioning_latents_compat(xtts_model, voice_path: str):\n", + " \"\"\"Compat: get_conditioning_latents() a change de signature selon les versions.\"\"\"\n", + " fn = getattr(xtts_model, \"get_conditioning_latents\", None)\n", + " if fn is None:\n", + " raise AttributeError(\"Le modele XTTS ne fournit pas get_conditioning_latents().\")\n", + "\n", + " base_kwargs = {\"gpt_cond_len\": 30, \"max_ref_length\": 60}\n", + "\n", + " # Tentative par introspection\n", + " try:\n", + " sig = inspect.signature(fn)\n", + " params = sig.parameters\n", + "\n", + " if \"audio_path\" in params:\n", + " # Certaines versions veulent une liste, d'autres une str\n", + " try:\n", + " return fn(audio_path=[voice_path], **_filter_kwargs(fn, base_kwargs))\n", + " except TypeError:\n", + " return fn(audio_path=voice_path, **_filter_kwargs(fn, base_kwargs))\n", + "\n", + " if \"audio_paths\" in params:\n", + " return fn(audio_paths=[voice_path], **_filter_kwargs(fn, base_kwargs))\n", + "\n", + " if \"speaker_wav\" in params:\n", + " return fn(speaker_wav=voice_path, **_filter_kwargs(fn, base_kwargs))\n", + "\n", + " except (TypeError, ValueError):\n", + " pass\n", + "\n", + " # Fallback brut (plus permissif)\n", + " try:\n", + " return fn(audio_path=[voice_path], 
gpt_cond_len=30, max_ref_length=60)\n", + " except Exception:\n", + " try:\n", + " return fn(audio_path=voice_path, gpt_cond_len=30, max_ref_length=60)\n", + " except Exception:\n", + " return fn(voice_path)\n", + "\n", + "\n", + "def synthesize_chunk(text: str, voice_path: str, language: str = \"fr\", enable_text_splitting: bool = True) -> np.ndarray:\n", + " \"\"\"Synthetise un chunk de texte en audio via l'inference directe.\"\"\"\n", + " model_wrapper = get_model()\n", + "\n", + " # Acceder au modele XTTS directement (bypass SpeakerManager bug)\n", + " if hasattr(model_wrapper, 'synthesizer'):\n", + " xtts_model = model_wrapper.synthesizer.tts_model\n", + " else:\n", + " xtts_model = model_wrapper.tts_model\n", + "\n", + " # Calculer les latents de conditionnement (compat multi-versions)\n", + " try:\n", + " gpt_cond_latent, speaker_embedding = _get_conditioning_latents_compat(xtts_model, voice_path)\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur calcul latents: {e}\")\n", + " raise e\n", + "\n", + " # Inference directe (filtrage des kwargs selon la signature)\n", + " try:\n", + " inference_kwargs = {\n", + " \"text\": text,\n", + " \"language\": language,\n", + " \"gpt_cond_latent\": gpt_cond_latent,\n", + " \"speaker_embedding\": speaker_embedding,\n", + " \"temperature\": 0.7,\n", + " \"length_penalty\": 1.0,\n", + " \"repetition_penalty\": 2.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 0.8,\n", + " \"enable_text_splitting\": enable_text_splitting,\n", + " }\n", + "\n", + " # Alias possibles selon versions\n", + " try:\n", + " sig = inspect.signature(xtts_model.inference)\n", + " params = sig.parameters\n", + " if \"speaker_embedding\" not in params and \"speaker_latents\" in params:\n", + " inference_kwargs[\"speaker_latents\"] = inference_kwargs.pop(\"speaker_embedding\")\n", + " except (TypeError, ValueError):\n", + " pass\n", + "\n", + " out = xtts_model.inference(**_filter_kwargs(xtts_model.inference, inference_kwargs))\n", + "\n", + " if isinstance(out, dict) and 'wav' in out:\n", + " wav = out['wav']\n", + " else:\n", + " wav = out\n", + "\n", + " if hasattr(wav, 'cpu'):\n", + " wav = wav.cpu().numpy()\n", + " if isinstance(wav, list):\n", + " wav = np.array(wav, dtype=np.float32)\n", + "\n", + " return wav\n", + "\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de l'inference directe : {e}\")\n", + " raise e\n", + "\n", + "\n", + "def text_to_speech_long(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " max_chars_per_chunk: int = None,\n", + " show_progress: bool = True,\n", + " enable_text_splitting: bool = True\n", + ") -> dict:\n", + " \"\"\"Genere un fichier audio long (> 1 heure) a partir de texte.\"\"\"\n", + " import torch\n", + "\n", + " max_chars = max_chars_per_chunk or Config.MAX_CHARS_PER_CHUNK\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Estimation\n", + " estimated_duration = TextSplitter.estimate_audio_duration(text)\n", + " print(f\"\\n📝 Texte: {len(text):,} caracteres\")\n", + " print(f\"⏱️ Duree estimee: {ProgressTracker._format_time(estimated_duration)}\")\n", + "\n", + " # Decoupage\n", + " chunks = TextSplitter.split_for_long_audio(text, max_chars=max_chars)\n", + " print(f\"📦 Chunks: {len(chunks)}\")\n", + "\n", + " # Synthese\n", + " progress = ProgressTracker(len(chunks), \"🎙️ Synthese\") if show_progress else None\n", + " 
audio_chunks = []\n", + "\n", + " for i, chunk in enumerate(chunks):\n", + " chunk_start = time.time()\n", + " try:\n", + " wav = synthesize_chunk(\n", + " text=chunk,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting\n", + " )\n", + " audio_chunks.append(wav)\n", + " except Exception as e:\n", + " print(f\"\\n⚠️ Erreur chunk {i+1}: {e}\")\n", + " continue\n", + "\n", + " # Nettoyage memoire periodique\n", + " if _device_name.startswith(\"cuda\") and (i + 1) % 10 == 0:\n", + " torch.cuda.empty_cache()\n", + "\n", + " if progress:\n", + " progress.update(time.time() - chunk_start)\n", + "\n", + " if not audio_chunks:\n", + " raise RuntimeError(\"Aucun audio genere\")\n", + "\n", + " # Concatenation\n", + " print(\"\\n🔗 Concatenation des chunks...\")\n", + " final_audio = AudioProcessor.concatenate_chunks(\n", + " audio_chunks, Config.SAMPLE_RATE, Config.CROSSFADE_DURATION\n", + " )\n", + "\n", + " # Nettoyage memoire\n", + " del audio_chunks\n", + " gc.collect()\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " print(\"✨ Post-traitement...\")\n", + " final_audio = AudioProcessor.enhance(\n", + " final_audio, Config.SAMPLE_RATE, normalize=True, warmth=True\n", + " )\n", + " else:\n", + " final_audio = AudioProcessor.normalize(final_audio)\n", + "\n", + " # Conversion en int16\n", + " final_audio = (final_audio * 32767).astype(np.int16)\n", + "\n", + " # Chemin de sortie\n", + " if output_path is None:\n", + " h = hashlib.md5(text[:100].encode()).hexdigest()[:8]\n", + " output_path = f\"tts_long_{voice}_{h}.wav\"\n", + "\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarde WAV\n", + " print(f\"💾 Sauvegarde: {final_path}\")\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(final_audio.tobytes())\n", + "\n", + " duration = len(final_audio) / Config.SAMPLE_RATE\n", + "\n", + " print(f\"\\n✅ Audio genere avec succes!\")\n", + " print(f\" 📁 Fichier: {final_path}\")\n", + " print(f\" ⏱️ Duree: {ProgressTracker._format_time(duration)}\")\n", + " print(f\" 📦 Chunks: {len(chunks)}\")\n", + " print(f\" 🎤 Voix: {voice}\")\n", + "\n", + " return {\n", + " 'path': str(final_path),\n", + " 'sample_rate': Config.SAMPLE_RATE,\n", + " 'duration_seconds': duration,\n", + " 'duration_formatted': ProgressTracker._format_time(duration),\n", + " 'audio_data': final_audio,\n", + " 'voice': voice,\n", + " 'language': language,\n", + " 'device': _device_name,\n", + " 'chunks_count': len(chunks),\n", + " 'text_length': len(text)\n", + " }\n", + "\n", + "\n", + "def text_to_speech(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " enable_text_splitting: bool = True\n", + ") -> dict:\n", + " \"\"\"Genere un fichier audio a partir de texte avec XTTS v2.\"\"\"\n", + " # Rediriger vers version longue si necessaire\n", + " if len(text) > 10000:\n", + " print(\"📢 Texte long detecte - utilisation de text_to_speech_long()\")\n", 
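+            "        # NOTE: bascule automatique vers le pipeline long au-dela de 10 000 caracteres:\n",
+            "        # decoupage en chunks, synthese iterative puis concatenation avec crossfade.\n",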
+ " return text_to_speech_long(\n", + " text=text, voice=voice, language=language, output_path=output_path,\n", + " enhance=enhance, use_gdrive=use_gdrive, gdrive_folder=gdrive_folder,\n", + " enable_text_splitting=enable_text_splitting\n", + " )\n", + "\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Synthese\n", + " wav = synthesize_chunk(\n", + " text=text,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting\n", + " )\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " audio = AudioProcessor.enhance(wav, Config.SAMPLE_RATE)\n", + " else:\n", + " audio = AudioProcessor.normalize(wav)\n", + "\n", + " audio = (audio * 32767).astype(np.int16)\n", + "\n", + " # Chemin de sortie\n", + " if output_path is None:\n", + " h = hashlib.md5(text.encode()).hexdigest()[:8]\n", + " output_path = f\"tts_{voice}_{h}.wav\"\n", + "\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarde WAV\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(audio.tobytes())\n", + "\n", + " duration = len(audio) / Config.SAMPLE_RATE\n", + " print(f\"✓ Audio genere: {final_path}\")\n", + " print(f\" Duree: {duration:.2f}s | Voix: {voice}\")\n", + "\n", + " return {\n", + " 'path': str(final_path),\n", + " 'sample_rate': Config.SAMPLE_RATE,\n", + " 'duration_seconds': duration,\n", + " 'audio_data': audio,\n", + " 'voice': voice,\n", + " 'language': language,\n", + " 'device': _device_name\n", + " }\n", + "\n", + "# ==============================================================================\n", + "# UTILITIES\n", + "# ==============================================================================\n", + "\n", + "def preview_audio(result: dict) -> None:\n", + " \"\"\"Previsualise l'audio dans le notebook.\"\"\"\n", + " from IPython.display import Audio, display\n", + " audio = result['audio_data']\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " display(Audio(audio, rate=result['sample_rate']))\n", + "\n", + "def list_voices() -> list:\n", + " \"\"\"Liste les voix disponibles.\"\"\"\n", + " return list(Config.PRESET_VOICES.keys())\n", + "\n", + "def list_languages() -> list:\n", + " \"\"\"Liste les langues supportees.\"\"\"\n", + " return [\"en\", \"es\", \"fr\", \"de\", \"it\", \"pt\", \"pl\", \"tr\", \"ru\", \"nl\", \"cs\", \"ar\", \"zh-cn\", \"ja\", \"hu\", \"ko\", \"hi\"]\n", + "\n", + "def clear_cache():\n", + " \"\"\"Vide le cache du modele.\"\"\"\n", + " global _tts_model\n", + " import torch\n", + " _tts_model = None\n", + " gc.collect()\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + " print(\"✓ Cache vide\")\n", + "\n", + "def estimate_duration(text: str) -> dict:\n", + " \"\"\"Estime la duree audio pour un texte.\"\"\"\n", + " duration = TextSplitter.estimate_audio_duration(text)\n", + " chunks = len(TextSplitter.split_for_long_audio(text))\n", + " return {\n", + " 'chars': len(text),\n", + " 'estimated_seconds': duration,\n", + " 'estimated_formatted': ProgressTracker._format_time(duration),\n", + " 'chunks_estimate': chunks\n", + " }\n", + "\n", + "# Aliases\n", + "tts = text_to_speech\n", + 
"tts_long = text_to_speech_long\n", + "\n", + "# ==============================================================================\n", + "# INITIALIZATION\n", + "# ==============================================================================\n", + "\n", + "def init():\n", + " \"\"\"Initialise le module.\"\"\"\n", + " detect_device()\n", + " print(\"✅ Module XTTS v2 Long Audio v4 charge\")\n", + " print(f\" Device: {_device_name}\")\n", + " print(f\" Voix disponibles: {list_voices()}\")\n", + " print(f\" enable_text_splitting: active par defaut\")\n", + " print(f\" Fix torchcodec: actif\")\n", + "\n", + "# Auto-init\n", + "try:\n", + " detect_device()\n", + "except:\n", + " pass\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"TTS XTTS v2 - Long Audio Generator v4\")\n", + "print(\"Compatible PyTorch 2.9+ (fix torchcodec)\")\n", + "print(\"=\"*60)\n", + "print(f\"Voix disponibles: {list_voices()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "mount_drive", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "483dc2dd-86ae-456f-86b5-e6f92e96f257" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "# Monter Google Drive (optionnel)\n", + "# Le notebook peut aussi s'executer hors Colab : dans ce cas on ignore simplement le montage.\n", + "\n", + "try:\n", + " from google.colab import drive # type: ignore\n", + " drive.mount('/content/drive')\n", + "except Exception as e:\n", + " print(\"ℹ️ Google Colab non detecte ou Drive indisponible -> montage ignore.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "init_module", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "33a7cc61-9054-4601-f338-aaa653bef831" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "⚙️ Device: cuda (Tesla T4)\n", + "✅ Module XTTS v2 Long Audio v4 charge\n", + " Device: cuda (Tesla T4)\n", + " Voix disponibles: ['female_fr', 'male_fr']\n", + " enable_text_splitting: active par defaut\n", + " Fix torchcodec: actif\n" + ] + } + ], + "source": [ + "# Initialisation du module\n", + "init()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "example_short", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "outputId": "577e3721-4fce-40f1-979b-79c5aef7cf2e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "📥 Telechargement de la voix 'female_fr'...\n", + "🔄 Chargement du modele XTTS v2...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 1.87G/1.87G [00:31<00:00, 59.9MiB/s]\n", + "4.37kiB [00:00, 7.24MiB/s]\n", + "361kiB [00:00, 107MiB/s]\n", + "100%|██████████| 32.0/32.0 [00:00<00:00, 91.7kiB/s]\n", + "100%|██████████| 7.75M/7.75M [00:00<00:00, 16.2MiB/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "✓ Modele charge\n", + "✓ Audio genere: tts_female_fr_3f623356.wav\n", + " Duree: 9.28s | Voix: female_fr\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + } + ], + "source": [ + "# ==============================================================================\n", + "# EXEMPLE 1: Texte court\n", + "# ==============================================================================\n", + "\n", + "text_court 
= \"\"\"\n", + "Bonjour! Ceci est un test de synthese vocale avec XTTS v2.\n", + "Le module est maintenant compatible avec PyTorch 2.9 et superieur.\n", + "\"\"\"\n", + "\n", + "# Choisir la voix\n", + "voice_gender = \"female_fr\" # ou \"male_fr\"\n", + "\n", + "# Generation\n", + "result = text_to_speech(text_court.strip(), voice=voice_gender, enhance=True)\n", + "\n", + "# Previsualisation\n", + "preview_audio(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "example_long", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1c25049d-6271-4864-d733-ef701a7b9a28" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "📊 Estimation:\n", + " Caracteres: 956\n", + " Duree estimee: 01:03\n", + " Chunks estimes: 2\n" + ] + } + ], + "source": [ + "# ==============================================================================\n", + "# EXEMPLE 2: Texte long\n", + "# ==============================================================================\n", + "\n", + "text_long = \"\"\"\n", + "La synthese vocale, egalement appelee text-to-speech ou TTS, est une technologie\n", + "qui permet de convertir du texte ecrit en parole audible. Cette technologie a\n", + "considerablement evolue au fil des annees, passant de voix robotiques et\n", + "mecaniques a des voix naturelles et expressives.\n", + "\n", + "XTTS v2 est l'un des modeles les plus avances dans ce domaine. Developpe par\n", + "Coqui AI, il utilise des techniques d'apprentissage profond pour generer une\n", + "parole de haute qualite dans plusieurs langues. Le modele peut meme cloner\n", + "des voix a partir d'un court echantillon audio de reference.\n", + "\n", + "Les applications de la synthese vocale sont nombreuses: assistants virtuels,\n", + "livres audio, accessibilite pour les personnes malvoyantes, doublage video,\n", + "et bien d'autres encore. 
Avec les avancees recentes en intelligence artificielle,\n", + "la qualite de la synthese vocale continue de s'ameliorer, rendant la distinction\n", + "entre voix humaine et voix synthetique de plus en plus difficile.\n", + "\"\"\"\n", + "\n", + "# Estimation avant generation\n", + "estimation = estimate_duration(text_long)\n", + "print(f\"📊 Estimation:\")\n", + "print(f\" Caracteres: {estimation['chars']:,}\")\n", + "print(f\" Duree estimee: {estimation['estimated_formatted']}\")\n", + "print(f\" Chunks estimes: {estimation['chunks_estimate']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "generate_long", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 130 + }, + "outputId": "5f0391f1-1deb-448f-c71c-528ee69050aa" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "🔄 Chargement du modele XTTS v2...\n", + "✓ Modele charge\n", + "✓ Audio genere: /content/drive/MyDrive/TTS_Output/tts_female_fr_28bb1d44.wav\n", + " Duree: 54.82s | Voix: female_fr\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + } + ], + "source": [ + "# Generation du texte long\n", + "result_long = text_to_speech(\n", + " text_long.strip(),\n", + " voice=\"female_fr\",\n", + " enhance=True,\n", + " use_gdrive=True # Mettre True pour sauvegarder sur Drive\n", + ")\n", + "\n", + "# Previsualisation\n", + "preview_audio(result_long)" + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "# ==========================================\n", + "# 3. INTERFACE UTILISATEUR\n", + "# ==========================================\n", + "\n", + "text_input = widgets.Textarea(\n", + " placeholder='Entrez votre texte ici...',\n", + " value=\"texte\",\n", + " layout=widgets.Layout(width='100%', height='150px')\n", + ")\n", + "button = widgets.Button(description='Générer Audio', button_style='success', icon='check')\n", + "progress_bar = widgets.IntProgress(value=0, min=0, max=100, layout=widgets.Layout(width='100%', visibility='hidden'))\n", + "output = widgets.Output()\n", + "\n", + "def on_click(b):\n", + " with output:\n", + " clear_output()\n", + " if not text_input.value.strip():\n", + " print(\"❌ Le texte est vide.\")\n", + " return\n", + "\n", + " button.disabled = True\n", + " progress_bar.layout.visibility = 'visible'\n", + " progress_bar.bar_style = 'info'\n", + " progress_bar.value = 10\n", + "\n", + " try:\n", + " # 1. Drive\n", + " if not os.path.exists(\"/content/drive\"): drive.mount('/content/drive')\n", + " if not os.path.exists(DRIVE_FOLDER): os.makedirs(DRIVE_FOLDER)\n", + " progress_bar.value = 20\n", + "\n", + " # 2. Gemini\n", + " print(\"🧠 Optimisation du texte (Gemini)...\")\n", + " res_ia = traiter_via_gemini_pour_elevenlabs(text_input.value)\n", + " titre = res_ia.get('titre', 'Audio_Output')\n", + " texte_final = res_ia.get('texte_optimise', text_input.value)\n", + " progress_bar.value = 50\n", + "\n", + " # 3. 
ElevenLabs\n", + " print(f\"🎙️ Génération avec Voix ID: {VOICE_ID} ({MODEL_ID})...\")\n", + " nom_fichier = f\"{assainir_nom_fichier(titre)}.mp3\"\n", + " chemin_complet = os.path.join(DRIVE_FOLDER, nom_fichier)\n", + "\n", + " if generer_audio_elevenlabs(texte_final, chemin_complet):\n", + " progress_bar.value = 100\n", + " progress_bar.bar_style = 'success'\n", + " print(f\"✅ Fichier sauvegardé : {chemin_complet}\")\n", + " display(Audio(chemin_complet))\n", + " else:\n", + " raise Exception(\"Erreur API ElevenLabs\")\n", + "\n", + " except Exception as e:\n", + " progress_bar.bar_style = 'danger'\n", + " print(f\"❌ Erreur : {e}\")\n", + " finally:\n", + " button.disabled = False\n", + "\n", + "button.on_click(on_click)\n", + "\n", + "# Affichage avec rappel de la config chargée\n", + "display(widgets.VBox([\n", + " widgets.HTML(f\"
<h3>Générateur TTS - Config : {MODEL_ID}</h3>
\"),\n", + " widgets.Label(f\"Voix chargée : {VOICE_ID} (Guillaume)\"),\n", + " text_input,\n", + " button,\n", + " progress_bar,\n", + " output\n", + "]))" + ], + "metadata": { + "id": "umnfhCzn2WGG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "custom_text", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 96 + }, + "outputId": "2a865273-027a-4958-9da2-7985d9a507d6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "✓ Audio genere: tts_female_fr_a37ea6fe.wav\n", + " Duree: 1.34s | Voix: female_fr\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + } + ], + "source": [ + "# ==============================================================================\n", + "# VOTRE TEXTE PERSONNALISE\n", + "# ==============================================================================\n", + "\n", + "# Entrez votre texte ici\n", + "text_to_synthesize = \"\"\"\n", + "Votre texte ici...\n", + "\"\"\"\n", + "\n", + "# Configuration\n", + "voice_gender = \"female_fr\" # \"female_fr\" ou \"male_fr\"\n", + "save_to_drive = False # True pour sauvegarder sur Google Drive\n", + "\n", + "# Generation\n", + "result = text_to_speech(\n", + " text_to_synthesize.strip(),\n", + " voice=voice_gender,\n", + " enhance=True,\n", + " use_gdrive=save_to_drive\n", + ")\n", + "\n", + "# Previsualisation\n", + "preview_audio(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "cleanup", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "6a7d6774-2254-4715-a5be-ea378f6a3eee" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "✓ Cache vide\n" + ] + } + ], + "source": [ + "# Nettoyer le cache si necessaire (libere la memoire GPU)\n", + "clear_cache()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From a5537ef2e4ed2395eecfe882f5f39e610f7a77dd Mon Sep 17 00:00:00 2001 From: brunombo Date: Thu, 12 Feb 2026 14:19:28 +0100 Subject: [PATCH 6/9] build OK claude --- long_TTS_xtts_v3.ipynb | 260 +++++++++++++++++------------------------ 1 file changed, 104 insertions(+), 156 deletions(-) diff --git a/long_TTS_xtts_v3.ipynb b/long_TTS_xtts_v3.ipynb index 1c050d7c06b0..fc0ac929cb20 100644 --- a/long_TTS_xtts_v3.ipynb +++ b/long_TTS_xtts_v3.ipynb @@ -35,9 +35,29 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "id": "jIKtDA5hweJP" + "id": "jIKtDA5hweJP", + "outputId": "8b9cbf18-4496-4c26-d2c8-50b97a3710d2", + "colab": { + "base_uri": "https://localhost:8080/" + } }, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m862.8/862.8 kB\u001b[0m \u001b[31m57.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m345.1/345.1 kB\u001b[0m \u001b[31m25.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.2/56.2 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m997.3/997.3 kB\u001b[0m \u001b[31m62.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m648.4/648.4 kB\u001b[0m \u001b[31m55.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.5/163.5 kB\u001b[0m \u001b[31m18.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for docopt (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], "source": [ "# Installation des dépendances\n", "!pip install -q scipy noisereduce\n", @@ -61,13 +81,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1FQhKQ1IE4iX", - "outputId": "5216e20f-aaf1-46e7-f518-4d37a9ebc66e" + "outputId": "68446e3c-ac8f-474b-a329-dfa6b8d6aec9" }, "outputs": [ { @@ -921,8 +941,7 @@ "\n", " # Exemple avec texte court\n", " print(\"\\n\" + \"=\"*60)\n", - " print(\"EXEMPLE 1: Texte court\")\n", - "" + " print(\"EXEMPLE 1: Texte court\")\n" ] }, { @@ -937,7 +956,7 @@ "metadata": { "id": "FREsMU-QLEc4" }, - "execution_count": 8, + "execution_count": 4, "outputs": [] }, { @@ -962,152 +981,58 @@ "metadata": { "colab": { "base_uri": "https://localhost:8080/", - "height": 1000 + "height": 250 }, "id": "2Any3vzyK8zF", - "outputId": "5c0af666-6e37-4d50-9a96-5e7cecd8fb9d" + "outputId": "2d3ff69d-09bc-4334-f13f-7be141559de8" }, - "execution_count": 9, + "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Voix disponibles: ['female_fr', 'male_fr']\n", - "🔄 Chargement du modèle XTTS v2...\n", - "✓ Modèle chargé\n", - "⚠️ Erreur calcul latents: Could not load libtorchcodec. Likely causes:\n", - " 1. FFmpeg is not properly installed in your environment. We support\n", - " versions 4, 5, 6, 7, and 8, and we attempt to load libtorchcodec\n", - " for each of those versions. Errors for versions not installed on\n", - " your system are expected; only the error for your installed FFmpeg\n", - " version is relevant. On Windows, ensure you've installed the\n", - " \"full-shared\" version which ships DLLs.\n", - " 2. The PyTorch version (2.9.0+cu126) is not compatible with\n", - " this version of TorchCodec. Refer to the version compatibility\n", - " table:\n", - " https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.\n", - " 3. 
Another runtime dependency; see exceptions below.\n", - "\n", - " The following exceptions were raised as we tried to load libtorchcodec:\n", - " \n", - "[start of libtorchcodec loading traceback]\n", - "FFmpeg version 8:\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n", - " ctypes.CDLL(path)\n", - " File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n", - " self._handle = _dlopen(self._name, mode)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", - "OSError: libavutil.so.60: cannot open shared object file: No such file or directory\n", - "\n", - "The above exception was the direct cause of the following exception:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n", - " torch.ops.load_library(core_library_path)\n", - " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n", - " raise OSError(f\"Could not load this library: {path}\") from e\n", - "OSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core8.so\n", - "\n", - "FFmpeg version 7:\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n", - " ctypes.CDLL(path)\n", - " File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n", - " self._handle = _dlopen(self._name, mode)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", - "OSError: libavutil.so.59: cannot open shared object file: No such file or directory\n", - "\n", - "The above exception was the direct cause of the following exception:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n", - " torch.ops.load_library(core_library_path)\n", - " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n", - " raise OSError(f\"Could not load this library: {path}\") from e\n", - "OSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core7.so\n", - "\n", - "FFmpeg version 6:\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n", - " ctypes.CDLL(path)\n", - " File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n", - " self._handle = _dlopen(self._name, mode)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", - "OSError: libavutil.so.58: cannot open shared object file: No such file or directory\n", - "\n", - "The above exception was the direct cause of the following exception:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n", - " torch.ops.load_library(core_library_path)\n", - " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n", - " raise OSError(f\"Could not load this library: {path}\") from e\n", - "OSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core6.so\n", - "\n", - "FFmpeg version 5:\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n", - " ctypes.CDLL(path)\n", - 
" File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n", - " self._handle = _dlopen(self._name, mode)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", - "OSError: libavutil.so.57: cannot open shared object file: No such file or directory\n", - "\n", - "The above exception was the direct cause of the following exception:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n", - " torch.ops.load_library(core_library_path)\n", - " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n", - " raise OSError(f\"Could not load this library: {path}\") from e\n", - "OSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core5.so\n", - "\n", - "FFmpeg version 4:\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n", - " ctypes.CDLL(path)\n", - " File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n", - " self._handle = _dlopen(self._name, mode)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", - "OSError: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core4.so: undefined symbol: _ZN3c1013MessageLogger6streamB5cxx11Ev\n", - "\n", - "The above exception was the direct cause of the following exception:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n", - " torch.ops.load_library(core_library_path)\n", - " File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n", - " raise OSError(f\"Could not load this library: {path}\") from e\n", - "OSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core4.so\n", - "[end of libtorchcodec loading traceback].\n" + "📥 Téléchargement de la voix 'female_fr'...\n", + "🔄 Chargement du modèle XTTS v2...\n" ] }, { - "output_type": "error", - "ename": "RuntimeError", - "evalue": "Could not load libtorchcodec. Likely causes:\n 1. FFmpeg is not properly installed in your environment. We support\n versions 4, 5, 6, 7, and 8, and we attempt to load libtorchcodec\n for each of those versions. Errors for versions not installed on\n your system are expected; only the error for your installed FFmpeg\n version is relevant. On Windows, ensure you've installed the\n \"full-shared\" version which ships DLLs.\n 2. The PyTorch version (2.9.0+cu126) is not compatible with\n this version of TorchCodec. Refer to the version compatibility\n table:\n https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.\n 3. 
Another runtime dependency; see exceptions below.\n\n The following exceptions were raised as we tried to load libtorchcodec:\n \n[start of libtorchcodec loading traceback]\nFFmpeg version 8:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.60: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core8.so\n\nFFmpeg version 7:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.59: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core7.so\n\nFFmpeg version 6:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.58: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core6.so\n\nFFmpeg version 5:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.57: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File 
\"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core5.so\n\nFFmpeg version 4:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core4.so: undefined symbol: _ZN3c1013MessageLogger6streamB5cxx11Ev\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core4.so\n[end of libtorchcodec loading traceback].", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipython-input-1408888851.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;31m# Génération simple\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtext_to_speech\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext_to_speech_to_synthetise\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mvoice\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvoice_gender\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# Prévisualisation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/tmp/ipython-input-3651143534.py\u001b[0m in \u001b[0;36mtext_to_speech\u001b[0;34m(text, voice, language, output_path, enhance, use_gdrive, gdrive_folder, enable_text_splitting)\u001b[0m\n\u001b[1;32m 672\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 673\u001b[0m \u001b[0;31m# Générer l'audio avec enable_text_splitting\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 674\u001b[0;31m wav = synthesize_chunk(\n\u001b[0m\u001b[1;32m 675\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 676\u001b[0m \u001b[0mvoice_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvoice_path\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/tmp/ipython-input-3651143534.py\u001b[0m in \u001b[0;36msynthesize_chunk\u001b[0;34m(text, voice_path, language, enable_text_splitting)\u001b[0m\n\u001b[1;32m 465\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m 
\u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 466\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"⚠️ Erreur calcul latents: {e}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 467\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 468\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;31m# 3. Inférence directe\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/tmp/ipython-input-3651143534.py\u001b[0m in \u001b[0;36msynthesize_chunk\u001b[0;34m(text, voice_path, language, enable_text_splitting)\u001b[0m\n\u001b[1;32m 458\u001b[0m \u001b[0;31m# On transforme le fichier WAV en vecteurs mathématiques\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 459\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 460\u001b[0;31m gpt_cond_latent, speaker_embedding = xtts_model.get_conditioning_latents(\n\u001b[0m\u001b[1;32m 461\u001b[0m \u001b[0maudio_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mvoice_path\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 462\u001b[0m \u001b[0mgpt_cond_len\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py\u001b[0m in \u001b[0;36mdecorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdecorate_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mctx_factory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 120\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 121\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 122\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdecorate_context\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/TTS/tts/models/xtts.py\u001b[0m in \u001b[0;36mget_conditioning_latents\u001b[0;34m(self, audio_path, max_ref_length, gpt_cond_len, gpt_cond_chunk_len, librosa_trim_db, sound_norm_refs, load_sr)\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[0mspeaker_embedding\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mfile_path\u001b[0m \u001b[0;32min\u001b[0m \u001b[0maudio_paths\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 362\u001b[0;31m \u001b[0maudio\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mload_audio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mload_sr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 363\u001b[0m \u001b[0maudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maudio\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mload_sr\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mmax_ref_length\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msound_norm_refs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/TTS/tts/models/xtts.py\u001b[0m in \u001b[0;36mload_audio\u001b[0;34m(audiopath, sampling_rate)\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;31m# torchaudio should chose proper backend to load audio depending on platform\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m \u001b[0maudio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlsr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorchaudio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maudiopath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 85\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 86\u001b[0m \u001b[0;31m# stereo to mono if needed\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchaudio/__init__.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size, backend)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0mby\u001b[0m \u001b[0mTorchCodec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \"\"\"\n\u001b[0;32m---> 86\u001b[0;31m return load_with_torchcodec(\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0muri\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0mframe_offset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mframe_offset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchaudio/_torchcodec.py\u001b[0m in \u001b[0;36mload_with_torchcodec\u001b[0;34m(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size, backend)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;31m# Import torchcodec here to provide clear error if not available\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 81\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 82\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtorchcodec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecoders\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAudioDecoder\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 83\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mImportError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m 
raise ImportError(\n", - "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;31m# but that results in circular import.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0m_frame\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAudioSamples\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mFrame\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mFrameBatch\u001b[0m \u001b[0;31m# usort:skip # noqa\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdecoders\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoders\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msamplers\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransforms\u001b[0m \u001b[0;31m# noqa\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/decoders/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# LICENSE file in the root directory of this source tree.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_core\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAudioStreamMetadata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mVideoStreamMetadata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0m_audio_decoder\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAudioDecoder\u001b[0m \u001b[0;31m# noqa\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0m_decoder_utils\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mset_cuda_backend\u001b[0m \u001b[0;31m# noqa\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/_core/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m from ._metadata import (\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mAudioStreamMetadata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mContainerMetadata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/_core/_metadata.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m from torchcodec._core.ops import (\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0m_get_container_json_metadata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m 
\u001b[0m_get_stream_json_metadata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mexpose_ffmpeg_dlls\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 109\u001b[0;31m \u001b[0mffmpeg_major_version\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcore_library_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_torchcodec_shared_libraries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 110\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\u001b[0m in \u001b[0;36mload_torchcodec_shared_libraries\u001b[0;34m()\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"[end of libtorchcodec loading traceback].\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m )\n\u001b[0;32m---> 76\u001b[0;31m raise RuntimeError(\n\u001b[0m\u001b[1;32m 77\u001b[0m f\"\"\"Could not load libtorchcodec. Likely causes:\n\u001b[1;32m 78\u001b[0m \u001b[0;36m1.\u001b[0m \u001b[0mFFmpeg\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mproperly\u001b[0m \u001b[0minstalled\u001b[0m \u001b[0;32min\u001b[0m \u001b[0myour\u001b[0m \u001b[0menvironment\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mWe\u001b[0m \u001b[0msupport\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mRuntimeError\u001b[0m: Could not load libtorchcodec. Likely causes:\n 1. FFmpeg is not properly installed in your environment. We support\n versions 4, 5, 6, 7, and 8, and we attempt to load libtorchcodec\n for each of those versions. Errors for versions not installed on\n your system are expected; only the error for your installed FFmpeg\n version is relevant. On Windows, ensure you've installed the\n \"full-shared\" version which ships DLLs.\n 2. The PyTorch version (2.9.0+cu126) is not compatible with\n this version of TorchCodec. Refer to the version compatibility\n table:\n https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.\n 3. 
Another runtime dependency; see exceptions below.\n\n The following exceptions were raised as we tried to load libtorchcodec:\n \n[start of libtorchcodec loading traceback]\nFFmpeg version 8:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.60: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core8.so\n\nFFmpeg version 7:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.59: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core7.so\n\nFFmpeg version 6:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.58: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core6.so\n\nFFmpeg version 5:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: libavutil.so.57: cannot open shared object file: No such file or directory\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File 
\"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core5.so\n\nFFmpeg version 4:\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1488, in load_library\n ctypes.CDLL(path)\n File \"/usr/lib/python3.12/ctypes/__init__.py\", line 379, in __init__\n self._handle = _dlopen(self._name, mode)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\nOSError: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core4.so: undefined symbol: _ZN3c1013MessageLogger6streamB5cxx11Ev\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/usr/local/lib/python3.12/dist-packages/torchcodec/_core/ops.py\", line 57, in load_torchcodec_shared_libraries\n torch.ops.load_library(core_library_path)\n File \"/usr/local/lib/python3.12/dist-packages/torch/_ops.py\", line 1490, in load_library\n raise OSError(f\"Could not load this library: {path}\") from e\nOSError: Could not load this library: /usr/local/lib/python3.12/dist-packages/torchcodec/libtorchcodec_core4.so\n[end of libtorchcodec loading traceback]." + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 1.87G/1.87G [00:35<00:00, 52.2MiB/s]\n", + "4.37kiB [00:00, 5.36MiB/s]\n", + "361kiB [00:00, 98.7MiB/s]\n", + "100%|██████████| 32.0/32.0 [00:00<00:00, 63.3kiB/s]\n", + "100%|██████████| 7.75M/7.75M [00:00<00:00, 102MiB/s]\n" ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "✓ Modèle chargé\n", + "✓ Audio généré: tts_female_fr_a22d596a.wav\n", + " Durée: 61.17s | Voix: female_fr\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} } ] }, @@ -1133,11 +1058,10 @@ ], "metadata": { "colab": { - "base_uri": "https://localhost:8080/", - "height": 228 + "base_uri": "https://localhost:8080/" }, "id": "2565nagRK0eb", - "outputId": "caca5354-0b04-44f3-e42a-23376259b47b" + "outputId": "38fb657a-44c2-4ce0-c39b-3cebe22149d2" }, "execution_count": 6, "outputs": [ @@ -1145,29 +1069,53 @@ "output_type": "stream", "name": "stdout", "text": [ - "Voix disponibles: ['female_fr', 'male_fr']\n" + "Voix disponibles: ['female_fr', 'male_fr']\n", + "✓ Audio généré: tts_female_fr_a22d596a.wav\n", + " Durée: 63.25s | Voix: female_fr\n" ] }, { - "output_type": "error", - "ename": "NameError", - "evalue": "name 'text_to_speech_to_synthetise' is not defined", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipython-input-2666739373.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;31m# Génération simple\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mtext_to_speech\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext_to_speech_to_synthetise\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mvoice\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvoice_gender\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# Prévisualisation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'text_to_speech_to_synthetise' is not defined" - ] + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} } ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { - "id": "Naxv1wHEp6NQ" + "id": "Naxv1wHEp6NQ", + "outputId": "33d8bb87-0781-465a-b3d5-9b35f85de770", + "colab": { + "base_uri": "https://localhost:8080/" + } }, - "outputs": [], + "outputs": [ + { + "output_type": "error", + "ename": "NameError", + "evalue": "name 'stop' is not defined", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipython-input-3957423419.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstop\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'stop' is not defined" + ] + } + ], "source": [ "stop" ] From 506826f6f3ce77776d293375682116d090f2e7fc Mon Sep 17 00:00:00 2001 From: brunombo Date: Thu, 12 Feb 2026 15:23:18 +0100 Subject: [PATCH 7/9] =?UTF-8?q?Cr=C3=A9=C3=A9=20=C3=A0=20l'aide=20de=20Col?= =?UTF-8?q?ab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- long_TTS_xtts_V6.ipynb | 239 +++++++++++++++++++++-------------------- 1 file changed, 125 insertions(+), 114 deletions(-) diff --git a/long_TTS_xtts_V6.ipynb b/long_TTS_xtts_V6.ipynb index 2a5fe44be33a..f6208a1b90c0 100644 --- a/long_TTS_xtts_V6.ipynb +++ b/long_TTS_xtts_V6.ipynb @@ -16,7 +16,7 @@ "id": "header" }, "source": [ - "# TTS XTTS v2 - Long Audio Generator v4\n", + "# TTS XTTS v2 - Long Audio Generator v6\n", "\n", "**Version 4.0** - Compatible PyTorch 2.9+ (Colab 2026)\n", "\n", @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "id": "install_deps", "colab": { @@ -136,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "id": "main_module", "colab": { @@ -941,7 +941,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "mount_drive", "colab": { @@ -971,7 +971,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "id": "init_module", "colab": { @@ -1000,7 +1000,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "id": "example_short", "colab": { @@ -1076,112 +1076,6 @@ "preview_audio(result)" ] }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "example_long", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "1c25049d-6271-4864-d733-ef701a7b9a28" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "📊 Estimation:\n", - " Caracteres: 956\n", - " Duree estimee: 01:03\n", - " Chunks estimes: 2\n" - ] - } - ], - 
"source": [ - "# ==============================================================================\n", - "# EXEMPLE 2: Texte long\n", - "# ==============================================================================\n", - "\n", - "text_long = \"\"\"\n", - "La synthese vocale, egalement appelee text-to-speech ou TTS, est une technologie\n", - "qui permet de convertir du texte ecrit en parole audible. Cette technologie a\n", - "considerablement evolue au fil des annees, passant de voix robotiques et\n", - "mecaniques a des voix naturelles et expressives.\n", - "\n", - "XTTS v2 est l'un des modeles les plus avances dans ce domaine. Developpe par\n", - "Coqui AI, il utilise des techniques d'apprentissage profond pour generer une\n", - "parole de haute qualite dans plusieurs langues. Le modele peut meme cloner\n", - "des voix a partir d'un court echantillon audio de reference.\n", - "\n", - "Les applications de la synthese vocale sont nombreuses: assistants virtuels,\n", - "livres audio, accessibilite pour les personnes malvoyantes, doublage video,\n", - "et bien d'autres encore. Avec les avancees recentes en intelligence artificielle,\n", - "la qualite de la synthese vocale continue de s'ameliorer, rendant la distinction\n", - "entre voix humaine et voix synthetique de plus en plus difficile.\n", - "\"\"\"\n", - "\n", - "# Estimation avant generation\n", - "estimation = estimate_duration(text_long)\n", - "print(f\"📊 Estimation:\")\n", - "print(f\" Caracteres: {estimation['chars']:,}\")\n", - "print(f\" Duree estimee: {estimation['estimated_formatted']}\")\n", - "print(f\" Chunks estimes: {estimation['chunks_estimate']}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "generate_long", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 130 - }, - "outputId": "5f0391f1-1deb-448f-c71c-528ee69050aa" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "🔄 Chargement du modele XTTS v2...\n", - "✓ Modele charge\n", - "✓ Audio genere: /content/drive/MyDrive/TTS_Output/tts_female_fr_28bb1d44.wav\n", - " Duree: 54.82s | Voix: female_fr\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - " \n", - " " - ] - }, - "metadata": {} - } - ], - "source": [ - "# Generation du texte long\n", - "result_long = text_to_speech(\n", - " text_long.strip(),\n", - " voice=\"female_fr\",\n", - " enhance=True,\n", - " use_gdrive=True # Mettre True pour sauvegarder sur Drive\n", - ")\n", - "\n", - "# Previsualisation\n", - "preview_audio(result_long)" - ] - }, { "cell_type": "code", "source": [ @@ -1263,7 +1157,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "id": "custom_text", "colab": { @@ -1327,7 +1221,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "id": "cleanup", "colab": { @@ -1348,6 +1242,123 @@ "# Nettoyer le cache si necessaire (libere la memoire GPU)\n", "clear_cache()" ] + }, + { + "cell_type": "code", + "source": [ + "stop" + ], + "metadata": { + "id": "Y9x2A2JKO4rd" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "generate_long", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 130 + }, + "outputId": "5f0391f1-1deb-448f-c71c-528ee69050aa" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "🔄 Chargement du modele XTTS v2...\n", + "✓ Modele 
charge\n", + "✓ Audio genere: /content/drive/MyDrive/TTS_Output/tts_female_fr_28bb1d44.wav\n", + " Duree: 54.82s | Voix: female_fr\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {} + } + ], + "source": [ + "# Generation du texte long\n", + "result_long = text_to_speech(\n", + " text_long.strip(),\n", + " voice=\"female_fr\",\n", + " enhance=True,\n", + " use_gdrive=True # Mettre True pour sauvegarder sur Drive\n", + ")\n", + "\n", + "# Previsualisation\n", + "preview_audio(result_long)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "example_long", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1c25049d-6271-4864-d733-ef701a7b9a28" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "📊 Estimation:\n", + " Caracteres: 956\n", + " Duree estimee: 01:03\n", + " Chunks estimes: 2\n" + ] + } + ], + "source": [ + "# ==============================================================================\n", + "# EXEMPLE 2: Texte long\n", + "# ==============================================================================\n", + "\n", + "text_long = \"\"\"\n", + "La synthese vocale, egalement appelee text-to-speech ou TTS, est une technologie\n", + "qui permet de convertir du texte ecrit en parole audible. Cette technologie a\n", + "considerablement evolue au fil des annees, passant de voix robotiques et\n", + "mecaniques a des voix naturelles et expressives.\n", + "\n", + "XTTS v2 est l'un des modeles les plus avances dans ce domaine. Developpe par\n", + "Coqui AI, il utilise des techniques d'apprentissage profond pour generer une\n", + "parole de haute qualite dans plusieurs langues. Le modele peut meme cloner\n", + "des voix a partir d'un court echantillon audio de reference.\n", + "\n", + "Les applications de la synthese vocale sont nombreuses: assistants virtuels,\n", + "livres audio, accessibilite pour les personnes malvoyantes, doublage video,\n", + "et bien d'autres encore. Avec les avancees recentes en intelligence artificielle,\n", + "la qualite de la synthese vocale continue de s'ameliorer, rendant la distinction\n", + "entre voix humaine et voix synthetique de plus en plus difficile.\n", + "\"\"\"\n", + "\n", + "# Estimation avant generation\n", + "estimation = estimate_duration(text_long)\n", + "print(f\"📊 Estimation:\")\n", + "print(f\" Caracteres: {estimation['chars']:,}\")\n", + "print(f\" Duree estimee: {estimation['estimated_formatted']}\")\n", + "print(f\" Chunks estimes: {estimation['chunks_estimate']}\")" + ] } ], "metadata": { From 4c68a932c635d1dcd545b5c44ab50369ac63ab14 Mon Sep 17 00:00:00 2001 From: brunombo Date: Thu, 12 Feb 2026 15:56:32 +0100 Subject: [PATCH 8/9] =?UTF-8?q?Cr=C3=A9=C3=A9=20=C3=A0=20l'aide=20de=20Col?= =?UTF-8?q?ab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- long_TTS_xtts_V6.ipynb | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/long_TTS_xtts_V6.ipynb b/long_TTS_xtts_V6.ipynb index f6208a1b90c0..bc5a88c9121a 100644 --- a/long_TTS_xtts_V6.ipynb +++ b/long_TTS_xtts_V6.ipynb @@ -1076,6 +1076,21 @@ "preview_audio(result)" ] }, + { + "cell_type": "code", + "source": [ + "# 1. Importation de la bibliothèque principale\n", + "import ipywidgets as widgets\n", + "\n", + "# 2. 
Importation de la fonction d'affichage (optionnel mais recommandé pour la clarté)\n", + "from IPython.display import display\n" + ], + "metadata": { + "id": "VKh0LumVWdsK" + }, + "execution_count": null, + "outputs": [] + }, { "cell_type": "code", "source": [ From 1d7722cc5768a884a4d238a522e7077d6660ed72 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 22:30:20 +0000 Subject: [PATCH 9/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- long_TTS_xtts_V6.ipynb | 2819 +++++++++++++++++++++------------------- long_TTS_xtts_v3.ipynb | 2338 ++++++++++++++++----------------- 2 files changed, 2636 insertions(+), 2521 deletions(-) diff --git a/long_TTS_xtts_V6.ipynb b/long_TTS_xtts_V6.ipynb index bc5a88c9121a..7285e0566c00 100644 --- a/long_TTS_xtts_V6.ipynb +++ b/long_TTS_xtts_V6.ipynb @@ -1,1397 +1,1498 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "header" + }, + "source": [ + "# TTS XTTS v2 - Long Audio Generator v6\n", + "\n", + "**Version 4.0** - Compatible PyTorch 2.9+ (Colab 2026)\n", + "\n", + "Fonctionnalites:\n", + "- Generation audio longue duree (> 1 heure)\n", + "- Fix torchcodec/torchaudio pour PyTorch 2.9+\n", + "- Chunking intelligent par paragraphes\n", + "- Crossfade entre chunks\n", + "- Barre de progression avec ETA\n", + "- Support Google Drive\n", + "\n", + "**Auteur:** Bruno | **Corrections:** Gemini, Claude" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "install_deps", + "outputId": "55cd944b-c5fa-498c-fb32-78d151e5787e" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "header" - }, - "source": [ - "# TTS XTTS v2 - Long Audio Generator v6\n", - "\n", - "**Version 4.0** - Compatible PyTorch 2.9+ (Colab 2026)\n", - "\n", - "Fonctionnalites:\n", - "- Generation audio longue duree (> 1 heure)\n", - "- Fix torchcodec/torchaudio pour PyTorch 2.9+\n", - "- Chunking intelligent par paragraphes\n", - "- Crossfade entre chunks\n", - "- Barre de progression avec ETA\n", - "- Support Google Drive\n", - "\n", - "**Auteur:** Bruno | **Corrections:** Gemini, Claude" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/1.8 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m81.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for docopt (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "torch = 2.9.0+cu126 -> installation de torchcodec==0.9.* via https://download.pytorch.org/whl/cu126\n", + "torchcodec detecte: True\n", + "torchcodec version: 0.9.1+cu126\n" + ] + } + ], + "source": [ + "# Installation des dependances\n", + "# --------------------------------------------------------------\n", + "# Remarque importante (PyTorch>=2.9) :\n", + "# - Coqui TTS exige la bibliotheque `torchcodec` pour l'I/O audio. (cf. message d'erreur)\n", + "# - La version de torchcodec doit etre compatible avec votre version de torch.\n", + "#\n", + "# Sources (documentation officielle) :\n", + "# - coqui-tts: installer torch, torchaudio et (seulement pour torch>=2.9) torchcodec.\n", + "# - torchcodec: table de compatibilite torch <-> torchcodec + note CUDA/CPU.\n", + "\n", + "!pip install -q -U pip\n", + "!pip install -q numpy==2.0.2 scipy soundfile noisereduce\n", + "!pip install -q -U coqui-tts\n", + "\n", + "# Installer torchcodec dans une version compatible avec torch (et CUDA si detecte)\n", + "import sys, subprocess, re\n", + "\n", + "try:\n", + " import torch\n", + "except Exception as e:\n", + " raise RuntimeError(\n", + " \"PyTorch (torch) n'est pas importable. Installez d'abord torch/torchaudio, \"\n", + " \"puis relancez cette cellule.\"\n", + " ) from e\n", + "\n", + "\n", + "def _torch_major_minor(ver: str) -> str:\n", + " base = ver.split(\"+\")[0]\n", + " parts = base.split(\".\")\n", + " return \".\".join(parts[:2]) if len(parts) >= 2 else base\n", + "\n", + "\n", + "torch_ver = torch.__version__\n", + "mm = _torch_major_minor(torch_ver)\n", + "\n", + "# Mapping base sur la table de compatibilite officielle torchcodec.\n", + "if mm == \"2.10\":\n", + " torchcodec_spec = \"torchcodec==0.10.*\"\n", + "elif mm == \"2.9\":\n", + " torchcodec_spec = \"torchcodec==0.9.*\"\n", + "elif mm == \"2.8\":\n", + " torchcodec_spec = \"torchcodec==0.7.*\"\n", + "else:\n", + " torchcodec_spec = \"torchcodec\"\n", + "\n", + "# Si votre torch est un build CUDA (ex: 2.9.0+cu126), on tente d'installer torchcodec\n", + "# depuis l'index PyTorch correspondant. Sinon, on installe la version CPU depuis PyPI.\n", + "index_url = None\n", + "if \"+\" in torch_ver:\n", + " build = torch_ver.split(\"+\", 1)[1]\n", + " if build.startswith(\"cu\"):\n", + " index_url = f\"https://download.pytorch.org/whl/{build}\"\n", + "\n", + "print(\n", + " f\"torch = {torch_ver} -> installation de {torchcodec_spec}\"\n", + " + (f\" via {index_url}\" if index_url else \" (CPU PyPI)\")\n", + ")\n", + "\n", + "\n", + "def _pip_install_torchcodec():\n", + " cmd = [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-U\", torchcodec_spec]\n", + " if index_url:\n", + " cmd += [\"--index-url\", index_url]\n", + " subprocess.check_call(cmd)\n", + "\n", + "\n", + "try:\n", + " _pip_install_torchcodec()\n", + "except Exception as e:\n", + " # Fallback : essayer sans index_url (CPU PyPI).\n", + " if index_url:\n", + " print(f\"⚠️ Echec avec l'index PyTorch ({index_url}). 
Tentative CPU via PyPI…\")\n", + " index_url = None\n", + " _pip_install_torchcodec()\n", + " else:\n", + " raise\n", + "\n", + "# Verification (metadonnees pip)\n", + "import importlib.util, importlib.metadata\n", + "\n", + "print(\"torchcodec detecte:\", importlib.util.find_spec(\"torchcodec\") is not None)\n", + "print(\"torchcodec version:\", importlib.metadata.version(\"torchcodec\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "main_module", + "outputId": "f17c6fea-cd4e-438a-f628-070f6bfde7b3" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "install_deps", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "55cd944b-c5fa-498c-fb32-78d151e5787e" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/1.8 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m81.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - " Building wheel for docopt (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - "torch = 2.9.0+cu126 -> installation de torchcodec==0.9.* via https://download.pytorch.org/whl/cu126\n", - "torchcodec detecte: True\n", - "torchcodec version: 0.9.1+cu126\n" - ] - } - ], - "source": [ - "# Installation des dependances\n", - "# --------------------------------------------------------------\n", - "# Remarque importante (PyTorch>=2.9) :\n", - "# - Coqui TTS exige la bibliotheque `torchcodec` pour l'I/O audio. (cf. message d'erreur)\n", - "# - La version de torchcodec doit etre compatible avec votre version de torch.\n", - "#\n", - "# Sources (documentation officielle) :\n", - "# - coqui-tts: installer torch, torchaudio et (seulement pour torch>=2.9) torchcodec.\n", - "# - torchcodec: table de compatibilite torch <-> torchcodec + note CUDA/CPU.\n", - "\n", - "!pip install -q -U pip\n", - "!pip install -q numpy==2.0.2 scipy soundfile noisereduce\n", - "!pip install -q -U coqui-tts\n", - "\n", - "# Installer torchcodec dans une version compatible avec torch (et CUDA si detecte)\n", - "import sys, subprocess, re\n", - "\n", - "try:\n", - " import torch\n", - "except Exception as e:\n", - " raise RuntimeError(\n", - " \"PyTorch (torch) n'est pas importable. 
Installez d'abord torch/torchaudio, \"\n", - " \"puis relancez cette cellule.\"\n", - " ) from e\n", - "\n", - "def _torch_major_minor(ver: str) -> str:\n", - " base = ver.split(\"+\")[0]\n", - " parts = base.split(\".\")\n", - " return \".\".join(parts[:2]) if len(parts) >= 2 else base\n", - "\n", - "torch_ver = torch.__version__\n", - "mm = _torch_major_minor(torch_ver)\n", - "\n", - "# Mapping base sur la table de compatibilite officielle torchcodec.\n", - "if mm == \"2.10\":\n", - " torchcodec_spec = \"torchcodec==0.10.*\"\n", - "elif mm == \"2.9\":\n", - " torchcodec_spec = \"torchcodec==0.9.*\"\n", - "elif mm == \"2.8\":\n", - " torchcodec_spec = \"torchcodec==0.7.*\"\n", - "else:\n", - " torchcodec_spec = \"torchcodec\"\n", - "\n", - "# Si votre torch est un build CUDA (ex: 2.9.0+cu126), on tente d'installer torchcodec\n", - "# depuis l'index PyTorch correspondant. Sinon, on installe la version CPU depuis PyPI.\n", - "index_url = None\n", - "if \"+\" in torch_ver:\n", - " build = torch_ver.split(\"+\", 1)[1]\n", - " if build.startswith(\"cu\"):\n", - " index_url = f\"https://download.pytorch.org/whl/{build}\"\n", - "\n", - "print(f\"torch = {torch_ver} -> installation de {torchcodec_spec}\" + (f\" via {index_url}\" if index_url else \" (CPU PyPI)\"))\n", - "\n", - "def _pip_install_torchcodec():\n", - " cmd = [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-U\", torchcodec_spec]\n", - " if index_url:\n", - " cmd += [\"--index-url\", index_url]\n", - " subprocess.check_call(cmd)\n", - "\n", - "try:\n", - " _pip_install_torchcodec()\n", - "except Exception as e:\n", - " # Fallback : essayer sans index_url (CPU PyPI).\n", - " if index_url:\n", - " print(f\"⚠️ Echec avec l'index PyTorch ({index_url}). Tentative CPU via PyPI…\")\n", - " index_url = None\n", - " _pip_install_torchcodec()\n", - " else:\n", - " raise\n", - "\n", - "# Verification (metadonnees pip)\n", - "import importlib.util, importlib.metadata\n", - "print(\"torchcodec detecte:\", importlib.util.find_spec(\"torchcodec\") is not None)\n", - "print(\"torchcodec version:\", importlib.metadata.version(\"torchcodec\"))\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Patch torchaudio applique (backend: soundfile)\n", + "⚙️ Device: cuda (Tesla T4)\n", + "\n", + "============================================================\n", + "TTS XTTS v2 - Long Audio Generator v4\n", + "Compatible PyTorch 2.9+ (fix torchcodec)\n", + "============================================================\n", + "Voix disponibles: ['female_fr', 'male_fr']\n" + ] + } + ], + "source": [ + "# -*- coding: utf-8 -*-\n", + "\"\"\"\n", + "TTS XTTS v2 - Version Long Audio v4\n", + "====================================\n", + "\n", + "Module de synthese vocale haute qualite utilisant Coqui XTTS v2.\n", + "Compatible avec PyTorch 2.9+ (fix torchcodec/torchaudio).\n", + "\n", + "Auteur: Bruno\n", + "Date: Janvier 2026\n", + "Corrections: Gemini, Claude\n", + "\"\"\"\n", + "\n", + "# ==============================================================================\n", + "# IMPORTS STANDARDS (APRES LE FIX)\n", + "# ==============================================================================\n", + "\n", + "import os\n", + "import re\n", + "import gc\n", + "import wave\n", + "import time\n", + "import hashlib\n", + "import warnings\n", + "import inspect\n", + "from pathlib import Path\n", + "from typing import Optional, List\n", + "from dataclasses import dataclass\n", + "import numpy as np\n", + "\n", + "warnings.filterwarnings(\"ignore\", 
category=UserWarning)\n", + "\n", + "# ==============================================================================\n", + "# TORCHAUDIO FIX - Backend soundfile\n", + "# ==============================================================================\n", + "\n", + "\n", + "def _patch_torchaudio():\n", + " \"\"\"\n", + " Patch torchaudio.load pour utiliser le backend soundfile au lieu de torchcodec.\n", + " Resout l'erreur: \"Could not load libtorchcodec\" sur Colab avec PyTorch 2.9+.\n", + " \"\"\"\n", + " try:\n", + " import torchaudio\n", + "\n", + " # Verifier si deja patche\n", + " if hasattr(torchaudio, \"_original_load_patched\"):\n", + " return\n", + "\n", + " # Sauvegarder la fonction originale\n", + " _original_load = torchaudio.load\n", + "\n", + " def _patched_load(filepath, *args, **kwargs):\n", + " \"\"\"\n", + " Version patchee de torchaudio.load qui utilise soundfile comme backend.\n", + " \"\"\"\n", + " # Forcer le backend soundfile si non specifie\n", + " if \"backend\" not in kwargs:\n", + " kwargs[\"backend\"] = \"soundfile\"\n", + "\n", + " try:\n", + " return _original_load(filepath, *args, **kwargs)\n", + " except Exception as e:\n", + " # Si soundfile echoue, essayer sans specifier de backend\n", + " if \"backend\" in kwargs:\n", + " del kwargs[\"backend\"]\n", + " try:\n", + " return _original_load(filepath, *args, **kwargs)\n", + " except:\n", + " pass\n", + " raise e\n", + "\n", + " # Appliquer le patch\n", + " torchaudio.load = _patched_load\n", + " torchaudio._original_load_patched = True\n", + " print(\"✓ Patch torchaudio applique (backend: soundfile)\")\n", + "\n", + " except ImportError:\n", + " pass\n", + " except Exception as e:\n", + " print(f\"⚠️ Impossible de patcher torchaudio: {e}\")\n", + "\n", + "\n", + "# Appliquer le patch torchaudio\n", + "_patch_torchaudio()\n", + "\n", + "# ==============================================================================\n", + "# CONFIGURATION\n", + "# ==============================================================================\n", + "\n", + "\n", + "@dataclass\n", + "class TTSConfig:\n", + " \"\"\"Configuration globale du module TTS.\"\"\"\n", + "\n", + " MODEL_NAME: str = \"tts_models/multilingual/multi-dataset/xtts_v2\"\n", + " SAMPLE_RATE: int = 24000\n", + " DEFAULT_LANGUAGE: str = \"fr\"\n", + " GDRIVE_FOLDER: str = \"/content/drive/MyDrive/TTS_Output\"\n", + " MAX_CHARS_PER_CHUNK: int = 500\n", + " CROSSFADE_DURATION: float = 0.05\n", + " ENABLE_TEXT_SPLITTING: bool = True\n", + " PRESET_VOICES: dict = None\n", + "\n", + " def __post_init__(self):\n", + " self.PRESET_VOICES = {\n", + " \"female_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/female.wav\",\n", + " \"male_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/male.wav\",\n", + " }\n", + "\n", + "\n", + "Config = TTSConfig()\n", + "\n", + "# ==============================================================================\n", + "# DEVICE MANAGEMENT\n", + "# ==============================================================================\n", + "\n", + "_device = None\n", + "_device_name = \"cpu\"\n", + "\n", + "\n", + "def detect_device():\n", + " \"\"\"Detecte le meilleur device disponible.\"\"\"\n", + " global _device, _device_name\n", + " import torch\n", + "\n", + " # Essayer TPU\n", + " try:\n", + " import torch_xla.core.xla_model as xm\n", + "\n", + " _device = xm.xla_device()\n", + " _device_name = \"tpu\"\n", + " print(f\"⚙️ Device: TPU\")\n", + " return\n", + " except:\n", + " pass\n", + "\n", + " # 
Essayer CUDA\n", + " if torch.cuda.is_available():\n", + " _device = torch.device(\"cuda\")\n", + " _device_name = f\"cuda ({torch.cuda.get_device_name(0)})\"\n", + " print(f\"⚙️ Device: {_device_name}\")\n", + " return\n", + "\n", + " # Fallback CPU\n", + " _device = torch.device(\"cpu\")\n", + " _device_name = \"cpu\"\n", + " print(f\"⚙️ Device: CPU\")\n", + "\n", + "\n", + "# ==============================================================================\n", + "# TEXT SPLITTING UTILITIES\n", + "# ==============================================================================\n", + "\n", + "\n", + "class TextSplitter:\n", + " \"\"\"Utilitaire pour decouper intelligemment les textes longs.\"\"\"\n", + "\n", + " @staticmethod\n", + " def estimate_audio_duration(text: str, chars_per_second: float = 15.0) -> float:\n", + " \"\"\"Estime la duree audio en secondes.\"\"\"\n", + " return len(text) / chars_per_second\n", + "\n", + " @staticmethod\n", + " def split_into_sentences(text: str) -> List[str]:\n", + " \"\"\"Decoupe le texte en phrases.\"\"\"\n", + " pattern = r\"(?<=[.!?])\\s+\"\n", + " sentences = re.split(pattern, text)\n", + " return [s.strip() for s in sentences if s.strip()]\n", + "\n", + " @staticmethod\n", + " def split_into_paragraphs(text: str) -> List[str]:\n", + " \"\"\"Decoupe le texte en paragraphes.\"\"\"\n", + " paragraphs = re.split(r\"\\n\\s*\\n\", text)\n", + " return [p.strip() for p in paragraphs if p.strip()]\n", + "\n", + " @classmethod\n", + " def split_for_long_audio(\n", + " cls, text: str, max_chars: int = 500, preserve_sentences: bool = True\n", + " ) -> List[str]:\n", + " \"\"\"Decoupe le texte pour generation audio longue.\"\"\"\n", + " if len(text) <= max_chars:\n", + " return [text]\n", + "\n", + " chunks = []\n", + " if preserve_sentences:\n", + " sentences = cls.split_into_sentences(text)\n", + " current_chunk = \"\"\n", + "\n", + " for sentence in sentences:\n", + " if len(sentence) > max_chars:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = \"\"\n", + " # Decouper la phrase trop longue par mots\n", + " words = sentence.split()\n", + " sub_chunk = \"\"\n", + " for word in words:\n", + " if len(sub_chunk) + len(word) + 1 <= max_chars:\n", + " sub_chunk += \" \" + word if sub_chunk else word\n", + " else:\n", + " if sub_chunk:\n", + " chunks.append(sub_chunk.strip())\n", + " sub_chunk = word\n", + " if sub_chunk:\n", + " current_chunk = sub_chunk\n", + " elif len(current_chunk) + len(sentence) + 1 <= max_chars:\n", + " current_chunk += \" \" + sentence if current_chunk else sentence\n", + " else:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = sentence\n", + "\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " else:\n", + " for i in range(0, len(text), max_chars):\n", + " chunks.append(text[i : i + max_chars])\n", + "\n", + " return chunks\n", + "\n", + "\n", + "# ==============================================================================\n", + "# AUDIO PROCESSING\n", + "# ==============================================================================\n", + "\n", + "\n", + "class AudioProcessor:\n", + " \"\"\"Processeur audio pour post-traitement et concatenation.\"\"\"\n", + "\n", + " @staticmethod\n", + " def normalize(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:\n", + " \"\"\"Normalise l'audio au niveau cible.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " peak = 
np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target_linear = 10 ** (target_db / 20)\n", + " audio = audio * (target_linear / peak)\n", + " return np.clip(audio, -1.0, 1.0)\n", + "\n", + " @staticmethod\n", + " def crossfade(\n", + " audio1: np.ndarray, audio2: np.ndarray, sample_rate: int, duration: float = 0.05\n", + " ) -> np.ndarray:\n", + " \"\"\"Concatene deux segments audio avec crossfade.\"\"\"\n", + " if audio1.dtype == np.int16:\n", + " audio1 = audio1.astype(np.float32) / 32768.0\n", + " if audio2.dtype == np.int16:\n", + " audio2 = audio2.astype(np.float32) / 32768.0\n", + "\n", + " fade_samples = int(sample_rate * duration)\n", + " if len(audio1) < fade_samples or len(audio2) < fade_samples:\n", + " return np.concatenate([audio1, audio2])\n", + "\n", + " fade_out = np.linspace(1.0, 0.0, fade_samples)\n", + " fade_in = np.linspace(0.0, 1.0, fade_samples)\n", + " audio1_end = audio1[-fade_samples:] * fade_out\n", + " audio2_start = audio2[:fade_samples] * fade_in\n", + "\n", + " return np.concatenate(\n", + " [audio1[:-fade_samples], audio1_end + audio2_start, audio2[fade_samples:]]\n", + " )\n", + "\n", + " @classmethod\n", + " def concatenate_chunks(\n", + " cls,\n", + " audio_chunks: List[np.ndarray],\n", + " sample_rate: int,\n", + " crossfade_duration: float = 0.05,\n", + " ) -> np.ndarray:\n", + " \"\"\"Concatene plusieurs chunks audio avec crossfade.\"\"\"\n", + " if not audio_chunks:\n", + " return np.array([], dtype=np.float32)\n", + " if len(audio_chunks) == 1:\n", + " audio = audio_chunks[0]\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " return audio\n", + "\n", + " result = audio_chunks[0]\n", + " if result.dtype == np.int16:\n", + " result = result.astype(np.float32) / 32768.0\n", + " for chunk in audio_chunks[1:]:\n", + " result = cls.crossfade(result, chunk, sample_rate, crossfade_duration)\n", + " return result\n", + "\n", + " @staticmethod\n", + " def enhance(\n", + " audio: np.ndarray, sample_rate: int, normalize: bool = True, warmth: bool = True\n", + " ) -> np.ndarray:\n", + " \"\"\"Ameliore l'audio avec normalisation et warmth.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " # Ajouter de la chaleur (boost basses frequences)\n", + " if warmth:\n", + " try:\n", + " from scipy import signal\n", + "\n", + " nyquist = sample_rate / 2\n", + " cutoff = min(300, nyquist * 0.9) / nyquist\n", + " b, a = signal.butter(2, cutoff, btype=\"low\")\n", + " bass = signal.filtfilt(b, a, audio)\n", + " audio = audio + 0.15 * bass\n", + " except ImportError:\n", + " pass\n", + "\n", + " # Normaliser\n", + " if normalize:\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target = 10 ** (-3.0 / 20)\n", + " audio = audio * (target / peak)\n", + "\n", + " return np.clip(audio, -1.0, 1.0)\n", + "\n", + "\n", + "# ==============================================================================\n", + "# PROGRESS TRACKER\n", + "# ==============================================================================\n", + "\n", + "\n", + "class ProgressTracker:\n", + " \"\"\"Suivi de progression avec estimation du temps restant.\"\"\"\n", + "\n", + " def __init__(self, total: int, description: str = \"\"):\n", + " self.total = total\n", + " self.current = 0\n", + " self.description = description\n", + " self.start_time = time.time()\n", + " self.chunk_times = []\n", + "\n", + " def update(self, chunk_duration: float = None):\n", + " \"\"\"Met a jour la 
progression.\"\"\"\n", + " self.current += 1\n", + " if chunk_duration:\n", + " self.chunk_times.append(chunk_duration)\n", + " self._display()\n", + "\n", + " def _display(self):\n", + " \"\"\"Affiche la barre de progression.\"\"\"\n", + " elapsed = time.time() - self.start_time\n", + " percent = (self.current / self.total) * 100\n", + "\n", + " if self.chunk_times:\n", + " avg_time = np.mean(self.chunk_times)\n", + " remaining = avg_time * (self.total - self.current)\n", + " eta_str = self._format_time(remaining)\n", + " else:\n", + " eta_str = \"...\"\n", + "\n", + " bar_length = 30\n", + " filled = int(bar_length * self.current / self.total)\n", + " bar = \"█\" * filled + \"░\" * (bar_length - filled)\n", + " elapsed_str = self._format_time(elapsed)\n", + "\n", + " print(\n", + " f\"\\r{self.description} [{bar}] {self.current}/{self.total} \"\n", + " f\"({percent:.1f}%) | Temps: {elapsed_str} | ETA: {eta_str}\",\n", + " end=\"\",\n", + " )\n", + "\n", + " if self.current >= self.total:\n", + " print()\n", + "\n", + " @staticmethod\n", + " def _format_time(seconds: float) -> str:\n", + " \"\"\"Formate les secondes en HH:MM:SS.\"\"\"\n", + " hours = int(seconds // 3600)\n", + " minutes = int((seconds % 3600) // 60)\n", + " secs = int(seconds % 60)\n", + " if hours > 0:\n", + " return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n", + " return f\"{minutes:02d}:{secs:02d}\"\n", + "\n", + "\n", + "# ==============================================================================\n", + "# TTS ENGINE\n", + "# ==============================================================================\n", + "\n", + "_tts_model = None\n", + "_voices_cache = {}\n", + "os.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n", + "\n", + "\n", + "def get_model():\n", + " \"\"\"Charge le modele XTTS v2 avec cache.\"\"\"\n", + " global _tts_model\n", + "\n", + " if _tts_model is None:\n", + " print(\"🔄 Chargement du modele XTTS v2...\")\n", + "\n", + " from TTS.api import TTS\n", + "\n", + " _tts_model = TTS(Config.MODEL_NAME)\n", + "\n", + " # Deplacement sur GPU (selon la version, .to() peut etre sur le wrapper ou sur le sous-modele)\n", + " if _device is not None and _device_name.startswith(\"cuda\"):\n", + " try:\n", + " if hasattr(_tts_model, \"to\"):\n", + " _tts_model = _tts_model.to(_device)\n", + " elif hasattr(_tts_model, \"tts_model\") and hasattr(\n", + " _tts_model.tts_model, \"to\"\n", + " ):\n", + " _tts_model.tts_model = _tts_model.tts_model.to(_device)\n", + " except Exception as e:\n", + " print(f\"⚠️ Impossible de deplacer le modele sur CUDA: {e}\")\n", + "\n", + " print(\"✓ Modele charge\")\n", + "\n", + " return _tts_model\n", + "\n", + "\n", + "def get_voice_path(voice: str) -> str:\n", + " \"\"\"Obtient le chemin vers un fichier de voix.\"\"\"\n", + " global _voices_cache\n", + " import urllib.request\n", + "\n", + " if voice in _voices_cache:\n", + " return _voices_cache[voice]\n", + "\n", + " if os.path.isfile(voice):\n", + " _voices_cache[voice] = voice\n", + " return voice\n", + "\n", + " if voice in Config.PRESET_VOICES:\n", + " url = Config.PRESET_VOICES[voice]\n", + " path = f\"/tmp/{voice}.wav\"\n", + " if not os.path.exists(path):\n", + " print(f\"📥 Telechargement de la voix '{voice}'...\")\n", + " urllib.request.urlretrieve(url, path)\n", + " _voices_cache[voice] = path\n", + " return path\n", + "\n", + " raise FileNotFoundError(f\"Voix '{voice}' non trouvee\")\n", + "\n", + "\n", + "# ==============================================================================\n", + "# MAIN SYNTHESIS 
FUNCTIONS\n", + "# ==============================================================================\n", + "\n", + "\n", + "def _filter_kwargs(fn, kwargs: dict) -> dict:\n", + " \"\"\"Garde uniquement les kwargs acceptes par fn (compatibilite entre versions).\"\"\"\n", + " try:\n", + " sig = inspect.signature(fn)\n", + " return {k: v for k, v in kwargs.items() if k in sig.parameters}\n", + " except (TypeError, ValueError):\n", + " # Signature indisponible (ex: fonction C++) -> on ne filtre pas\n", + " return kwargs\n", + "\n", + "\n", + "def _get_conditioning_latents_compat(xtts_model, voice_path: str):\n", + " \"\"\"Compat: get_conditioning_latents() a change de signature selon les versions.\"\"\"\n", + " fn = getattr(xtts_model, \"get_conditioning_latents\", None)\n", + " if fn is None:\n", + " raise AttributeError(\n", + " \"Le modele XTTS ne fournit pas get_conditioning_latents().\"\n", + " )\n", + "\n", + " base_kwargs = {\"gpt_cond_len\": 30, \"max_ref_length\": 60}\n", + "\n", + " # Tentative par introspection\n", + " try:\n", + " sig = inspect.signature(fn)\n", + " params = sig.parameters\n", + "\n", + " if \"audio_path\" in params:\n", + " # Certaines versions veulent une liste, d'autres une str\n", + " try:\n", + " return fn(audio_path=[voice_path], **_filter_kwargs(fn, base_kwargs))\n", + " except TypeError:\n", + " return fn(audio_path=voice_path, **_filter_kwargs(fn, base_kwargs))\n", + "\n", + " if \"audio_paths\" in params:\n", + " return fn(audio_paths=[voice_path], **_filter_kwargs(fn, base_kwargs))\n", + "\n", + " if \"speaker_wav\" in params:\n", + " return fn(speaker_wav=voice_path, **_filter_kwargs(fn, base_kwargs))\n", + "\n", + " except (TypeError, ValueError):\n", + " pass\n", + "\n", + " # Fallback brut (plus permissif)\n", + " try:\n", + " return fn(audio_path=[voice_path], gpt_cond_len=30, max_ref_length=60)\n", + " except Exception:\n", + " try:\n", + " return fn(audio_path=voice_path, gpt_cond_len=30, max_ref_length=60)\n", + " except Exception:\n", + " return fn(voice_path)\n", + "\n", + "\n", + "def synthesize_chunk(\n", + " text: str, voice_path: str, language: str = \"fr\", enable_text_splitting: bool = True\n", + ") -> np.ndarray:\n", + " \"\"\"Synthetise un chunk de texte en audio via l'inference directe.\"\"\"\n", + " model_wrapper = get_model()\n", + "\n", + " # Acceder au modele XTTS directement (bypass SpeakerManager bug)\n", + " if hasattr(model_wrapper, \"synthesizer\"):\n", + " xtts_model = model_wrapper.synthesizer.tts_model\n", + " else:\n", + " xtts_model = model_wrapper.tts_model\n", + "\n", + " # Calculer les latents de conditionnement (compat multi-versions)\n", + " try:\n", + " gpt_cond_latent, speaker_embedding = _get_conditioning_latents_compat(\n", + " xtts_model, voice_path\n", + " )\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur calcul latents: {e}\")\n", + " raise e\n", + "\n", + " # Inference directe (filtrage des kwargs selon la signature)\n", + " try:\n", + " inference_kwargs = {\n", + " \"text\": text,\n", + " \"language\": language,\n", + " \"gpt_cond_latent\": gpt_cond_latent,\n", + " \"speaker_embedding\": speaker_embedding,\n", + " \"temperature\": 0.7,\n", + " \"length_penalty\": 1.0,\n", + " \"repetition_penalty\": 2.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 0.8,\n", + " \"enable_text_splitting\": enable_text_splitting,\n", + " }\n", + "\n", + " # Alias possibles selon versions\n", + " try:\n", + " sig = inspect.signature(xtts_model.inference)\n", + " params = sig.parameters\n", + " if \"speaker_embedding\" not in 
params and \"speaker_latents\" in params:\n", + " inference_kwargs[\"speaker_latents\"] = inference_kwargs.pop(\n", + " \"speaker_embedding\"\n", + " )\n", + " except (TypeError, ValueError):\n", + " pass\n", + "\n", + " out = xtts_model.inference(\n", + " **_filter_kwargs(xtts_model.inference, inference_kwargs)\n", + " )\n", + "\n", + " if isinstance(out, dict) and \"wav\" in out:\n", + " wav = out[\"wav\"]\n", + " else:\n", + " wav = out\n", + "\n", + " if hasattr(wav, \"cpu\"):\n", + " wav = wav.cpu().numpy()\n", + " if isinstance(wav, list):\n", + " wav = np.array(wav, dtype=np.float32)\n", + "\n", + " return wav\n", + "\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de l'inference directe : {e}\")\n", + " raise e\n", + "\n", + "\n", + "def text_to_speech_long(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " max_chars_per_chunk: int = None,\n", + " show_progress: bool = True,\n", + " enable_text_splitting: bool = True,\n", + ") -> dict:\n", + " \"\"\"Genere un fichier audio long (> 1 heure) a partir de texte.\"\"\"\n", + " import torch\n", + "\n", + " max_chars = max_chars_per_chunk or Config.MAX_CHARS_PER_CHUNK\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Estimation\n", + " estimated_duration = TextSplitter.estimate_audio_duration(text)\n", + " print(f\"\\n📝 Texte: {len(text):,} caracteres\")\n", + " print(f\"⏱️ Duree estimee: {ProgressTracker._format_time(estimated_duration)}\")\n", + "\n", + " # Decoupage\n", + " chunks = TextSplitter.split_for_long_audio(text, max_chars=max_chars)\n", + " print(f\"📦 Chunks: {len(chunks)}\")\n", + "\n", + " # Synthese\n", + " progress = ProgressTracker(len(chunks), \"🎙️ Synthese\") if show_progress else None\n", + " audio_chunks = []\n", + "\n", + " for i, chunk in enumerate(chunks):\n", + " chunk_start = time.time()\n", + " try:\n", + " wav = synthesize_chunk(\n", + " text=chunk,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + " audio_chunks.append(wav)\n", + " except Exception as e:\n", + " print(f\"\\n⚠️ Erreur chunk {i + 1}: {e}\")\n", + " continue\n", + "\n", + " # Nettoyage memoire periodique\n", + " if _device_name.startswith(\"cuda\") and (i + 1) % 10 == 0:\n", + " torch.cuda.empty_cache()\n", + "\n", + " if progress:\n", + " progress.update(time.time() - chunk_start)\n", + "\n", + " if not audio_chunks:\n", + " raise RuntimeError(\"Aucun audio genere\")\n", + "\n", + " # Concatenation\n", + " print(\"\\n🔗 Concatenation des chunks...\")\n", + " final_audio = AudioProcessor.concatenate_chunks(\n", + " audio_chunks, Config.SAMPLE_RATE, Config.CROSSFADE_DURATION\n", + " )\n", + "\n", + " # Nettoyage memoire\n", + " del audio_chunks\n", + " gc.collect()\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " print(\"✨ Post-traitement...\")\n", + " final_audio = AudioProcessor.enhance(\n", + " final_audio, Config.SAMPLE_RATE, normalize=True, warmth=True\n", + " )\n", + " else:\n", + " final_audio = AudioProcessor.normalize(final_audio)\n", + "\n", + " # Conversion en int16\n", + " final_audio = (final_audio * 32767).astype(np.int16)\n", + "\n", + " # Chemin de sortie\n", + " if output_path is None:\n", + " h = hashlib.md5(text[:100].encode()).hexdigest()[:8]\n", + " 
output_path = f\"tts_long_{voice}_{h}.wav\"\n", + "\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarde WAV\n", + " print(f\"💾 Sauvegarde: {final_path}\")\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(final_audio.tobytes())\n", + "\n", + " duration = len(final_audio) / Config.SAMPLE_RATE\n", + "\n", + " print(f\"\\n✅ Audio genere avec succes!\")\n", + " print(f\" 📁 Fichier: {final_path}\")\n", + " print(f\" ⏱️ Duree: {ProgressTracker._format_time(duration)}\")\n", + " print(f\" 📦 Chunks: {len(chunks)}\")\n", + " print(f\" 🎤 Voix: {voice}\")\n", + "\n", + " return {\n", + " \"path\": str(final_path),\n", + " \"sample_rate\": Config.SAMPLE_RATE,\n", + " \"duration_seconds\": duration,\n", + " \"duration_formatted\": ProgressTracker._format_time(duration),\n", + " \"audio_data\": final_audio,\n", + " \"voice\": voice,\n", + " \"language\": language,\n", + " \"device\": _device_name,\n", + " \"chunks_count\": len(chunks),\n", + " \"text_length\": len(text),\n", + " }\n", + "\n", + "\n", + "def text_to_speech(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " enable_text_splitting: bool = True,\n", + ") -> dict:\n", + " \"\"\"Genere un fichier audio a partir de texte avec XTTS v2.\"\"\"\n", + " # Rediriger vers version longue si necessaire\n", + " if len(text) > 10000:\n", + " print(\"📢 Texte long detecte - utilisation de text_to_speech_long()\")\n", + " return text_to_speech_long(\n", + " text=text,\n", + " voice=voice,\n", + " language=language,\n", + " output_path=output_path,\n", + " enhance=enhance,\n", + " use_gdrive=use_gdrive,\n", + " gdrive_folder=gdrive_folder,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + "\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Synthese\n", + " wav = synthesize_chunk(\n", + " text=text,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " audio = AudioProcessor.enhance(wav, Config.SAMPLE_RATE)\n", + " else:\n", + " audio = AudioProcessor.normalize(wav)\n", + "\n", + " audio = (audio * 32767).astype(np.int16)\n", + "\n", + " # Chemin de sortie\n", + " if output_path is None:\n", + " h = hashlib.md5(text.encode()).hexdigest()[:8]\n", + " output_path = f\"tts_{voice}_{h}.wav\"\n", + "\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarde WAV\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(audio.tobytes())\n", + "\n", + " duration = len(audio) / Config.SAMPLE_RATE\n", + " print(f\"✓ Audio genere: {final_path}\")\n", + " print(f\" Duree: {duration:.2f}s | Voix: {voice}\")\n", + 
"\n", + " return {\n", + " \"path\": str(final_path),\n", + " \"sample_rate\": Config.SAMPLE_RATE,\n", + " \"duration_seconds\": duration,\n", + " \"audio_data\": audio,\n", + " \"voice\": voice,\n", + " \"language\": language,\n", + " \"device\": _device_name,\n", + " }\n", + "\n", + "\n", + "# ==============================================================================\n", + "# UTILITIES\n", + "# ==============================================================================\n", + "\n", + "\n", + "def preview_audio(result: dict) -> None:\n", + " \"\"\"Previsualise l'audio dans le notebook.\"\"\"\n", + " from IPython.display import Audio, display\n", + "\n", + " audio = result[\"audio_data\"]\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " display(Audio(audio, rate=result[\"sample_rate\"]))\n", + "\n", + "\n", + "def list_voices() -> list:\n", + " \"\"\"Liste les voix disponibles.\"\"\"\n", + " return list(Config.PRESET_VOICES.keys())\n", + "\n", + "\n", + "def list_languages() -> list:\n", + " \"\"\"Liste les langues supportees.\"\"\"\n", + " return [\n", + " \"en\",\n", + " \"es\",\n", + " \"fr\",\n", + " \"de\",\n", + " \"it\",\n", + " \"pt\",\n", + " \"pl\",\n", + " \"tr\",\n", + " \"ru\",\n", + " \"nl\",\n", + " \"cs\",\n", + " \"ar\",\n", + " \"zh-cn\",\n", + " \"ja\",\n", + " \"hu\",\n", + " \"ko\",\n", + " \"hi\",\n", + " ]\n", + "\n", + "\n", + "def clear_cache():\n", + " \"\"\"Vide le cache du modele.\"\"\"\n", + " global _tts_model\n", + " import torch\n", + "\n", + " _tts_model = None\n", + " gc.collect()\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + " print(\"✓ Cache vide\")\n", + "\n", + "\n", + "def estimate_duration(text: str) -> dict:\n", + " \"\"\"Estime la duree audio pour un texte.\"\"\"\n", + " duration = TextSplitter.estimate_audio_duration(text)\n", + " chunks = len(TextSplitter.split_for_long_audio(text))\n", + " return {\n", + " \"chars\": len(text),\n", + " \"estimated_seconds\": duration,\n", + " \"estimated_formatted\": ProgressTracker._format_time(duration),\n", + " \"chunks_estimate\": chunks,\n", + " }\n", + "\n", + "\n", + "# Aliases\n", + "tts = text_to_speech\n", + "tts_long = text_to_speech_long\n", + "\n", + "# ==============================================================================\n", + "# INITIALIZATION\n", + "# ==============================================================================\n", + "\n", + "\n", + "def init():\n", + " \"\"\"Initialise le module.\"\"\"\n", + " detect_device()\n", + " print(\"✅ Module XTTS v2 Long Audio v4 charge\")\n", + " print(f\" Device: {_device_name}\")\n", + " print(f\" Voix disponibles: {list_voices()}\")\n", + " print(f\" enable_text_splitting: active par defaut\")\n", + " print(f\" Fix torchcodec: actif\")\n", + "\n", + "\n", + "# Auto-init\n", + "try:\n", + " detect_device()\n", + "except:\n", + " pass\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"TTS XTTS v2 - Long Audio Generator v4\")\n", + "print(\"Compatible PyTorch 2.9+ (fix torchcodec)\")\n", + "print(\"=\" * 60)\n", + "print(f\"Voix disponibles: {list_voices()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "mount_drive", + "outputId": "483dc2dd-86ae-456f-86b5-e6f92e96f257" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "main_module", - "colab": { - "base_uri": "https://localhost:8080/" - }, - 
"outputId": "f17c6fea-cd4e-438a-f628-070f6bfde7b3" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "✓ Patch torchaudio applique (backend: soundfile)\n", - "⚙️ Device: cuda (Tesla T4)\n", - "\n", - "============================================================\n", - "TTS XTTS v2 - Long Audio Generator v4\n", - "Compatible PyTorch 2.9+ (fix torchcodec)\n", - "============================================================\n", - "Voix disponibles: ['female_fr', 'male_fr']\n" - ] - } - ], - "source": [ - "# -*- coding: utf-8 -*-\n", - "\"\"\"\n", - "TTS XTTS v2 - Version Long Audio v4\n", - "====================================\n", - "\n", - "Module de synthese vocale haute qualite utilisant Coqui XTTS v2.\n", - "Compatible avec PyTorch 2.9+ (fix torchcodec/torchaudio).\n", - "\n", - "Auteur: Bruno\n", - "Date: Janvier 2026\n", - "Corrections: Gemini, Claude\n", - "\"\"\"\n", - "\n", - "# ==============================================================================\n", - "# IMPORTS STANDARDS (APRES LE FIX)\n", - "# ==============================================================================\n", - "\n", - "import os\n", - "import re\n", - "import gc\n", - "import wave\n", - "import time\n", - "import hashlib\n", - "import warnings\n", - "import inspect\n", - "from pathlib import Path\n", - "from typing import Optional, List\n", - "from dataclasses import dataclass\n", - "import numpy as np\n", - "\n", - "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", - "\n", - "# ==============================================================================\n", - "# TORCHAUDIO FIX - Backend soundfile\n", - "# ==============================================================================\n", - "\n", - "def _patch_torchaudio():\n", - " \"\"\"\n", - " Patch torchaudio.load pour utiliser le backend soundfile au lieu de torchcodec.\n", - " Resout l'erreur: \"Could not load libtorchcodec\" sur Colab avec PyTorch 2.9+.\n", - " \"\"\"\n", - " try:\n", - " import torchaudio\n", - "\n", - " # Verifier si deja patche\n", - " if hasattr(torchaudio, '_original_load_patched'):\n", - " return\n", - "\n", - " # Sauvegarder la fonction originale\n", - " _original_load = torchaudio.load\n", - "\n", - " def _patched_load(filepath, *args, **kwargs):\n", - " \"\"\"\n", - " Version patchee de torchaudio.load qui utilise soundfile comme backend.\n", - " \"\"\"\n", - " # Forcer le backend soundfile si non specifie\n", - " if 'backend' not in kwargs:\n", - " kwargs['backend'] = 'soundfile'\n", - "\n", - " try:\n", - " return _original_load(filepath, *args, **kwargs)\n", - " except Exception as e:\n", - " # Si soundfile echoue, essayer sans specifier de backend\n", - " if 'backend' in kwargs:\n", - " del kwargs['backend']\n", - " try:\n", - " return _original_load(filepath, *args, **kwargs)\n", - " except:\n", - " pass\n", - " raise e\n", - "\n", - " # Appliquer le patch\n", - " torchaudio.load = _patched_load\n", - " torchaudio._original_load_patched = True\n", - " print(\"✓ Patch torchaudio applique (backend: soundfile)\")\n", - "\n", - " except ImportError:\n", - " pass\n", - " except Exception as e:\n", - " print(f\"⚠️ Impossible de patcher torchaudio: {e}\")\n", - "\n", - "# Appliquer le patch torchaudio\n", - "_patch_torchaudio()\n", - "\n", - "# ==============================================================================\n", - "# CONFIGURATION\n", - "# ==============================================================================\n", - "\n", - "@dataclass\n", - "class 
TTSConfig:\n", - " \"\"\"Configuration globale du module TTS.\"\"\"\n", - " MODEL_NAME: str = \"tts_models/multilingual/multi-dataset/xtts_v2\"\n", - " SAMPLE_RATE: int = 24000\n", - " DEFAULT_LANGUAGE: str = \"fr\"\n", - " GDRIVE_FOLDER: str = \"/content/drive/MyDrive/TTS_Output\"\n", - " MAX_CHARS_PER_CHUNK: int = 500\n", - " CROSSFADE_DURATION: float = 0.05\n", - " ENABLE_TEXT_SPLITTING: bool = True\n", - " PRESET_VOICES: dict = None\n", - "\n", - " def __post_init__(self):\n", - " self.PRESET_VOICES = {\n", - " \"female_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/female.wav\",\n", - " \"male_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/male.wav\",\n", - " }\n", - "\n", - "Config = TTSConfig()\n", - "\n", - "# ==============================================================================\n", - "# DEVICE MANAGEMENT\n", - "# ==============================================================================\n", - "\n", - "_device = None\n", - "_device_name = \"cpu\"\n", - "\n", - "def detect_device():\n", - " \"\"\"Detecte le meilleur device disponible.\"\"\"\n", - " global _device, _device_name\n", - " import torch\n", - "\n", - " # Essayer TPU\n", - " try:\n", - " import torch_xla.core.xla_model as xm\n", - " _device = xm.xla_device()\n", - " _device_name = \"tpu\"\n", - " print(f\"⚙️ Device: TPU\")\n", - " return\n", - " except:\n", - " pass\n", - "\n", - " # Essayer CUDA\n", - " if torch.cuda.is_available():\n", - " _device = torch.device(\"cuda\")\n", - " _device_name = f\"cuda ({torch.cuda.get_device_name(0)})\"\n", - " print(f\"⚙️ Device: {_device_name}\")\n", - " return\n", - "\n", - " # Fallback CPU\n", - " _device = torch.device(\"cpu\")\n", - " _device_name = \"cpu\"\n", - " print(f\"⚙️ Device: CPU\")\n", - "\n", - "# ==============================================================================\n", - "# TEXT SPLITTING UTILITIES\n", - "# ==============================================================================\n", - "\n", - "class TextSplitter:\n", - " \"\"\"Utilitaire pour decouper intelligemment les textes longs.\"\"\"\n", - "\n", - " @staticmethod\n", - " def estimate_audio_duration(text: str, chars_per_second: float = 15.0) -> float:\n", - " \"\"\"Estime la duree audio en secondes.\"\"\"\n", - " return len(text) / chars_per_second\n", - "\n", - " @staticmethod\n", - " def split_into_sentences(text: str) -> List[str]:\n", - " \"\"\"Decoupe le texte en phrases.\"\"\"\n", - " pattern = r'(?<=[.!?])\\s+'\n", - " sentences = re.split(pattern, text)\n", - " return [s.strip() for s in sentences if s.strip()]\n", - "\n", - " @staticmethod\n", - " def split_into_paragraphs(text: str) -> List[str]:\n", - " \"\"\"Decoupe le texte en paragraphes.\"\"\"\n", - " paragraphs = re.split(r'\\n\\s*\\n', text)\n", - " return [p.strip() for p in paragraphs if p.strip()]\n", - "\n", - " @classmethod\n", - " def split_for_long_audio(cls, text: str, max_chars: int = 500, preserve_sentences: bool = True) -> List[str]:\n", - " \"\"\"Decoupe le texte pour generation audio longue.\"\"\"\n", - " if len(text) <= max_chars:\n", - " return [text]\n", - "\n", - " chunks = []\n", - " if preserve_sentences:\n", - " sentences = cls.split_into_sentences(text)\n", - " current_chunk = \"\"\n", - "\n", - " for sentence in sentences:\n", - " if len(sentence) > max_chars:\n", - " if current_chunk:\n", - " chunks.append(current_chunk.strip())\n", - " current_chunk = \"\"\n", - " # Decouper la phrase trop longue par mots\n", - " words = sentence.split()\n", - " 
sub_chunk = \"\"\n", - " for word in words:\n", - " if len(sub_chunk) + len(word) + 1 <= max_chars:\n", - " sub_chunk += \" \" + word if sub_chunk else word\n", - " else:\n", - " if sub_chunk:\n", - " chunks.append(sub_chunk.strip())\n", - " sub_chunk = word\n", - " if sub_chunk:\n", - " current_chunk = sub_chunk\n", - " elif len(current_chunk) + len(sentence) + 1 <= max_chars:\n", - " current_chunk += \" \" + sentence if current_chunk else sentence\n", - " else:\n", - " if current_chunk:\n", - " chunks.append(current_chunk.strip())\n", - " current_chunk = sentence\n", - "\n", - " if current_chunk:\n", - " chunks.append(current_chunk.strip())\n", - " else:\n", - " for i in range(0, len(text), max_chars):\n", - " chunks.append(text[i:i + max_chars])\n", - "\n", - " return chunks\n", - "\n", - "# ==============================================================================\n", - "# AUDIO PROCESSING\n", - "# ==============================================================================\n", - "\n", - "class AudioProcessor:\n", - " \"\"\"Processeur audio pour post-traitement et concatenation.\"\"\"\n", - "\n", - " @staticmethod\n", - " def normalize(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:\n", - " \"\"\"Normalise l'audio au niveau cible.\"\"\"\n", - " if audio.dtype == np.int16:\n", - " audio = audio.astype(np.float32) / 32768.0\n", - " peak = np.max(np.abs(audio))\n", - " if peak > 0:\n", - " target_linear = 10 ** (target_db / 20)\n", - " audio = audio * (target_linear / peak)\n", - " return np.clip(audio, -1.0, 1.0)\n", - "\n", - " @staticmethod\n", - " def crossfade(audio1: np.ndarray, audio2: np.ndarray, sample_rate: int, duration: float = 0.05) -> np.ndarray:\n", - " \"\"\"Concatene deux segments audio avec crossfade.\"\"\"\n", - " if audio1.dtype == np.int16:\n", - " audio1 = audio1.astype(np.float32) / 32768.0\n", - " if audio2.dtype == np.int16:\n", - " audio2 = audio2.astype(np.float32) / 32768.0\n", - "\n", - " fade_samples = int(sample_rate * duration)\n", - " if len(audio1) < fade_samples or len(audio2) < fade_samples:\n", - " return np.concatenate([audio1, audio2])\n", - "\n", - " fade_out = np.linspace(1.0, 0.0, fade_samples)\n", - " fade_in = np.linspace(0.0, 1.0, fade_samples)\n", - " audio1_end = audio1[-fade_samples:] * fade_out\n", - " audio2_start = audio2[:fade_samples] * fade_in\n", - "\n", - " return np.concatenate([audio1[:-fade_samples], audio1_end + audio2_start, audio2[fade_samples:]])\n", - "\n", - " @classmethod\n", - " def concatenate_chunks(cls, audio_chunks: List[np.ndarray], sample_rate: int, crossfade_duration: float = 0.05) -> np.ndarray:\n", - " \"\"\"Concatene plusieurs chunks audio avec crossfade.\"\"\"\n", - " if not audio_chunks:\n", - " return np.array([], dtype=np.float32)\n", - " if len(audio_chunks) == 1:\n", - " audio = audio_chunks[0]\n", - " if audio.dtype == np.int16:\n", - " audio = audio.astype(np.float32) / 32768.0\n", - " return audio\n", - "\n", - " result = audio_chunks[0]\n", - " if result.dtype == np.int16:\n", - " result = result.astype(np.float32) / 32768.0\n", - " for chunk in audio_chunks[1:]:\n", - " result = cls.crossfade(result, chunk, sample_rate, crossfade_duration)\n", - " return result\n", - "\n", - " @staticmethod\n", - " def enhance(audio: np.ndarray, sample_rate: int, normalize: bool = True, warmth: bool = True) -> np.ndarray:\n", - " \"\"\"Ameliore l'audio avec normalisation et warmth.\"\"\"\n", - " if audio.dtype == np.int16:\n", - " audio = audio.astype(np.float32) / 32768.0\n", - "\n", - " # Ajouter de 
la chaleur (boost basses frequences)\n", - " if warmth:\n", - " try:\n", - " from scipy import signal\n", - " nyquist = sample_rate / 2\n", - " cutoff = min(300, nyquist * 0.9) / nyquist\n", - " b, a = signal.butter(2, cutoff, btype='low')\n", - " bass = signal.filtfilt(b, a, audio)\n", - " audio = audio + 0.15 * bass\n", - " except ImportError:\n", - " pass\n", - "\n", - " # Normaliser\n", - " if normalize:\n", - " peak = np.max(np.abs(audio))\n", - " if peak > 0:\n", - " target = 10 ** (-3.0 / 20)\n", - " audio = audio * (target / peak)\n", - "\n", - " return np.clip(audio, -1.0, 1.0)\n", - "\n", - "# ==============================================================================\n", - "# PROGRESS TRACKER\n", - "# ==============================================================================\n", - "\n", - "class ProgressTracker:\n", - " \"\"\"Suivi de progression avec estimation du temps restant.\"\"\"\n", - "\n", - " def __init__(self, total: int, description: str = \"\"):\n", - " self.total = total\n", - " self.current = 0\n", - " self.description = description\n", - " self.start_time = time.time()\n", - " self.chunk_times = []\n", - "\n", - " def update(self, chunk_duration: float = None):\n", - " \"\"\"Met a jour la progression.\"\"\"\n", - " self.current += 1\n", - " if chunk_duration:\n", - " self.chunk_times.append(chunk_duration)\n", - " self._display()\n", - "\n", - " def _display(self):\n", - " \"\"\"Affiche la barre de progression.\"\"\"\n", - " elapsed = time.time() - self.start_time\n", - " percent = (self.current / self.total) * 100\n", - "\n", - " if self.chunk_times:\n", - " avg_time = np.mean(self.chunk_times)\n", - " remaining = avg_time * (self.total - self.current)\n", - " eta_str = self._format_time(remaining)\n", - " else:\n", - " eta_str = \"...\"\n", - "\n", - " bar_length = 30\n", - " filled = int(bar_length * self.current / self.total)\n", - " bar = \"█\" * filled + \"░\" * (bar_length - filled)\n", - " elapsed_str = self._format_time(elapsed)\n", - "\n", - " print(f\"\\r{self.description} [{bar}] {self.current}/{self.total} \"\n", - " f\"({percent:.1f}%) | Temps: {elapsed_str} | ETA: {eta_str}\", end=\"\")\n", - "\n", - " if self.current >= self.total:\n", - " print()\n", - "\n", - " @staticmethod\n", - " def _format_time(seconds: float) -> str:\n", - " \"\"\"Formate les secondes en HH:MM:SS.\"\"\"\n", - " hours = int(seconds // 3600)\n", - " minutes = int((seconds % 3600) // 60)\n", - " secs = int(seconds % 60)\n", - " if hours > 0:\n", - " return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n", - " return f\"{minutes:02d}:{secs:02d}\"\n", - "\n", - "# ==============================================================================\n", - "# TTS ENGINE\n", - "# ==============================================================================\n", - "\n", - "_tts_model = None\n", - "_voices_cache = {}\n", - "os.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n", - "\n", - "def get_model():\n", - " \"\"\"Charge le modele XTTS v2 avec cache.\"\"\"\n", - " global _tts_model\n", - "\n", - " if _tts_model is None:\n", - " print(\"🔄 Chargement du modele XTTS v2...\")\n", - "\n", - " from TTS.api import TTS\n", - " _tts_model = TTS(Config.MODEL_NAME)\n", - "\n", - " # Deplacement sur GPU (selon la version, .to() peut etre sur le wrapper ou sur le sous-modele)\n", - " if _device is not None and _device_name.startswith(\"cuda\"):\n", - " try:\n", - " if hasattr(_tts_model, \"to\"):\n", - " _tts_model = _tts_model.to(_device)\n", - " elif hasattr(_tts_model, \"tts_model\") and 
hasattr(_tts_model.tts_model, \"to\"):\n", - " _tts_model.tts_model = _tts_model.tts_model.to(_device)\n", - " except Exception as e:\n", - " print(f\"⚠️ Impossible de deplacer le modele sur CUDA: {e}\")\n", - "\n", - " print(\"✓ Modele charge\")\n", - "\n", - " return _tts_model\n", - "\n", - "\n", - "def get_voice_path(voice: str) -> str:\n", - " \"\"\"Obtient le chemin vers un fichier de voix.\"\"\"\n", - " global _voices_cache\n", - " import urllib.request\n", - "\n", - " if voice in _voices_cache:\n", - " return _voices_cache[voice]\n", - "\n", - " if os.path.isfile(voice):\n", - " _voices_cache[voice] = voice\n", - " return voice\n", - "\n", - " if voice in Config.PRESET_VOICES:\n", - " url = Config.PRESET_VOICES[voice]\n", - " path = f\"/tmp/{voice}.wav\"\n", - " if not os.path.exists(path):\n", - " print(f\"📥 Telechargement de la voix '{voice}'...\")\n", - " urllib.request.urlretrieve(url, path)\n", - " _voices_cache[voice] = path\n", - " return path\n", - "\n", - " raise FileNotFoundError(f\"Voix '{voice}' non trouvee\")\n", - "\n", - "# ==============================================================================\n", - "# MAIN SYNTHESIS FUNCTIONS\n", - "# ==============================================================================\n", - "\n", - "def _filter_kwargs(fn, kwargs: dict) -> dict:\n", - " \"\"\"Garde uniquement les kwargs acceptes par fn (compatibilite entre versions).\"\"\"\n", - " try:\n", - " sig = inspect.signature(fn)\n", - " return {k: v for k, v in kwargs.items() if k in sig.parameters}\n", - " except (TypeError, ValueError):\n", - " # Signature indisponible (ex: fonction C++) -> on ne filtre pas\n", - " return kwargs\n", - "\n", - "\n", - "def _get_conditioning_latents_compat(xtts_model, voice_path: str):\n", - " \"\"\"Compat: get_conditioning_latents() a change de signature selon les versions.\"\"\"\n", - " fn = getattr(xtts_model, \"get_conditioning_latents\", None)\n", - " if fn is None:\n", - " raise AttributeError(\"Le modele XTTS ne fournit pas get_conditioning_latents().\")\n", - "\n", - " base_kwargs = {\"gpt_cond_len\": 30, \"max_ref_length\": 60}\n", - "\n", - " # Tentative par introspection\n", - " try:\n", - " sig = inspect.signature(fn)\n", - " params = sig.parameters\n", - "\n", - " if \"audio_path\" in params:\n", - " # Certaines versions veulent une liste, d'autres une str\n", - " try:\n", - " return fn(audio_path=[voice_path], **_filter_kwargs(fn, base_kwargs))\n", - " except TypeError:\n", - " return fn(audio_path=voice_path, **_filter_kwargs(fn, base_kwargs))\n", - "\n", - " if \"audio_paths\" in params:\n", - " return fn(audio_paths=[voice_path], **_filter_kwargs(fn, base_kwargs))\n", - "\n", - " if \"speaker_wav\" in params:\n", - " return fn(speaker_wav=voice_path, **_filter_kwargs(fn, base_kwargs))\n", - "\n", - " except (TypeError, ValueError):\n", - " pass\n", - "\n", - " # Fallback brut (plus permissif)\n", - " try:\n", - " return fn(audio_path=[voice_path], gpt_cond_len=30, max_ref_length=60)\n", - " except Exception:\n", - " try:\n", - " return fn(audio_path=voice_path, gpt_cond_len=30, max_ref_length=60)\n", - " except Exception:\n", - " return fn(voice_path)\n", - "\n", - "\n", - "def synthesize_chunk(text: str, voice_path: str, language: str = \"fr\", enable_text_splitting: bool = True) -> np.ndarray:\n", - " \"\"\"Synthetise un chunk de texte en audio via l'inference directe.\"\"\"\n", - " model_wrapper = get_model()\n", - "\n", - " # Acceder au modele XTTS directement (bypass SpeakerManager bug)\n", - " if 
hasattr(model_wrapper, 'synthesizer'):\n", - " xtts_model = model_wrapper.synthesizer.tts_model\n", - " else:\n", - " xtts_model = model_wrapper.tts_model\n", - "\n", - " # Calculer les latents de conditionnement (compat multi-versions)\n", - " try:\n", - " gpt_cond_latent, speaker_embedding = _get_conditioning_latents_compat(xtts_model, voice_path)\n", - " except Exception as e:\n", - " print(f\"⚠️ Erreur calcul latents: {e}\")\n", - " raise e\n", - "\n", - " # Inference directe (filtrage des kwargs selon la signature)\n", - " try:\n", - " inference_kwargs = {\n", - " \"text\": text,\n", - " \"language\": language,\n", - " \"gpt_cond_latent\": gpt_cond_latent,\n", - " \"speaker_embedding\": speaker_embedding,\n", - " \"temperature\": 0.7,\n", - " \"length_penalty\": 1.0,\n", - " \"repetition_penalty\": 2.0,\n", - " \"top_k\": 50,\n", - " \"top_p\": 0.8,\n", - " \"enable_text_splitting\": enable_text_splitting,\n", - " }\n", - "\n", - " # Alias possibles selon versions\n", - " try:\n", - " sig = inspect.signature(xtts_model.inference)\n", - " params = sig.parameters\n", - " if \"speaker_embedding\" not in params and \"speaker_latents\" in params:\n", - " inference_kwargs[\"speaker_latents\"] = inference_kwargs.pop(\"speaker_embedding\")\n", - " except (TypeError, ValueError):\n", - " pass\n", - "\n", - " out = xtts_model.inference(**_filter_kwargs(xtts_model.inference, inference_kwargs))\n", - "\n", - " if isinstance(out, dict) and 'wav' in out:\n", - " wav = out['wav']\n", - " else:\n", - " wav = out\n", - "\n", - " if hasattr(wav, 'cpu'):\n", - " wav = wav.cpu().numpy()\n", - " if isinstance(wav, list):\n", - " wav = np.array(wav, dtype=np.float32)\n", - "\n", - " return wav\n", - "\n", - " except Exception as e:\n", - " print(f\"⚠️ Erreur lors de l'inference directe : {e}\")\n", - " raise e\n", - "\n", - "\n", - "def text_to_speech_long(\n", - " text: str,\n", - " voice: str = \"female_fr\",\n", - " language: str = \"fr\",\n", - " output_path: Optional[str] = None,\n", - " enhance: bool = False,\n", - " use_gdrive: bool = False,\n", - " gdrive_folder: str = None,\n", - " max_chars_per_chunk: int = None,\n", - " show_progress: bool = True,\n", - " enable_text_splitting: bool = True\n", - ") -> dict:\n", - " \"\"\"Genere un fichier audio long (> 1 heure) a partir de texte.\"\"\"\n", - " import torch\n", - "\n", - " max_chars = max_chars_per_chunk or Config.MAX_CHARS_PER_CHUNK\n", - " voice_path = get_voice_path(voice)\n", - "\n", - " # Estimation\n", - " estimated_duration = TextSplitter.estimate_audio_duration(text)\n", - " print(f\"\\n📝 Texte: {len(text):,} caracteres\")\n", - " print(f\"⏱️ Duree estimee: {ProgressTracker._format_time(estimated_duration)}\")\n", - "\n", - " # Decoupage\n", - " chunks = TextSplitter.split_for_long_audio(text, max_chars=max_chars)\n", - " print(f\"📦 Chunks: {len(chunks)}\")\n", - "\n", - " # Synthese\n", - " progress = ProgressTracker(len(chunks), \"🎙️ Synthese\") if show_progress else None\n", - " audio_chunks = []\n", - "\n", - " for i, chunk in enumerate(chunks):\n", - " chunk_start = time.time()\n", - " try:\n", - " wav = synthesize_chunk(\n", - " text=chunk,\n", - " voice_path=voice_path,\n", - " language=language,\n", - " enable_text_splitting=enable_text_splitting\n", - " )\n", - " audio_chunks.append(wav)\n", - " except Exception as e:\n", - " print(f\"\\n⚠️ Erreur chunk {i+1}: {e}\")\n", - " continue\n", - "\n", - " # Nettoyage memoire periodique\n", - " if _device_name.startswith(\"cuda\") and (i + 1) % 10 == 0:\n", - " 
torch.cuda.empty_cache()\n", - "\n", - " if progress:\n", - " progress.update(time.time() - chunk_start)\n", - "\n", - " if not audio_chunks:\n", - " raise RuntimeError(\"Aucun audio genere\")\n", - "\n", - " # Concatenation\n", - " print(\"\\n🔗 Concatenation des chunks...\")\n", - " final_audio = AudioProcessor.concatenate_chunks(\n", - " audio_chunks, Config.SAMPLE_RATE, Config.CROSSFADE_DURATION\n", - " )\n", - "\n", - " # Nettoyage memoire\n", - " del audio_chunks\n", - " gc.collect()\n", - " if _device_name.startswith(\"cuda\"):\n", - " torch.cuda.empty_cache()\n", - "\n", - " # Post-traitement\n", - " if enhance:\n", - " print(\"✨ Post-traitement...\")\n", - " final_audio = AudioProcessor.enhance(\n", - " final_audio, Config.SAMPLE_RATE, normalize=True, warmth=True\n", - " )\n", - " else:\n", - " final_audio = AudioProcessor.normalize(final_audio)\n", - "\n", - " # Conversion en int16\n", - " final_audio = (final_audio * 32767).astype(np.int16)\n", - "\n", - " # Chemin de sortie\n", - " if output_path is None:\n", - " h = hashlib.md5(text[:100].encode()).hexdigest()[:8]\n", - " output_path = f\"tts_long_{voice}_{h}.wav\"\n", - "\n", - " if use_gdrive:\n", - " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", - " folder.mkdir(parents=True, exist_ok=True)\n", - " final_path = folder / Path(output_path).name\n", - " else:\n", - " final_path = Path(output_path)\n", - "\n", - " # Sauvegarde WAV\n", - " print(f\"💾 Sauvegarde: {final_path}\")\n", - " with wave.open(str(final_path), \"wb\") as wav_file:\n", - " wav_file.setnchannels(1)\n", - " wav_file.setsampwidth(2)\n", - " wav_file.setframerate(Config.SAMPLE_RATE)\n", - " wav_file.writeframes(final_audio.tobytes())\n", - "\n", - " duration = len(final_audio) / Config.SAMPLE_RATE\n", - "\n", - " print(f\"\\n✅ Audio genere avec succes!\")\n", - " print(f\" 📁 Fichier: {final_path}\")\n", - " print(f\" ⏱️ Duree: {ProgressTracker._format_time(duration)}\")\n", - " print(f\" 📦 Chunks: {len(chunks)}\")\n", - " print(f\" 🎤 Voix: {voice}\")\n", - "\n", - " return {\n", - " 'path': str(final_path),\n", - " 'sample_rate': Config.SAMPLE_RATE,\n", - " 'duration_seconds': duration,\n", - " 'duration_formatted': ProgressTracker._format_time(duration),\n", - " 'audio_data': final_audio,\n", - " 'voice': voice,\n", - " 'language': language,\n", - " 'device': _device_name,\n", - " 'chunks_count': len(chunks),\n", - " 'text_length': len(text)\n", - " }\n", - "\n", - "\n", - "def text_to_speech(\n", - " text: str,\n", - " voice: str = \"female_fr\",\n", - " language: str = \"fr\",\n", - " output_path: Optional[str] = None,\n", - " enhance: bool = False,\n", - " use_gdrive: bool = False,\n", - " gdrive_folder: str = None,\n", - " enable_text_splitting: bool = True\n", - ") -> dict:\n", - " \"\"\"Genere un fichier audio a partir de texte avec XTTS v2.\"\"\"\n", - " # Rediriger vers version longue si necessaire\n", - " if len(text) > 10000:\n", - " print(\"📢 Texte long detecte - utilisation de text_to_speech_long()\")\n", - " return text_to_speech_long(\n", - " text=text, voice=voice, language=language, output_path=output_path,\n", - " enhance=enhance, use_gdrive=use_gdrive, gdrive_folder=gdrive_folder,\n", - " enable_text_splitting=enable_text_splitting\n", - " )\n", - "\n", - " voice_path = get_voice_path(voice)\n", - "\n", - " # Synthese\n", - " wav = synthesize_chunk(\n", - " text=text,\n", - " voice_path=voice_path,\n", - " language=language,\n", - " enable_text_splitting=enable_text_splitting\n", - " )\n", - "\n", - " # Post-traitement\n", - " if 
enhance:\n", - " audio = AudioProcessor.enhance(wav, Config.SAMPLE_RATE)\n", - " else:\n", - " audio = AudioProcessor.normalize(wav)\n", - "\n", - " audio = (audio * 32767).astype(np.int16)\n", - "\n", - " # Chemin de sortie\n", - " if output_path is None:\n", - " h = hashlib.md5(text.encode()).hexdigest()[:8]\n", - " output_path = f\"tts_{voice}_{h}.wav\"\n", - "\n", - " if use_gdrive:\n", - " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", - " folder.mkdir(parents=True, exist_ok=True)\n", - " final_path = folder / Path(output_path).name\n", - " else:\n", - " final_path = Path(output_path)\n", - "\n", - " # Sauvegarde WAV\n", - " with wave.open(str(final_path), \"wb\") as wav_file:\n", - " wav_file.setnchannels(1)\n", - " wav_file.setsampwidth(2)\n", - " wav_file.setframerate(Config.SAMPLE_RATE)\n", - " wav_file.writeframes(audio.tobytes())\n", - "\n", - " duration = len(audio) / Config.SAMPLE_RATE\n", - " print(f\"✓ Audio genere: {final_path}\")\n", - " print(f\" Duree: {duration:.2f}s | Voix: {voice}\")\n", - "\n", - " return {\n", - " 'path': str(final_path),\n", - " 'sample_rate': Config.SAMPLE_RATE,\n", - " 'duration_seconds': duration,\n", - " 'audio_data': audio,\n", - " 'voice': voice,\n", - " 'language': language,\n", - " 'device': _device_name\n", - " }\n", - "\n", - "# ==============================================================================\n", - "# UTILITIES\n", - "# ==============================================================================\n", - "\n", - "def preview_audio(result: dict) -> None:\n", - " \"\"\"Previsualise l'audio dans le notebook.\"\"\"\n", - " from IPython.display import Audio, display\n", - " audio = result['audio_data']\n", - " if audio.dtype == np.int16:\n", - " audio = audio.astype(np.float32) / 32768.0\n", - " display(Audio(audio, rate=result['sample_rate']))\n", - "\n", - "def list_voices() -> list:\n", - " \"\"\"Liste les voix disponibles.\"\"\"\n", - " return list(Config.PRESET_VOICES.keys())\n", - "\n", - "def list_languages() -> list:\n", - " \"\"\"Liste les langues supportees.\"\"\"\n", - " return [\"en\", \"es\", \"fr\", \"de\", \"it\", \"pt\", \"pl\", \"tr\", \"ru\", \"nl\", \"cs\", \"ar\", \"zh-cn\", \"ja\", \"hu\", \"ko\", \"hi\"]\n", - "\n", - "def clear_cache():\n", - " \"\"\"Vide le cache du modele.\"\"\"\n", - " global _tts_model\n", - " import torch\n", - " _tts_model = None\n", - " gc.collect()\n", - " if _device_name.startswith(\"cuda\"):\n", - " torch.cuda.empty_cache()\n", - " print(\"✓ Cache vide\")\n", - "\n", - "def estimate_duration(text: str) -> dict:\n", - " \"\"\"Estime la duree audio pour un texte.\"\"\"\n", - " duration = TextSplitter.estimate_audio_duration(text)\n", - " chunks = len(TextSplitter.split_for_long_audio(text))\n", - " return {\n", - " 'chars': len(text),\n", - " 'estimated_seconds': duration,\n", - " 'estimated_formatted': ProgressTracker._format_time(duration),\n", - " 'chunks_estimate': chunks\n", - " }\n", - "\n", - "# Aliases\n", - "tts = text_to_speech\n", - "tts_long = text_to_speech_long\n", - "\n", - "# ==============================================================================\n", - "# INITIALIZATION\n", - "# ==============================================================================\n", - "\n", - "def init():\n", - " \"\"\"Initialise le module.\"\"\"\n", - " detect_device()\n", - " print(\"✅ Module XTTS v2 Long Audio v4 charge\")\n", - " print(f\" Device: {_device_name}\")\n", - " print(f\" Voix disponibles: {list_voices()}\")\n", - " print(f\" enable_text_splitting: active par 
defaut\")\n", - " print(f\" Fix torchcodec: actif\")\n", - "\n", - "# Auto-init\n", - "try:\n", - " detect_device()\n", - "except:\n", - " pass\n", - "\n", - "print(\"\\n\" + \"=\"*60)\n", - "print(\"TTS XTTS v2 - Long Audio Generator v4\")\n", - "print(\"Compatible PyTorch 2.9+ (fix torchcodec)\")\n", - "print(\"=\"*60)\n", - "print(f\"Voix disponibles: {list_voices()}\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "# Monter Google Drive (optionnel)\n", + "# Le notebook peut aussi s'executer hors Colab : dans ce cas on ignore simplement le montage.\n", + "\n", + "try:\n", + " from google.colab import drive # type: ignore\n", + "\n", + " drive.mount(\"/content/drive\")\n", + "except Exception as e:\n", + " print(\"ℹ️ Google Colab non detecte ou Drive indisponible -> montage ignore.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "init_module", + "outputId": "33a7cc61-9054-4601-f338-aaa653bef831" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mount_drive", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "483dc2dd-86ae-456f-86b5-e6f92e96f257" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Mounted at /content/drive\n" - ] - } - ], - "source": [ - "# Monter Google Drive (optionnel)\n", - "# Le notebook peut aussi s'executer hors Colab : dans ce cas on ignore simplement le montage.\n", - "\n", - "try:\n", - " from google.colab import drive # type: ignore\n", - " drive.mount('/content/drive')\n", - "except Exception as e:\n", - " print(\"ℹ️ Google Colab non detecte ou Drive indisponible -> montage ignore.\")\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "⚙️ Device: cuda (Tesla T4)\n", + "✅ Module XTTS v2 Long Audio v4 charge\n", + " Device: cuda (Tesla T4)\n", + " Voix disponibles: ['female_fr', 'male_fr']\n", + " enable_text_splitting: active par defaut\n", + " Fix torchcodec: actif\n" + ] + } + ], + "source": [ + "# Initialisation du module\n", + "init()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 }, + "id": "example_short", + "outputId": "577e3721-4fce-40f1-979b-79c5aef7cf2e" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "init_module", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "33a7cc61-9054-4601-f338-aaa653bef831" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "⚙️ Device: cuda (Tesla T4)\n", - "✅ Module XTTS v2 Long Audio v4 charge\n", - " Device: cuda (Tesla T4)\n", - " Voix disponibles: ['female_fr', 'male_fr']\n", - " enable_text_splitting: active par defaut\n", - " Fix torchcodec: actif\n" - ] - } - ], - "source": [ - "# Initialisation du module\n", - "init()" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "📥 Telechargement de la voix 'female_fr'...\n", + "🔄 Chargement du modele XTTS v2...\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "example_short", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 235 - }, - "outputId": "577e3721-4fce-40f1-979b-79c5aef7cf2e" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "📥 Telechargement de la voix 
'female_fr'...\n", - "🔄 Chargement du modele XTTS v2...\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "100%|██████████| 1.87G/1.87G [00:31<00:00, 59.9MiB/s]\n", - "4.37kiB [00:00, 7.24MiB/s]\n", - "361kiB [00:00, 107MiB/s]\n", - "100%|██████████| 32.0/32.0 [00:00<00:00, 91.7kiB/s]\n", - "100%|██████████| 7.75M/7.75M [00:00<00:00, 16.2MiB/s]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "✓ Modele charge\n", - "✓ Audio genere: tts_female_fr_3f623356.wav\n", - " Duree: 9.28s | Voix: female_fr\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - " \n", - " " - ] - }, - "metadata": {} - } - ], - "source": [ - "# ==============================================================================\n", - "# EXEMPLE 1: Texte court\n", - "# ==============================================================================\n", - "\n", - "text_court = \"\"\"\n", - "Bonjour! Ceci est un test de synthese vocale avec XTTS v2.\n", - "Le module est maintenant compatible avec PyTorch 2.9 et superieur.\n", - "\"\"\"\n", - "\n", - "# Choisir la voix\n", - "voice_gender = \"female_fr\" # ou \"male_fr\"\n", - "\n", - "# Generation\n", - "result = text_to_speech(text_court.strip(), voice=voice_gender, enhance=True)\n", - "\n", - "# Previsualisation\n", - "preview_audio(result)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1.87G/1.87G [00:31<00:00, 59.9MiB/s]\n", + "4.37kiB [00:00, 7.24MiB/s]\n", + "361kiB [00:00, 107MiB/s]\n", + "100%|██████████| 32.0/32.0 [00:00<00:00, 91.7kiB/s]\n", + "100%|██████████| 7.75M/7.75M [00:00<00:00, 16.2MiB/s]\n" + ] }, { - "cell_type": "code", - "source": [ - "# 1. Importation de la bibliothèque principale\n", - "import ipywidgets as widgets\n", - "\n", - "# 2. Importation de la fonction d'affichage (optionnel mais recommandé pour la clarté)\n", - "from IPython.display import display\n" - ], - "metadata": { - "id": "VKh0LumVWdsK" - }, - "execution_count": null, - "outputs": [] + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Modele charge\n", + "✓ Audio genere: tts_female_fr_3f623356.wav\n", + " Duree: 9.28s | Voix: female_fr\n" + ] }, { - "cell_type": "code", - "source": [ - "\n", - "# ==========================================\n", - "# 3. INTERFACE UTILISATEUR\n", - "# ==========================================\n", - "\n", - "text_input = widgets.Textarea(\n", - " placeholder='Entrez votre texte ici...',\n", - " value=\"texte\",\n", - " layout=widgets.Layout(width='100%', height='150px')\n", - ")\n", - "button = widgets.Button(description='Générer Audio', button_style='success', icon='check')\n", - "progress_bar = widgets.IntProgress(value=0, min=0, max=100, layout=widgets.Layout(width='100%', visibility='hidden'))\n", - "output = widgets.Output()\n", - "\n", - "def on_click(b):\n", - " with output:\n", - " clear_output()\n", - " if not text_input.value.strip():\n", - " print(\"❌ Le texte est vide.\")\n", - " return\n", - "\n", - " button.disabled = True\n", - " progress_bar.layout.visibility = 'visible'\n", - " progress_bar.bar_style = 'info'\n", - " progress_bar.value = 10\n", - "\n", - " try:\n", - " # 1. Drive\n", - " if not os.path.exists(\"/content/drive\"): drive.mount('/content/drive')\n", - " if not os.path.exists(DRIVE_FOLDER): os.makedirs(DRIVE_FOLDER)\n", - " progress_bar.value = 20\n", - "\n", - " # 2. 
Gemini\n", - " print(\"🧠 Optimisation du texte (Gemini)...\")\n", - " res_ia = traiter_via_gemini_pour_elevenlabs(text_input.value)\n", - " titre = res_ia.get('titre', 'Audio_Output')\n", - " texte_final = res_ia.get('texte_optimise', text_input.value)\n", - " progress_bar.value = 50\n", - "\n", - " # 3. ElevenLabs\n", - " print(f\"🎙️ Génération avec Voix ID: {VOICE_ID} ({MODEL_ID})...\")\n", - " nom_fichier = f\"{assainir_nom_fichier(titre)}.mp3\"\n", - " chemin_complet = os.path.join(DRIVE_FOLDER, nom_fichier)\n", - "\n", - " if generer_audio_elevenlabs(texte_final, chemin_complet):\n", - " progress_bar.value = 100\n", - " progress_bar.bar_style = 'success'\n", - " print(f\"✅ Fichier sauvegardé : {chemin_complet}\")\n", - " display(Audio(chemin_complet))\n", - " else:\n", - " raise Exception(\"Erreur API ElevenLabs\")\n", - "\n", - " except Exception as e:\n", - " progress_bar.bar_style = 'danger'\n", - " print(f\"❌ Erreur : {e}\")\n", - " finally:\n", - " button.disabled = False\n", - "\n", - "button.on_click(on_click)\n", - "\n", - "# Affichage avec rappel de la config chargée\n", - "display(widgets.VBox([\n", - " widgets.HTML(f\"

Générateur TTS - Config : {MODEL_ID}
\"),\n", - " widgets.Label(f\"Voix chargée : {VOICE_ID} (Guillaume)\"),\n", - " text_input,\n", - " button,\n", - " progress_bar,\n", - " output\n", - "]))" + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "metadata": { - "id": "umnfhCzn2WGG" - }, - "execution_count": null, - "outputs": [] + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# ==============================================================================\n", + "# EXEMPLE 1: Texte court\n", + "# ==============================================================================\n", + "\n", + "text_court = \"\"\"\n", + "Bonjour! Ceci est un test de synthese vocale avec XTTS v2.\n", + "Le module est maintenant compatible avec PyTorch 2.9 et superieur.\n", + "\"\"\"\n", + "\n", + "# Choisir la voix\n", + "voice_gender = \"female_fr\" # ou \"male_fr\"\n", + "\n", + "# Generation\n", + "result = text_to_speech(text_court.strip(), voice=voice_gender, enhance=True)\n", + "\n", + "# Previsualisation\n", + "preview_audio(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VKh0LumVWdsK" + }, + "outputs": [], + "source": [ + "# 1. Importation de la bibliothèque principale\n", + "import ipywidgets as widgets\n", + "\n", + "# 2. Importation de la fonction d'affichage (optionnel mais recommandé pour la clarté)\n", + "from IPython.display import display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "umnfhCzn2WGG" + }, + "outputs": [], + "source": [ + "# ==========================================\n", + "# 3. INTERFACE UTILISATEUR\n", + "# ==========================================\n", + "\n", + "text_input = widgets.Textarea(\n", + " placeholder=\"Entrez votre texte ici...\",\n", + " value=\"texte\",\n", + " layout=widgets.Layout(width=\"100%\", height=\"150px\"),\n", + ")\n", + "button = widgets.Button(\n", + " description=\"Générer Audio\", button_style=\"success\", icon=\"check\"\n", + ")\n", + "progress_bar = widgets.IntProgress(\n", + " value=0, min=0, max=100, layout=widgets.Layout(width=\"100%\", visibility=\"hidden\")\n", + ")\n", + "output = widgets.Output()\n", + "\n", + "\n", + "def on_click(b):\n", + " with output:\n", + " clear_output()\n", + " if not text_input.value.strip():\n", + " print(\"❌ Le texte est vide.\")\n", + " return\n", + "\n", + " button.disabled = True\n", + " progress_bar.layout.visibility = \"visible\"\n", + " progress_bar.bar_style = \"info\"\n", + " progress_bar.value = 10\n", + "\n", + " try:\n", + " # 1. Drive\n", + " if not os.path.exists(\"/content/drive\"):\n", + " drive.mount(\"/content/drive\")\n", + " if not os.path.exists(DRIVE_FOLDER):\n", + " os.makedirs(DRIVE_FOLDER)\n", + " progress_bar.value = 20\n", + "\n", + " # 2. Gemini\n", + " print(\"🧠 Optimisation du texte (Gemini)...\")\n", + " res_ia = traiter_via_gemini_pour_elevenlabs(text_input.value)\n", + " titre = res_ia.get(\"titre\", \"Audio_Output\")\n", + " texte_final = res_ia.get(\"texte_optimise\", text_input.value)\n", + " progress_bar.value = 50\n", + "\n", + " # 3. 
ElevenLabs\n", + " print(f\"🎙️ Génération avec Voix ID: {VOICE_ID} ({MODEL_ID})...\")\n", + " nom_fichier = f\"{assainir_nom_fichier(titre)}.mp3\"\n", + " chemin_complet = os.path.join(DRIVE_FOLDER, nom_fichier)\n", + "\n", + " if generer_audio_elevenlabs(texte_final, chemin_complet):\n", + " progress_bar.value = 100\n", + " progress_bar.bar_style = \"success\"\n", + " print(f\"✅ Fichier sauvegardé : {chemin_complet}\")\n", + " display(Audio(chemin_complet))\n", + " else:\n", + " raise Exception(\"Erreur API ElevenLabs\")\n", + "\n", + " except Exception as e:\n", + " progress_bar.bar_style = \"danger\"\n", + " print(f\"❌ Erreur : {e}\")\n", + " finally:\n", + " button.disabled = False\n", + "\n", + "\n", + "button.on_click(on_click)\n", + "\n", + "# Affichage avec rappel de la config chargée\n", + "display(\n", + " widgets.VBox(\n", + " [\n", + " widgets.HTML(f\"

Générateur TTS - Config : {MODEL_ID}
\"),\n", + " widgets.Label(f\"Voix chargée : {VOICE_ID} (Guillaume)\"),\n", + " text_input,\n", + " button,\n", + " progress_bar,\n", + " output,\n", + " ]\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 96 }, + "id": "custom_text", + "outputId": "2a865273-027a-4958-9da2-7985d9a507d6" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "custom_text", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 96 - }, - "outputId": "2a865273-027a-4958-9da2-7985d9a507d6" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "✓ Audio genere: tts_female_fr_a37ea6fe.wav\n", - " Duree: 1.34s | Voix: female_fr\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - " \n", - " " - ] - }, - "metadata": {} - } - ], - "source": [ - "# ==============================================================================\n", - "# VOTRE TEXTE PERSONNALISE\n", - "# ==============================================================================\n", - "\n", - "# Entrez votre texte ici\n", - "text_to_synthesize = \"\"\"\n", - "Votre texte ici...\n", - "\"\"\"\n", - "\n", - "# Configuration\n", - "voice_gender = \"female_fr\" # \"female_fr\" ou \"male_fr\"\n", - "save_to_drive = False # True pour sauvegarder sur Google Drive\n", - "\n", - "# Generation\n", - "result = text_to_speech(\n", - " text_to_synthesize.strip(),\n", - " voice=voice_gender,\n", - " enhance=True,\n", - " use_gdrive=save_to_drive\n", - ")\n", - "\n", - "# Previsualisation\n", - "preview_audio(result)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Audio genere: tts_female_fr_a37ea6fe.wav\n", + " Duree: 1.34s | Voix: female_fr\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cleanup", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "6a7d6774-2254-4715-a5be-ea378f6a3eee" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "✓ Cache vide\n" - ] - } + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "# Nettoyer le cache si necessaire (libere la memoire GPU)\n", - "clear_cache()" + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# ==============================================================================\n", + "# VOTRE TEXTE PERSONNALISE\n", + "# ==============================================================================\n", + "\n", + "# Entrez votre texte ici\n", + "text_to_synthesize = \"\"\"\n", + "Votre texte ici...\n", + "\"\"\"\n", + "\n", + "# Configuration\n", + "voice_gender = \"female_fr\" # \"female_fr\" ou \"male_fr\"\n", + "save_to_drive = False # True pour sauvegarder sur Google Drive\n", + "\n", + "# Generation\n", + "result = text_to_speech(\n", + " text_to_synthesize.strip(),\n", + " voice=voice_gender,\n", + " enhance=True,\n", + " use_gdrive=save_to_drive,\n", + ")\n", + "\n", + "# Previsualisation\n", + "preview_audio(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "cleanup", + "outputId": "6a7d6774-2254-4715-a5be-ea378f6a3eee" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "stop" - ], - "metadata": { - "id": "Y9x2A2JKO4rd" - }, - "execution_count": null, - "outputs": 
[] + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Cache vide\n" + ] + } + ], + "source": [ + "# Nettoyer le cache si necessaire (libere la memoire GPU)\n", + "clear_cache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Y9x2A2JKO4rd" + }, + "outputs": [], + "source": [ + "stop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 130 }, + "id": "generate_long", + "outputId": "5f0391f1-1deb-448f-c71c-528ee69050aa" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "generate_long", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 130 - }, - "outputId": "5f0391f1-1deb-448f-c71c-528ee69050aa" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "🔄 Chargement du modele XTTS v2...\n", - "✓ Modele charge\n", - "✓ Audio genere: /content/drive/MyDrive/TTS_Output/tts_female_fr_28bb1d44.wav\n", - " Duree: 54.82s | Voix: female_fr\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - " \n", - " " - ] - }, - "metadata": {} - } - ], - "source": [ - "# Generation du texte long\n", - "result_long = text_to_speech(\n", - " text_long.strip(),\n", - " voice=\"female_fr\",\n", - " enhance=True,\n", - " use_gdrive=True # Mettre True pour sauvegarder sur Drive\n", - ")\n", - "\n", - "# Previsualisation\n", - "preview_audio(result_long)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "🔄 Chargement du modele XTTS v2...\n", + "✓ Modele charge\n", + "✓ Audio genere: /content/drive/MyDrive/TTS_Output/tts_female_fr_28bb1d44.wav\n", + " Duree: 54.82s | Voix: female_fr\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "example_long", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "1c25049d-6271-4864-d733-ef701a7b9a28" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "📊 Estimation:\n", - " Caracteres: 956\n", - " Duree estimee: 01:03\n", - " Chunks estimes: 2\n" - ] - } + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "# ==============================================================================\n", - "# EXEMPLE 2: Texte long\n", - "# ==============================================================================\n", - "\n", - "text_long = \"\"\"\n", - "La synthese vocale, egalement appelee text-to-speech ou TTS, est une technologie\n", - "qui permet de convertir du texte ecrit en parole audible. Cette technologie a\n", - "considerablement evolue au fil des annees, passant de voix robotiques et\n", - "mecaniques a des voix naturelles et expressives.\n", - "\n", - "XTTS v2 est l'un des modeles les plus avances dans ce domaine. Developpe par\n", - "Coqui AI, il utilise des techniques d'apprentissage profond pour generer une\n", - "parole de haute qualite dans plusieurs langues. Le modele peut meme cloner\n", - "des voix a partir d'un court echantillon audio de reference.\n", - "\n", - "Les applications de la synthese vocale sont nombreuses: assistants virtuels,\n", - "livres audio, accessibilite pour les personnes malvoyantes, doublage video,\n", - "et bien d'autres encore. 
Avec les avancees recentes en intelligence artificielle,\n", - "la qualite de la synthese vocale continue de s'ameliorer, rendant la distinction\n", - "entre voix humaine et voix synthetique de plus en plus difficile.\n", - "\"\"\"\n", - "\n", - "# Estimation avant generation\n", - "estimation = estimate_duration(text_long)\n", - "print(f\"📊 Estimation:\")\n", - "print(f\" Caracteres: {estimation['chars']:,}\")\n", - "print(f\" Duree estimee: {estimation['estimated_formatted']}\")\n", - "print(f\" Chunks estimes: {estimation['chunks_estimate']}\")" + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" } - ], - "metadata": { - "accelerator": "GPU", + ], + "source": [ + "# Generation du texte long\n", + "result_long = text_to_speech(\n", + " text_long.strip(),\n", + " voice=\"female_fr\",\n", + " enhance=True,\n", + " use_gdrive=True, # Mettre True pour sauvegarder sur Drive\n", + ")\n", + "\n", + "# Previsualisation\n", + "preview_audio(result_long)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "gpuType": "T4", - "provenance": [], - "include_colab_link": true + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.12" + "id": "example_long", + "outputId": "1c25049d-6271-4864-d733-ef701a7b9a28" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📊 Estimation:\n", + " Caracteres: 956\n", + " Duree estimee: 01:03\n", + " Chunks estimes: 2\n" + ] } + ], + "source": [ + "# ==============================================================================\n", + "# EXEMPLE 2: Texte long\n", + "# ==============================================================================\n", + "\n", + "text_long = \"\"\"\n", + "La synthese vocale, egalement appelee text-to-speech ou TTS, est une technologie\n", + "qui permet de convertir du texte ecrit en parole audible. Cette technologie a\n", + "considerablement evolue au fil des annees, passant de voix robotiques et\n", + "mecaniques a des voix naturelles et expressives.\n", + "\n", + "XTTS v2 est l'un des modeles les plus avances dans ce domaine. Developpe par\n", + "Coqui AI, il utilise des techniques d'apprentissage profond pour generer une\n", + "parole de haute qualite dans plusieurs langues. Le modele peut meme cloner\n", + "des voix a partir d'un court echantillon audio de reference.\n", + "\n", + "Les applications de la synthese vocale sont nombreuses: assistants virtuels,\n", + "livres audio, accessibilite pour les personnes malvoyantes, doublage video,\n", + "et bien d'autres encore. 
Avec les avancees recentes en intelligence artificielle,\n", + "la qualite de la synthese vocale continue de s'ameliorer, rendant la distinction\n", + "entre voix humaine et voix synthetique de plus en plus difficile.\n", + "\"\"\"\n", + "\n", + "# Estimation avant generation\n", + "estimation = estimate_duration(text_long)\n", + "print(f\"📊 Estimation:\")\n", + "print(f\" Caracteres: {estimation['chars']:,}\")\n", + "print(f\" Duree estimee: {estimation['estimated_formatted']}\")\n", + "print(f\" Chunks estimes: {estimation['chunks_estimate']}\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "name": "python", + "version": "3.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file diff --git a/long_TTS_xtts_v3.ipynb b/long_TTS_xtts_v3.ipynb index fc0ac929cb20..2509b7d90991 100644 --- a/long_TTS_xtts_v3.ipynb +++ b/long_TTS_xtts_v3.ipynb @@ -1,1189 +1,1203 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s8NfbT3sw2-z" + }, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "XYDOUW523oJP" + }, + "outputs": [], + "source": [ + "PROMPT = \"Ce document présente les Manifold-Constrained Hyper-Connections (mHC), une architecture novatrice conçue par DeepSeek-AI pour stabiliser l'entraînement des grands modèles de langage. Bien que les Hyper-Connections (HC) classiques améliorent les performances en élargissant le flux résiduel, leur nature non contrainte provoque souvent une instabilité numérique et des problèmes de divergence du signal. Pour remédier à cela, les auteurs utilisent l'algorithme de Sinkhorn-Knopp afin de projeter les connexions sur une variété de matrices doublement stochastiques, préservant ainsi la propriété de mappage d'identité. Cette approche garantit une propagation saine du signal tout en optimisant l'efficacité matérielle grâce à la fusion de noyaux et à des stratégies de mémorisation sélective. Les résultats expérimentaux démontrent que mHC surpasse les méthodes existantes en termes de scalabilité et de capacités de raisonnement sur divers tests de référence. En intégrant ces contraintes géométriques rigoureuses, le cadre mHC offre une solution robuste pour l'évolution des architectures neuronales à grande échelle.\"\n", + "\n", + "voice_gender = \"female_fr\"\n", + "# ['female_fr', 'male_fr']" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "jIKtDA5hweJP", + "outputId": "8b9cbf18-4496-4c26-d2c8-50b97a3710d2" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "s8NfbT3sw2-z" - }, - "source": [] + "name": "stdout", + "output_type": "stream", + "text": [ + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m862.8/862.8 kB\u001b[0m \u001b[31m57.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m345.1/345.1 kB\u001b[0m \u001b[31m25.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.2/56.2 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m997.3/997.3 kB\u001b[0m \u001b[31m62.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m648.4/648.4 kB\u001b[0m \u001b[31m55.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.5/163.5 kB\u001b[0m \u001b[31m18.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for docopt (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "# Installation des dépendances\n", + "!pip install -q scipy noisereduce\n", + "\n", + "# Installation du fork maintenu (supporte Python 3.12+)\n", + "!pip install -q coqui-tts\n", + "\n", + "\n", + "# Installation des dépendances\n", + "!pip install -q scipy noisereduce\n", + "!pip install -q numpy==2.0.2\n", + "\n", + "# Installation de soundfile pour le chargement audio (évite le bug torchcodec)\n", + "!pip install -q soundfile\n", + "\n", + "# Installation du fork maintenu (supporte Python 3.12+)\n", + "!pip install -q coqui-tts\n", + "\n", + "# Note: torchcodec n'est plus nécessaire - on utilise soundfile comme backend" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "1FQhKQ1IE4iX", + "outputId": "68446e3c-ac8f-474b-a329-dfa6b8d6aec9" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "XYDOUW523oJP" - }, - "outputs": [], - "source": [ - "PROMPT = \"Ce document présente les Manifold-Constrained Hyper-Connections (mHC), une architecture novatrice conçue par DeepSeek-AI pour stabiliser l'entraînement des grands modèles de langage. Bien que les Hyper-Connections (HC) classiques améliorent les performances en élargissant le flux résiduel, leur nature non contrainte provoque souvent une instabilité numérique et des problèmes de divergence du signal. Pour remédier à cela, les auteurs utilisent l'algorithme de Sinkhorn-Knopp afin de projeter les connexions sur une variété de matrices doublement stochastiques, préservant ainsi la propriété de mappage d'identité. Cette approche garantit une propagation saine du signal tout en optimisant l'efficacité matérielle grâce à la fusion de noyaux et à des stratégies de mémorisation sélective. Les résultats expérimentaux démontrent que mHC surpasse les méthodes existantes en termes de scalabilité et de capacités de raisonnement sur divers tests de référence. 
En intégrant ces contraintes géométriques rigoureuses, le cadre mHC offre une solution robuste pour l'évolution des architectures neuronales à grande échelle.\"\n", - "\n", - "voice_gender = 'female_fr'\n", - "# ['female_fr', 'male_fr']" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "📦 Installation de FFmpeg...\n", + "✓ FFmpeg installé\n", + "⚙️ Device: cuda (Tesla T4\n", + "✅ Module XTTS v2 Long Audio chargé\n", + " Device: cuda (Tesla T4\n", + " Voix: ['female_fr', 'male_fr']\n", + " enable_text_splitting: activé par défaut\n", + "💡 torchaudio backend is expected to use 'soundfile' as torchcodec is no longer installed.\n", + "\n", + "============================================================\n", + "EXEMPLE 1: Texte court\n" + ] + } + ], + "source": [ + "import os\n", + "import re\n", + "import gc\n", + "import wave\n", + "import time\n", + "import hashlib\n", + "import warnings\n", + "from pathlib import Path\n", + "from typing import Optional, Union, List, Callable\n", + "from dataclasses import dataclass\n", + "from enum import Enum\n", + "\n", + "import numpy as np\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", + "\n", + "# ==============================================================================\n", + "# INSTALLATION (Colab)\n", + "# ==============================================================================\n", + "\n", + "\n", + "def install_dependencies():\n", + " \"\"\"Installe les dépendances si nécessaire (Colab).\"\"\"\n", + " import subprocess\n", + " import sys\n", + "\n", + " # Installer FFmpeg pour torchcodec\n", + " try:\n", + " print(\"📦 Installation de FFmpeg...\")\n", + " subprocess.check_call([\"apt-get\", \"update\", \"-qq\"])\n", + " subprocess.check_call([\"apt-get\", \"install\", \"-qq\", \"ffmpeg\"])\n", + " print(\"✓ FFmpeg installé\")\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de l'installation de FFmpeg: {e}\")\n", + "\n", + " packages = [\n", + " (\"scipy\", \"scipy\"),\n", + " (\"noisereduce\", \"noisereduce\"),\n", + " (\"TTS\", \"coqui-tts\"),\n", + " ]\n", + "\n", + " for module, package in packages:\n", + " try:\n", + " __import__(module)\n", + " except ImportError:\n", + " print(f\"📦 Installation de {package}...\")\n", + " subprocess.check_call(\n", + " [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", package]\n", + " )\n", + "\n", + " # numpy compatible\n", + " # The previous attempt to install a specific numpy version was causing compatibility issues.\n", + " # Removing this line to allow torchcodec and other libraries to install a compatible numpy version.\n", + " # try:\n", + " # subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"numpy==2.0.2\"])\n", + " # except:\n", + " # pass\n", + "\n", + "\n", + "# ==============================================================================\n", + "# CONFIGURATION\n", + "# ==============================================================================\n", + "\n", + "\n", + "@dataclass\n", + "class TTSConfig:\n", + " \"\"\"Configuration globale du module TTS.\"\"\"\n", + "\n", + " MODEL_NAME: str = \"tts_models/multilingual/multi-dataset/xtts_v2\"\n", + " SAMPLE_RATE: int = 24000\n", + " DEFAULT_LANGUAGE: str = \"fr\"\n", + " GDRIVE_FOLDER: str = \"/content/drive/MyDrive/TTS_Output\"\n", + "\n", + " # Configuration pour audio longs\n", + " MAX_CHARS_PER_CHUNK: int = 500 # Caractères max par chunk pour textes très longs\n", + " CROSSFADE_DURATION: float = 0.05 # Durée du crossfade en secondes\n", + " 
ENABLE_TEXT_SPLITTING: bool = True # Activer le split natif XTTS\n", + "\n", + " PRESET_VOICES: dict = None\n", + "\n", + " def __post_init__(self):\n", + " self.PRESET_VOICES = {\n", + " \"female_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/female.wav\",\n", + " \"male_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/male.wav\",\n", + " }\n", + "\n", + "\n", + "Config = TTSConfig()\n", + "\n", + "# ==============================================================================\n", + "# DEVICE MANAGEMENT\n", + "# ==============================================================================\n", + "\n", + "_device = None\n", + "_device_name = \"cpu\"\n", + "\n", + "\n", + "def detect_device():\n", + " \"\"\"Détecte le meilleur device disponible.\"\"\"\n", + " global _device, _device_name\n", + " import torch\n", + "\n", + " # Essayer TPU\n", + " try:\n", + " import torch_xla.core.xla_model as xm\n", + "\n", + " _device = xm.xla_device()\n", + " _device_name = \"tpu\"\n", + " print(f\"⚙️ Device: TPU\")\n", + " return\n", + " except:\n", + " pass\n", + "\n", + " # Essayer CUDA\n", + " if torch.cuda.is_available():\n", + " _device = torch.device(\"cuda\")\n", + " _device_name = f\"cuda ({torch.cuda.get_device_name(0)}\"\n", + " print(f\"⚙️ Device: {_device_name}\")\n", + " return\n", + "\n", + " # Fallback CPU\n", + " _device = torch.device(\"cpu\")\n", + " _device_name = \"cpu\"\n", + " print(f\"⚙️ Device: CPU\")\n", + "\n", + "\n", + "# ==============================================================================\n", + "# TEXT SPLITTING UTILITIES\n", + "# ==============================================================================\n", + "\n", + "\n", + "class TextSplitter:\n", + " \"\"\"\n", + " Utilitaire pour découper intelligemment les textes longs.\n", + " Préserve la cohérence des phrases et paragraphes.\n", + " \"\"\"\n", + "\n", + " @staticmethod\n", + " def estimate_audio_duration(text: str, chars_per_second: float = 15.0) -> float:\n", + " \"\"\"\n", + " Estime la durée audio pour un texte donné.\n", + " \"\"\"\n", + " return len(text) / chars_per_second\n", + "\n", + " @staticmethod\n", + " def split_into_sentences(text: str) -> List[str]:\n", + " \"\"\"Découpe le texte en phrases.\"\"\"\n", + " # Pattern pour fin de phrase\n", + " pattern = r\"(?<=[.!?])\\s+\"\n", + " sentences = re.split(pattern, text)\n", + " return [s.strip() for s in sentences if s.strip()]\n", + "\n", + " @staticmethod\n", + " def split_into_paragraphs(text: str) -> List[str]:\n", + " \"\"\"Découpe le texte en paragraphes.\"\"\"\n", + " paragraphs = re.split(r\"\\n\\s*\\n\", text)\n", + " return [p.strip() for p in paragraphs if p.strip()]\n", + "\n", + " @classmethod\n", + " def split_for_long_audio(\n", + " cls, text: str, max_chars: int = 500, preserve_sentences: bool = True\n", + " ) -> List[str]:\n", + " \"\"\"\n", + " Découpe un texte long en chunks optimaux pour la synthèse.\n", + " \"\"\"\n", + " # Si texte court, retourner tel quel\n", + " if len(text) <= max_chars:\n", + " return [text]\n", + "\n", + " chunks = []\n", + "\n", + " if preserve_sentences:\n", + " sentences = cls.split_into_sentences(text)\n", + " current_chunk = \"\"\n", + "\n", + " for sentence in sentences:\n", + " # Si la phrase seule dépasse max_chars, la découper\n", + " if len(sentence) > max_chars:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = \"\"\n", + " # Découper la phrase longue par mots\n", + " words = sentence.split()\n", + 
" sub_chunk = \"\"\n", + " for word in words:\n", + " if len(sub_chunk) + len(word) + 1 <= max_chars:\n", + " sub_chunk += \" \" + word if sub_chunk else word\n", + " else:\n", + " if sub_chunk:\n", + " chunks.append(sub_chunk.strip())\n", + " sub_chunk = word\n", + " if sub_chunk:\n", + " current_chunk = sub_chunk\n", + " elif len(current_chunk) + len(sentence) + 1 <= max_chars:\n", + " current_chunk += \" \" + sentence if current_chunk else sentence\n", + " else:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = sentence\n", + "\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " else:\n", + " # Découpage simple par caractères\n", + " for i in range(0, len(text), max_chars):\n", + " chunks.append(text[i : i + max_chars])\n", + "\n", + " return chunks\n", + "\n", + "\n", + "# ==============================================================================\n", + "# AUDIO PROCESSING\n", + "# ==============================================================================\n", + "\n", + "\n", + "class AudioProcessor:\n", + " \"\"\"Processeur audio pour post-traitement et concaténation.\"\"\"\n", + "\n", + " @staticmethod\n", + " def normalize(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:\n", + " \"\"\"Normalise l'audio au niveau cible.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target_linear = 10 ** (target_db / 20)\n", + " audio = audio * (target_linear / peak)\n", + "\n", + " return np.clip(audio, -1.0, 1.0)\n", + "\n", + " @staticmethod\n", + " def crossfade(\n", + " audio1: np.ndarray, audio2: np.ndarray, sample_rate: int, duration: float = 0.05\n", + " ) -> np.ndarray:\n", + " \"\"\"\n", + " Concatène deux segments audio avec crossfade.\n", + " \"\"\"\n", + " # Convertir en float si nécessaire\n", + " if audio1.dtype == np.int16:\n", + " audio1 = audio1.astype(np.float32) / 32768.0\n", + " if audio2.dtype == np.int16:\n", + " audio2 = audio2.astype(np.float32) / 32768.0\n", + "\n", + " fade_samples = int(sample_rate * duration)\n", + "\n", + " # Si audio trop court pour crossfade, concaténer simplement\n", + " if len(audio1) < fade_samples or len(audio2) < fade_samples:\n", + " return np.concatenate([audio1, audio2])\n", + "\n", + " # Créer les courbes de fade\n", + " fade_out = np.linspace(1.0, 0.0, fade_samples)\n", + " fade_in = np.linspace(0.0, 1.0, fade_samples)\n", + "\n", + " # Appliquer le crossfade\n", + " audio1_end = audio1[-fade_samples:] * fade_out\n", + " audio2_start = audio2[:fade_samples] * fade_in\n", + "\n", + " # Assembler\n", + " result = np.concatenate(\n", + " [audio1[:-fade_samples], audio1_end + audio2_start, audio2[fade_samples:]]\n", + " )\n", + "\n", + " return result\n", + "\n", + " @classmethod\n", + " def concatenate_chunks(\n", + " cls,\n", + " audio_chunks: List[np.ndarray],\n", + " sample_rate: int,\n", + " crossfade_duration: float = 0.05,\n", + " ) -> np.ndarray:\n", + " \"\"\"\n", + " Concatène plusieurs chunks audio avec crossfade.\n", + " \"\"\"\n", + " if not audio_chunks:\n", + " return np.array([], dtype=np.float32)\n", + "\n", + " if len(audio_chunks) == 1:\n", + " audio = audio_chunks[0]\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " return audio\n", + "\n", + " result = audio_chunks[0]\n", + " if result.dtype == np.int16:\n", + " result = result.astype(np.float32) / 32768.0\n", + "\n", + " for 
chunk in audio_chunks[1:]:\n", + " result = cls.crossfade(result, chunk, sample_rate, crossfade_duration)\n", + "\n", + " return result\n", + "\n", + " @staticmethod\n", + " def enhance(\n", + " audio: np.ndarray, sample_rate: int, normalize: bool = True, warmth: bool = True\n", + " ) -> np.ndarray:\n", + " \"\"\"Améliore la qualité audio.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " if warmth:\n", + " try:\n", + " from scipy import signal\n", + "\n", + " nyquist = sample_rate / 2\n", + " cutoff = min(300, nyquist * 0.9) / nyquist\n", + " b, a = signal.butter(2, cutoff, btype=\"low\")\n", + " bass = signal.filtfilt(b, a, audio)\n", + " audio = audio + 0.15 * bass\n", + " except ImportError:\n", + " pass\n", + "\n", + " if normalize:\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target = 10 ** (-3.0 / 20)\n", + " audio = audio * (target / peak)\n", + "\n", + " audio = np.clip(audio, -1.0, 1.0)\n", + " return audio\n", + "\n", + "\n", + "# ==============================================================================\n", + "# PROGRESS TRACKER\n", + "# ==============================================================================\n", + "\n", + "\n", + "class ProgressTracker:\n", + " \"\"\"Suivi de progression avec estimation du temps restant.\"\"\"\n", + "\n", + " def __init__(self, total: int, description: str = \"\"):\n", + " self.total = total\n", + " self.current = 0\n", + " self.description = description\n", + " self.start_time = time.time()\n", + " self.chunk_times = []\n", + "\n", + " def update(self, chunk_duration: float = None):\n", + " \"\"\"Met à jour la progression.\"\"\"\n", + " self.current += 1\n", + " if chunk_duration:\n", + " self.chunk_times.append(chunk_duration)\n", + " self._display()\n", + "\n", + " def _display(self):\n", + " \"\"\"Affiche la barre de progression.\"\"\"\n", + " elapsed = time.time() - self.start_time\n", + " percent = (self.current / self.total) * 100\n", + "\n", + " # Estimation temps restant\n", + " if self.chunk_times:\n", + " avg_time = np.mean(self.chunk_times)\n", + " remaining = avg_time * (self.total - self.current)\n", + " eta_str = self._format_time(remaining)\n", + " else:\n", + " eta_str = \"...\"\n", + "\n", + " # Barre de progression\n", + " bar_length = 30\n", + " filled = int(bar_length * self.current / self.total)\n", + " bar = \"█\" * filled + \"░\" * (bar_length - filled)\n", + "\n", + " elapsed_str = self._format_time(elapsed)\n", + "\n", + " print(\n", + " f\"\\r{self.description} [{bar}] {self.current}/{self.total} \"\n", + " f\"({percent:.1f}%) | Temps: {elapsed_str} | ETA: {eta_str}\",\n", + " end=\"\",\n", + " )\n", + "\n", + " if self.current >= self.total:\n", + " print() # Nouvelle ligne à la fin\n", + "\n", + " @staticmethod\n", + " def _format_time(seconds: float) -> str:\n", + " \"\"\"Formate un temps en secondes en HH:MM:SS.\"\"\"\n", + " hours = int(seconds // 3600)\n", + " minutes = int((seconds % 3600) // 60)\n", + " secs = int(seconds % 60)\n", + "\n", + " if hours > 0:\n", + " return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n", + " return f\"{minutes:02d}:{secs:02d}\"\n", + "\n", + "\n", + "# ==============================================================================\n", + "# TTS ENGINE\n", + "# ==============================================================================\n", + "\n", + "_tts_model = None\n", + "_voices_cache = {}\n", + "os.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n", + "\n", + "\n", + "def get_model():\n", + " 
\"\"\"Charge le modèle XTTS v2 avec cache.\"\"\"\n", + " global _tts_model\n", + "\n", + " if _tts_model is None:\n", + " print(\"🔄 Chargement du modèle XTTS v2...\")\n", + " from TTS.api import TTS\n", + "\n", + " _tts_model = TTS(Config.MODEL_NAME)\n", + "\n", + " if _device is not None and _device_name.startswith(\"cuda\"):\n", + " _tts_model = _tts_model.to(_device)\n", + "\n", + " print(\"✓ Modèle chargé\")\n", + "\n", + " return _tts_model\n", + "\n", + "\n", + "def get_voice_path(voice: str) -> str:\n", + " \"\"\"Obtient le chemin vers un fichier de voix.\"\"\"\n", + " global _voices_cache\n", + " import urllib.request\n", + "\n", + " if voice in _voices_cache:\n", + " return _voices_cache[voice]\n", + "\n", + " if os.path.isfile(voice):\n", + " _voices_cache[voice] = voice\n", + " return voice\n", + "\n", + " if voice in Config.PRESET_VOICES:\n", + " url = Config.PRESET_VOICES[voice]\n", + " path = f\"/tmp/{voice}.wav\"\n", + "\n", + " if not os.path.exists(path):\n", + " print(f\"📥 Téléchargement de la voix '{voice}'...\")\n", + " urllib.request.urlretrieve(url, path)\n", + "\n", + " _voices_cache[voice] = path\n", + " return path\n", + "\n", + " raise FileNotFoundError(f\"Voix '{voice}' non trouvée\")\n", + "\n", + "\n", + "# ==============================================================================\n", + "# MAIN SYNTHESIS FUNCTIONS\n", + "# ==============================================================================\n", + "\n", + "\n", + "def synthesize_chunk(\n", + " text: str, voice_path: str, language: str = \"fr\", enable_text_splitting: bool = True\n", + ") -> np.ndarray:\n", + " \"\"\"\n", + " Synthétise un chunk de texte en audio via l'inférence directe (Low-Level).\n", + " Bypass total du SpeakerManager pour éviter le bug FileNotFoundError .pth\n", + " \"\"\"\n", + " model_wrapper = get_model()\n", + "\n", + " # 1. Accès \"chirurgical\" au modèle interne XTTS\n", + " # C'est lui qui fait le travail, sans la couche de gestion de fichiers buggée\n", + " if hasattr(model_wrapper, \"synthesizer\"):\n", + " xtts_model = model_wrapper.synthesizer.tts_model\n", + " else:\n", + " # Cas rare ou structure différente, on tente l'accès direct\n", + " xtts_model = model_wrapper.tts_model\n", + "\n", + " # 2. Calcul manuel des latents (Empreinte vocale)\n", + " # On transforme le fichier WAV en vecteurs mathématiques\n", + " try:\n", + " gpt_cond_latent, speaker_embedding = xtts_model.get_conditioning_latents(\n", + " audio_path=[voice_path], gpt_cond_len=30, max_ref_length=60\n", + " )\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur calcul latents: {e}\")\n", + " raise e\n", + "\n", + " # 3. 
Inférence directe\n", + " # On appelle la fonction de génération pure, sans passer par tts()\n", + " try:\n", + " out = xtts_model.inference(\n", + " text=text,\n", + " language=language,\n", + " gpt_cond_latent=gpt_cond_latent,\n", + " speaker_embedding=speaker_embedding,\n", + " temperature=0.7, # Paramètre standard pour la créativité\n", + " length_penalty=1.0, # Pénalité de longueur\n", + " repetition_penalty=2.0, # Évite les bégaiements\n", + " top_k=50,\n", + " top_p=0.8,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + "\n", + " # Le résultat est généralement dans un dictionnaire sous la clé 'wav'\n", + " if isinstance(out, dict) and \"wav\" in out:\n", + " wav = out[\"wav\"]\n", + " else:\n", + " wav = out\n", + "\n", + " # S'assurer que c'est bien un numpy array sur CPU\n", + " if hasattr(wav, \"cpu\"):\n", + " wav = wav.cpu().numpy()\n", + " if isinstance(wav, list):\n", + " wav = np.array(wav, dtype=np.float32)\n", + "\n", + " return wav\n", + "\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de l'inférence directe : {e}\")\n", + " raise e\n", + "\n", + "\n", + "def text_to_speech_long(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " max_chars_per_chunk: int = None,\n", + " show_progress: bool = True,\n", + " enable_text_splitting: bool = True,\n", + ") -> dict:\n", + " \"\"\"\n", + " Génère un fichier audio long (> 1 heure) à partir de texte.\n", + " \"\"\"\n", + " import torch\n", + "\n", + " # Configuration\n", + " max_chars = max_chars_per_chunk or Config.MAX_CHARS_PER_CHUNK\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Estimation initiale\n", + " estimated_duration = TextSplitter.estimate_audio_duration(text)\n", + " print(f\"\\n📝 Texte: {len(text):,} caractères\")\n", + " print(f\"⏱️ Durée estimée: {ProgressTracker._format_time(estimated_duration)}\")\n", + "\n", + " # Découper le texte\n", + " chunks = TextSplitter.split_for_long_audio(text, max_chars=max_chars)\n", + " print(f\"📦 Chunks: {len(chunks)}\")\n", + "\n", + " # Initialiser la progression\n", + " progress = None\n", + " if show_progress:\n", + " progress = ProgressTracker(len(chunks), \"🎙️ Synthèse\")\n", + "\n", + " # Générer l'audio chunk par chunk\n", + " audio_chunks = []\n", + "\n", + " for i, chunk in enumerate(chunks):\n", + " chunk_start = time.time()\n", + "\n", + " try:\n", + " wav = synthesize_chunk(\n", + " text=chunk,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + " audio_chunks.append(wav)\n", + "\n", + " except Exception as e:\n", + " print(f\"\\n⚠️ Erreur chunk {i + 1}: {e}\")\n", + " # Continuer avec les autres chunks\n", + " continue\n", + "\n", + " # Libérer la mémoire GPU périodiquement\n", + " if _device_name.startswith(\"cuda\") and (i + 1) % 10 == 0:\n", + " torch.cuda.empty_cache()\n", + "\n", + " chunk_duration = time.time() - chunk_start\n", + " if progress:\n", + " progress.update(chunk_duration)\n", + "\n", + " if not audio_chunks:\n", + " raise RuntimeError(\"Aucun audio généré\")\n", + "\n", + " print(\"\\n🔗 Concaténation des chunks...\")\n", + "\n", + " # Concaténer avec crossfade\n", + " final_audio = AudioProcessor.concatenate_chunks(\n", + " audio_chunks, Config.SAMPLE_RATE, Config.CROSSFADE_DURATION\n", + " )\n", + "\n", + " # Libérer les chunks de la mémoire\n", 
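+        "    # Remarque : les chunks sont des tableaux NumPy côté CPU ; del + gc.collect()\n",
+        "    # libèrent cette mémoire hôte, et torch.cuda.empty_cache() se contente de rendre au\n",
+        "    # pilote les blocs GPU déjà libérés par PyTorch après l'inférence.\n",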
+ " del audio_chunks\n", + " gc.collect()\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " print(\"✨ Post-traitement...\")\n", + " final_audio = AudioProcessor.enhance(\n", + " final_audio, Config.SAMPLE_RATE, normalize=True, warmth=True\n", + " )\n", + " else:\n", + " final_audio = AudioProcessor.normalize(final_audio)\n", + "\n", + " # Convertir en int16\n", + " final_audio = (final_audio * 32767).astype(np.int16)\n", + "\n", + " # Générer le nom de fichier\n", + " if output_path is None:\n", + " h = hashlib.md5(text[:100].encode()).hexdigest()[:8]\n", + " output_path = f\"tts_long_{voice}_{h}.wav\"\n", + "\n", + " # Dossier de sortie\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarder\n", + " print(f\"💾 Sauvegarde: {final_path}\")\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(final_audio.tobytes())\n", + "\n", + " # Calculer la durée réelle\n", + " duration = len(final_audio) / Config.SAMPLE_RATE\n", + "\n", + " print(f\"\\n✅ Audio généré avec succès!\")\n", + " print(f\" 📁 Fichier: {final_path}\")\n", + " print(f\" ⏱️ Durée: {ProgressTracker._format_time(duration)}\")\n", + " print(f\" 📦 Chunks: {len(chunks)}\")\n", + " print(f\" 🎤 Voix: {voice}\")\n", + "\n", + " return {\n", + " \"path\": str(final_path),\n", + " \"sample_rate\": Config.SAMPLE_RATE,\n", + " \"duration_seconds\": duration,\n", + " \"duration_formatted\": ProgressTracker._format_time(duration),\n", + " \"audio_data\": final_audio,\n", + " \"voice\": voice,\n", + " \"language\": language,\n", + " \"device\": _device_name,\n", + " \"chunks_count\": len(chunks),\n", + " \"text_length\": len(text),\n", + " }\n", + "\n", + "\n", + "def text_to_speech(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " enable_text_splitting: bool = True,\n", + ") -> dict:\n", + " \"\"\"\n", + " Génère un fichier audio à partir de texte avec XTTS v2.\n", + " \"\"\"\n", + " # Basculer automatiquement vers la version long pour textes > 10000 chars\n", + " if len(text) > 10000:\n", + " print(\"📢 Texte long détecté - utilisation de text_to_speech_long()\")\n", + " return text_to_speech_long(\n", + " text=text,\n", + " voice=voice,\n", + " language=language,\n", + " output_path=output_path,\n", + " enhance=enhance,\n", + " use_gdrive=use_gdrive,\n", + " gdrive_folder=gdrive_folder,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + "\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Générer l'audio avec enable_text_splitting\n", + " wav = synthesize_chunk(\n", + " text=text,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " audio = AudioProcessor.enhance(wav, Config.SAMPLE_RATE)\n", + " else:\n", + " audio = AudioProcessor.normalize(wav)\n", + "\n", + " audio = (audio * 32767).astype(np.int16)\n", + "\n", + " # Nom de fichier\n", + " if 
output_path is None:\n", + " h = hashlib.md5(text.encode()).hexdigest()[:8]\n", + " output_path = f\"tts_{voice}_{h}.wav\"\n", + "\n", + " # Dossier de sortie\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarder\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(audio.tobytes())\n", + "\n", + " duration = len(audio) / Config.SAMPLE_RATE\n", + "\n", + " print(f\"✓ Audio généré: {final_path}\")\n", + " print(f\" Durée: {duration:.2f}s | Voix: {voice}\")\n", + "\n", + " return {\n", + " \"path\": str(final_path),\n", + " \"sample_rate\": Config.SAMPLE_RATE,\n", + " \"duration_seconds\": duration,\n", + " \"audio_data\": audio,\n", + " \"voice\": voice,\n", + " \"language\": language,\n", + " \"device\": _device_name,\n", + " }\n", + "\n", + "\n", + "# ==============================================================================\n", + "# UTILITIES\n", + "# ==============================================================================\n", + "\n", + "\n", + "def preview_audio(result: dict) -> None:\n", + " \"\"\"Prévisualise l'audio dans le notebook.\"\"\"\n", + " from IPython.display import Audio, display\n", + "\n", + " audio = result[\"audio_data\"]\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " display(Audio(audio, rate=result[\"sample_rate\"]))\n", + "\n", + "\n", + "def list_voices() -> list:\n", + " \"\"\"Liste les voix disponibles.\"\"\"\n", + " return list(Config.PRESET_VOICES.keys())\n", + "\n", + "\n", + "def list_languages() -> list:\n", + " \"\"\"Liste les langues supportées.\"\"\"\n", + " return [\n", + " \"en\",\n", + " \"es\",\n", + " \"fr\",\n", + " \"de\",\n", + " \"it\",\n", + " \"pt\",\n", + " \"pl\",\n", + " \"tr\",\n", + " \"ru\",\n", + " \"nl\",\n", + " \"cs\",\n", + " \"ar\",\n", + " \"zh-cn\",\n", + " \"ja\",\n", + " \"hu\",\n", + " \"ko\",\n", + " \"hi\",\n", + " ]\n", + "\n", + "\n", + "def clear_cache():\n", + " \"\"\"Libère la mémoire.\"\"\"\n", + " global _tts_model\n", + " import torch\n", + "\n", + " _tts_model = None\n", + " gc.collect()\n", + "\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + "\n", + " print(\"✓ Cache vidé\")\n", + "\n", + "\n", + "def estimate_duration(text: str) -> dict:\n", + " \"\"\"\n", + " Estime la durée audio pour un texte.\n", + " \"\"\"\n", + " duration = TextSplitter.estimate_audio_duration(text)\n", + " chunks = len(TextSplitter.split_for_long_audio(text))\n", + "\n", + " return {\n", + " \"chars\": len(text),\n", + " \"estimated_seconds\": duration,\n", + " \"estimated_formatted\": ProgressTracker._format_time(duration),\n", + " \"chunks_estimate\": chunks,\n", + " }\n", + "\n", + "\n", + "# ==============================================================================\n", + "# ALIASES\n", + "# ==============================================================================\n", + "\n", + "tts = text_to_speech\n", + "tts_long = text_to_speech_long\n", + "\n", + "\n", + "# ==============================================================================\n", + "# INITIALIZATION\n", + "# ==============================================================================\n", + "\n", + "\n", 
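+        "# Note : l'appel à init() est facultatif, detect_device() étant déjà exécuté à l'import\n",
+        "# (bloc « Auto-init » plus bas) ; le modèle XTTS v2 (~1,9 Go) n'est téléchargé et chargé\n",
+        "# qu'au premier appel de get_model(), via text_to_speech() ou text_to_speech_long().\n",
+        "\n",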
+ "def init():\n", + " \"\"\"Initialise le module.\"\"\"\n", + " detect_device()\n", + " print(\"✅ Module XTTS v2 Long Audio chargé\")\n", + " print(f\" Device: {_device_name}\")\n", + " print(f\" Voix: {list_voices()}\")\n", + " print(f\" enable_text_splitting: activé par défaut\")\n", + " # Add this line to explicitly set torchaudio backend\n", + " try:\n", + " import torchaudio\n", + "\n", + " # This line is intentionally commented out as set_audio_backend is not available in all torchaudio versions.\n", + " # The `soundfile` library should be picked up automatically if torchcodec is not installed.\n", + " # torchaudio.set_audio_backend(\"soundfile\")\n", + " print(\n", + " \"💡 torchaudio backend is expected to use 'soundfile' as torchcodec is no longer installed.\"\n", + " )\n", + " except ImportError:\n", + " print(\"⚠️ torchaudio not found.\")\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de la configuration de torchaudio: {e}\")\n", + "\n", + "\n", + "# Auto-init\n", + "if __name__ != \"__main__\":\n", + " try:\n", + " detect_device()\n", + " except:\n", + " pass\n", + "\n", + "\n", + "# ==============================================================================\n", + "# EXAMPLE USAGE\n", + "# ==============================================================================\n", + "\n", + "if __name__ == \"__main__\":\n", + " # Installation si nécessaire\n", + " install_dependencies()\n", + "\n", + " # Initialisation\n", + " init()\n", + "\n", + " # Exemple avec texte court\n", + " print(\"\\n\" + \"=\" * 60)\n", + " print(\"EXEMPLE 1: Texte court\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "FREsMU-QLEc4" + }, + "outputs": [], + "source": [ + "text_to_speech_to_synthetise = \"Ce document présente les Manifold-Constrained Hyper-Connections (mHC), une architecture novatrice conçue par DeepSeek-AI pour stabiliser l'entraînement des grands modèles de langage. Bien que les Hyper-Connections (HC) classiques améliorent les performances en élargissant le flux résiduel, leur nature non contrainte provoque souvent une instabilité numérique et des problèmes de divergence du signal. Pour remédier à cela, les auteurs utilisent l'algorithme de Sinkhorn-Knopp afin de projeter les connexions sur une variété de matrices doublement stochastiques, préservant ainsi la propriété de mappage d'identité. Cette approche garantit une propagation saine du signal tout en optimisant l'efficacité matérielle grâce à la fusion de noyaux et à des stratégies de mémorisation sélective. Les résultats expérimentaux démontrent que mHC surpasse les méthodes existantes en termes de scalabilité et de capacités de raisonnement sur divers tests de référence. En intégrant ces contraintes géométriques rigoureuses, le cadre mHC offre une solution robuste pour l'évolution des architectures neuronales à grande échelle.\"\n", + "\n", + "voice_gender = \"female_fr\"\n", + "# ['female_fr', 'male_fr']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 250 }, + "id": "2Any3vzyK8zF", + "outputId": "2d3ff69d-09bc-4334-f13f-7be141559de8" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "jIKtDA5hweJP", - "outputId": "8b9cbf18-4496-4c26-d2c8-50b97a3710d2", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m862.8/862.8 kB\u001b[0m \u001b[31m57.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m345.1/345.1 kB\u001b[0m \u001b[31m25.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.2/56.2 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m997.3/997.3 kB\u001b[0m \u001b[31m62.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m648.4/648.4 kB\u001b[0m \u001b[31m55.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.5/163.5 kB\u001b[0m \u001b[31m18.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for docopt (setup.py) ... \u001b[?25l\u001b[?25hdone\n" - ] - } - ], - "source": [ - "# Installation des dépendances\n", - "!pip install -q scipy noisereduce\n", - "\n", - "# Installation du fork maintenu (supporte Python 3.12+)\n", - "!pip install -q coqui-tts\n", - "\n", - "\n", - "# Installation des dépendances\n", - "!pip install -q scipy noisereduce\n", - "!pip install -q numpy==2.0.2\n", - "\n", - "# Installation de soundfile pour le chargement audio (évite le bug torchcodec)\n", - "!pip install -q soundfile\n", - "\n", - "# Installation du fork maintenu (supporte Python 3.12+)\n", - "!pip install -q coqui-tts\n", - "\n", - "# Note: torchcodec n'est plus nécessaire - on utilise soundfile comme backend" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Voix disponibles: ['female_fr', 'male_fr']\n", + "📥 Téléchargement de la voix 'female_fr'...\n", + "🔄 Chargement du modèle XTTS v2...\n" + ] }, { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1FQhKQ1IE4iX", - "outputId": "68446e3c-ac8f-474b-a329-dfa6b8d6aec9" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "📦 Installation de FFmpeg...\n", - "✓ FFmpeg installé\n", - "⚙️ Device: cuda (Tesla T4\n", - "✅ Module XTTS v2 Long Audio chargé\n", - " Device: cuda (Tesla T4\n", - " Voix: ['female_fr', 'male_fr']\n", - " enable_text_splitting: activé par défaut\n", - "💡 torchaudio backend is expected to use 'soundfile' as torchcodec is no longer installed.\n", - "\n", - "============================================================\n", - "EXEMPLE 1: Texte court\n" - ] - } - ], - "source": [ - "import os\n", - "import re\n", - "import gc\n", - "import wave\n", - "import time\n", - "import hashlib\n", - "import warnings\n", - "from pathlib import Path\n", - "from typing import Optional, Union, List, Callable\n", - "from dataclasses import dataclass\n", - "from enum import Enum\n", - "\n", - "import numpy as np\n", - "\n", - "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", - "\n", - "# ==============================================================================\n", - "# INSTALLATION (Colab)\n", - "# 
==============================================================================\n", - "\n", - "def install_dependencies():\n", - " \"\"\"Installe les dépendances si nécessaire (Colab).\"\"\"\n", - " import subprocess\n", - " import sys\n", - "\n", - " # Installer FFmpeg pour torchcodec\n", - " try:\n", - " print(\"📦 Installation de FFmpeg...\")\n", - " subprocess.check_call([\"apt-get\", \"update\", \"-qq\"])\n", - " subprocess.check_call([\"apt-get\", \"install\", \"-qq\", \"ffmpeg\"])\n", - " print(\"✓ FFmpeg installé\")\n", - " except Exception as e:\n", - " print(f\"⚠️ Erreur lors de l'installation de FFmpeg: {e}\")\n", - "\n", - " packages = [\n", - " (\"scipy\", \"scipy\"),\n", - " (\"noisereduce\", \"noisereduce\"),\n", - " (\"TTS\", \"coqui-tts\"),\n", - " ]\n", - "\n", - " for module, package in packages:\n", - " try:\n", - " __import__(module)\n", - " except ImportError:\n", - " print(f\"📦 Installation de {package}...\")\n", - " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", package])\n", - "\n", - " # numpy compatible\n", - " # The previous attempt to install a specific numpy version was causing compatibility issues.\n", - " # Removing this line to allow torchcodec and other libraries to install a compatible numpy version.\n", - " # try:\n", - " # subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"numpy==2.0.2\"])\n", - " # except:\n", - " # pass\n", - "\n", - "# ==============================================================================\n", - "# CONFIGURATION\n", - "# ==============================================================================\n", - "\n", - "@dataclass\n", - "class TTSConfig:\n", - " \"\"\"Configuration globale du module TTS.\"\"\"\n", - " MODEL_NAME: str = \"tts_models/multilingual/multi-dataset/xtts_v2\"\n", - " SAMPLE_RATE: int = 24000\n", - " DEFAULT_LANGUAGE: str = \"fr\"\n", - " GDRIVE_FOLDER: str = \"/content/drive/MyDrive/TTS_Output\"\n", - "\n", - " # Configuration pour audio longs\n", - " MAX_CHARS_PER_CHUNK: int = 500 # Caractères max par chunk pour textes très longs\n", - " CROSSFADE_DURATION: float = 0.05 # Durée du crossfade en secondes\n", - " ENABLE_TEXT_SPLITTING: bool = True # Activer le split natif XTTS\n", - "\n", - " PRESET_VOICES: dict = None\n", - "\n", - " def __post_init__(self):\n", - " self.PRESET_VOICES = {\n", - " \"female_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/female.wav\",\n", - " \"male_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/male.wav\",\n", - " }\n", - "\n", - "Config = TTSConfig()\n", - "\n", - "# ==============================================================================\n", - "# DEVICE MANAGEMENT\n", - "# ==============================================================================\n", - "\n", - "_device = None\n", - "_device_name = \"cpu\"\n", - "\n", - "def detect_device():\n", - " \"\"\"Détecte le meilleur device disponible.\"\"\"\n", - " global _device, _device_name\n", - " import torch\n", - "\n", - " # Essayer TPU\n", - " try:\n", - " import torch_xla.core.xla_model as xm\n", - " _device = xm.xla_device()\n", - " _device_name = \"tpu\"\n", - " print(f\"⚙️ Device: TPU\")\n", - " return\n", - " except:\n", - " pass\n", - "\n", - " # Essayer CUDA\n", - " if torch.cuda.is_available():\n", - " _device = torch.device(\"cuda\")\n", - " _device_name = f\"cuda ({torch.cuda.get_device_name(0)}\";\n", - " print(f\"⚙️ Device: {_device_name}\")\n", - " return\n", - "\n", - " # Fallback CPU\n", - " _device 
= torch.device(\"cpu\")\n", - " _device_name = \"cpu\"\n", - " print(f\"⚙️ Device: CPU\")\n", - "\n", - "# ==============================================================================\n", - "# TEXT SPLITTING UTILITIES\n", - "# ==============================================================================\n", - "\n", - "class TextSplitter:\n", - " \"\"\"\n", - " Utilitaire pour découper intelligemment les textes longs.\n", - " Préserve la cohérence des phrases et paragraphes.\n", - " \"\"\"\n", - "\n", - " @staticmethod\n", - " def estimate_audio_duration(text: str, chars_per_second: float = 15.0) -> float:\n", - " \"\"\"\n", - " Estime la durée audio pour un texte donné.\n", - " \"\"\"\n", - " return len(text) / chars_per_second\n", - "\n", - " @staticmethod\n", - " def split_into_sentences(text: str) -> List[str]:\n", - " \"\"\"Découpe le texte en phrases.\"\"\"\n", - " # Pattern pour fin de phrase\n", - " pattern = r'(?<=[.!?])\\s+'\n", - " sentences = re.split(pattern, text)\n", - " return [s.strip() for s in sentences if s.strip()]\n", - "\n", - " @staticmethod\n", - " def split_into_paragraphs(text: str) -> List[str]:\n", - " \"\"\"Découpe le texte en paragraphes.\"\"\"\n", - " paragraphs = re.split(r'\\n\\s*\\n', text)\n", - " return [p.strip() for p in paragraphs if p.strip()]\n", - "\n", - " @classmethod\n", - " def split_for_long_audio(\n", - " cls,\n", - " text: str,\n", - " max_chars: int = 500,\n", - " preserve_sentences: bool = True\n", - " ) -> List[str]:\n", - " \"\"\"\n", - " Découpe un texte long en chunks optimaux pour la synthèse.\n", - " \"\"\"\n", - " # Si texte court, retourner tel quel\n", - " if len(text) <= max_chars:\n", - " return [text]\n", - "\n", - " chunks = []\n", - "\n", - " if preserve_sentences:\n", - " sentences = cls.split_into_sentences(text)\n", - " current_chunk = \"\"\n", - "\n", - " for sentence in sentences:\n", - " # Si la phrase seule dépasse max_chars, la découper\n", - " if len(sentence) > max_chars:\n", - " if current_chunk:\n", - " chunks.append(current_chunk.strip())\n", - " current_chunk = \"\"\n", - " # Découper la phrase longue par mots\n", - " words = sentence.split()\n", - " sub_chunk = \"\"\n", - " for word in words:\n", - " if len(sub_chunk) + len(word) + 1 <= max_chars:\n", - " sub_chunk += \" \" + word if sub_chunk else word\n", - " else:\n", - " if sub_chunk:\n", - " chunks.append(sub_chunk.strip())\n", - " sub_chunk = word\n", - " if sub_chunk:\n", - " current_chunk = sub_chunk\n", - " elif len(current_chunk) + len(sentence) + 1 <= max_chars:\n", - " current_chunk += \" \" + sentence if current_chunk else sentence\n", - " else:\n", - " if current_chunk:\n", - " chunks.append(current_chunk.strip())\n", - " current_chunk = sentence\n", - "\n", - " if current_chunk:\n", - " chunks.append(current_chunk.strip())\n", - " else:\n", - " # Découpage simple par caractères\n", - " for i in range(0, len(text), max_chars):\n", - " chunks.append(text[i:i + max_chars])\n", - "\n", - " return chunks\n", - "\n", - "\n", - "# ==============================================================================\n", - "# AUDIO PROCESSING\n", - "# ==============================================================================\n", - "\n", - "class AudioProcessor:\n", - " \"\"\"Processeur audio pour post-traitement et concaténation.\"\"\"\n", - "\n", - " @staticmethod\n", - " def normalize(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:\n", - " \"\"\"Normalise l'audio au niveau cible.\"\"\"\n", - " if audio.dtype == np.int16:\n", - " audio = 
audio.astype(np.float32) / 32768.0\n", - "\n", - " peak = np.max(np.abs(audio))\n", - " if peak > 0:\n", - " target_linear = 10 ** (target_db / 20)\n", - " audio = audio * (target_linear / peak)\n", - "\n", - " return np.clip(audio, -1.0, 1.0)\n", - "\n", - " @staticmethod\n", - " def crossfade(\n", - " audio1: np.ndarray,\n", - " audio2: np.ndarray,\n", - " sample_rate: int,\n", - " duration: float = 0.05\n", - " ) -> np.ndarray:\n", - " \"\"\"\n", - " Concatène deux segments audio avec crossfade.\n", - " \"\"\"\n", - " # Convertir en float si nécessaire\n", - " if audio1.dtype == np.int16:\n", - " audio1 = audio1.astype(np.float32) / 32768.0\n", - " if audio2.dtype == np.int16:\n", - " audio2 = audio2.astype(np.float32) / 32768.0\n", - "\n", - " fade_samples = int(sample_rate * duration)\n", - "\n", - " # Si audio trop court pour crossfade, concaténer simplement\n", - " if len(audio1) < fade_samples or len(audio2) < fade_samples:\n", - " return np.concatenate([audio1, audio2])\n", - "\n", - " # Créer les courbes de fade\n", - " fade_out = np.linspace(1.0, 0.0, fade_samples)\n", - " fade_in = np.linspace(0.0, 1.0, fade_samples)\n", - "\n", - " # Appliquer le crossfade\n", - " audio1_end = audio1[-fade_samples:] * fade_out\n", - " audio2_start = audio2[:fade_samples] * fade_in\n", - "\n", - " # Assembler\n", - " result = np.concatenate([\n", - " audio1[:-fade_samples],\n", - " audio1_end + audio2_start,\n", - " audio2[fade_samples:]\n", - " ])\n", - "\n", - " return result\n", - "\n", - " @classmethod\n", - " def concatenate_chunks(\n", - " cls,\n", - " audio_chunks: List[np.ndarray],\n", - " sample_rate: int,\n", - " crossfade_duration: float = 0.05\n", - " ) -> np.ndarray:\n", - " \"\"\"\n", - " Concatène plusieurs chunks audio avec crossfade.\n", - " \"\"\"\n", - " if not audio_chunks:\n", - " return np.array([], dtype=np.float32)\n", - "\n", - " if len(audio_chunks) == 1:\n", - " audio = audio_chunks[0]\n", - " if audio.dtype == np.int16:\n", - " audio = audio.astype(np.float32) / 32768.0\n", - " return audio\n", - "\n", - " result = audio_chunks[0]\n", - " if result.dtype == np.int16:\n", - " result = result.astype(np.float32) / 32768.0\n", - "\n", - " for chunk in audio_chunks[1:]:\n", - " result = cls.crossfade(result, chunk, sample_rate, crossfade_duration)\n", - "\n", - " return result\n", - "\n", - " @staticmethod\n", - " def enhance(\n", - " audio: np.ndarray,\n", - " sample_rate: int,\n", - " normalize: bool = True,\n", - " warmth: bool = True\n", - " ) -> np.ndarray:\n", - " \"\"\"Améliore la qualité audio.\"\"\"\n", - " if audio.dtype == np.int16:\n", - " audio = audio.astype(np.float32) / 32768.0\n", - "\n", - " if warmth:\n", - " try:\n", - " from scipy import signal\n", - " nyquist = sample_rate / 2\n", - " cutoff = min(300, nyquist * 0.9) / nyquist\n", - " b, a = signal.butter(2, cutoff, btype='low')\n", - " bass = signal.filtfilt(b, a, audio)\n", - " audio = audio + 0.15 * bass\n", - " except ImportError:\n", - " pass\n", - "\n", - " if normalize:\n", - " peak = np.max(np.abs(audio))\n", - " if peak > 0:\n", - " target = 10 ** (-3.0 / 20)\n", - " audio = audio * (target / peak)\n", - "\n", - " audio = np.clip(audio, -1.0, 1.0)\n", - " return audio\n", - "\n", - "\n", - "# ==============================================================================\n", - "# PROGRESS TRACKER\n", - "# ==============================================================================\n", - "\n", - "class ProgressTracker:\n", - " \"\"\"Suivi de progression avec estimation du temps 
restant.\"\"\"\n", - "\n", - " def __init__(self, total: int, description: str = \"\"):\n", - " self.total = total\n", - " self.current = 0\n", - " self.description = description\n", - " self.start_time = time.time()\n", - " self.chunk_times = []\n", - "\n", - " def update(self, chunk_duration: float = None):\n", - " \"\"\"Met à jour la progression.\"\"\"\n", - " self.current += 1\n", - " if chunk_duration:\n", - " self.chunk_times.append(chunk_duration)\n", - " self._display()\n", - "\n", - " def _display(self):\n", - " \"\"\"Affiche la barre de progression.\"\"\"\n", - " elapsed = time.time() - self.start_time\n", - " percent = (self.current / self.total) * 100\n", - "\n", - " # Estimation temps restant\n", - " if self.chunk_times:\n", - " avg_time = np.mean(self.chunk_times)\n", - " remaining = avg_time * (self.total - self.current)\n", - " eta_str = self._format_time(remaining)\n", - " else:\n", - " eta_str = \"...\"\n", - "\n", - " # Barre de progression\n", - " bar_length = 30\n", - " filled = int(bar_length * self.current / self.total)\n", - " bar = \"█\" * filled + \"░\" * (bar_length - filled)\n", - "\n", - " elapsed_str = self._format_time(elapsed)\n", - "\n", - " print(f\"\\r{self.description} [{bar}] {self.current}/{self.total} \"\n", - " f\"({percent:.1f}%) | Temps: {elapsed_str} | ETA: {eta_str}\", end=\"\")\n", - "\n", - " if self.current >= self.total:\n", - " print() # Nouvelle ligne à la fin\n", - "\n", - " @staticmethod\n", - " def _format_time(seconds: float) -> str:\n", - " \"\"\"Formate un temps en secondes en HH:MM:SS.\"\"\"\n", - " hours = int(seconds // 3600)\n", - " minutes = int((seconds % 3600) // 60)\n", - " secs = int(seconds % 60)\n", - "\n", - " if hours > 0:\n", - " return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n", - " return f\"{minutes:02d}:{secs:02d}\"\n", - "\n", - "\n", - "# ==============================================================================\n", - "# TTS ENGINE\n", - "# ==============================================================================\n", - "\n", - "_tts_model = None\n", - "_voices_cache = {}\n", - "os.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n", - "\n", - "def get_model():\n", - " \"\"\"Charge le modèle XTTS v2 avec cache.\"\"\"\n", - " global _tts_model\n", - "\n", - " if _tts_model is None:\n", - " print(\"🔄 Chargement du modèle XTTS v2...\")\n", - " from TTS.api import TTS\n", - "\n", - " _tts_model = TTS(Config.MODEL_NAME)\n", - "\n", - " if _device is not None and _device_name.startswith(\"cuda\"):\n", - " _tts_model = _tts_model.to(_device)\n", - "\n", - " print(\"✓ Modèle chargé\")\n", - "\n", - " return _tts_model\n", - "\n", - "\n", - "def get_voice_path(voice: str) -> str:\n", - " \"\"\"Obtient le chemin vers un fichier de voix.\"\"\"\n", - " global _voices_cache\n", - " import urllib.request\n", - "\n", - " if voice in _voices_cache:\n", - " return _voices_cache[voice]\n", - "\n", - " if os.path.isfile(voice):\n", - " _voices_cache[voice] = voice\n", - " return voice\n", - "\n", - " if voice in Config.PRESET_VOICES:\n", - " url = Config.PRESET_VOICES[voice]\n", - " path = f\"/tmp/{voice}.wav\"\n", - "\n", - " if not os.path.exists(path):\n", - " print(f\"📥 Téléchargement de la voix '{voice}'...\")\n", - " urllib.request.urlretrieve(url, path)\n", - "\n", - " _voices_cache[voice] = path\n", - " return path\n", - "\n", - " raise FileNotFoundError(f\"Voix '{voice}' non trouvée\")\n", - "\n", - "\n", - "# ==============================================================================\n", - "# MAIN SYNTHESIS FUNCTIONS\n", 
- "# ==============================================================================\n", - "\n", - "def synthesize_chunk(\n", - " text: str,\n", - " voice_path: str,\n", - " language: str = \"fr\",\n", - " enable_text_splitting: bool = True\n", - ") -> np.ndarray:\n", - " \"\"\"\n", - " Synthétise un chunk de texte en audio via l'inférence directe (Low-Level).\n", - " Bypass total du SpeakerManager pour éviter le bug FileNotFoundError .pth\n", - " \"\"\"\n", - " model_wrapper = get_model()\n", - "\n", - " # 1. Accès \"chirurgical\" au modèle interne XTTS\n", - " # C'est lui qui fait le travail, sans la couche de gestion de fichiers buggée\n", - " if hasattr(model_wrapper, 'synthesizer'):\n", - " xtts_model = model_wrapper.synthesizer.tts_model\n", - " else:\n", - " # Cas rare ou structure différente, on tente l'accès direct\n", - " xtts_model = model_wrapper.tts_model\n", - "\n", - " # 2. Calcul manuel des latents (Empreinte vocale)\n", - " # On transforme le fichier WAV en vecteurs mathématiques\n", - " try:\n", - " gpt_cond_latent, speaker_embedding = xtts_model.get_conditioning_latents(\n", - " audio_path=[voice_path],\n", - " gpt_cond_len=30,\n", - " max_ref_length=60\n", - " )\n", - " except Exception as e:\n", - " print(f\"⚠️ Erreur calcul latents: {e}\")\n", - " raise e\n", - "\n", - " # 3. Inférence directe\n", - " # On appelle la fonction de génération pure, sans passer par tts()\n", - " try:\n", - " out = xtts_model.inference(\n", - " text=text,\n", - " language=language,\n", - " gpt_cond_latent=gpt_cond_latent,\n", - " speaker_embedding=speaker_embedding,\n", - " temperature=0.7, # Paramètre standard pour la créativité\n", - " length_penalty=1.0, # Pénalité de longueur\n", - " repetition_penalty=2.0, # Évite les bégaiements\n", - " top_k=50,\n", - " top_p=0.8,\n", - " enable_text_splitting=enable_text_splitting\n", - " )\n", - "\n", - " # Le résultat est généralement dans un dictionnaire sous la clé 'wav'\n", - " if isinstance(out, dict) and 'wav' in out:\n", - " wav = out['wav']\n", - " else:\n", - " wav = out\n", - "\n", - " # S'assurer que c'est bien un numpy array sur CPU\n", - " if hasattr(wav, 'cpu'):\n", - " wav = wav.cpu().numpy()\n", - " if isinstance(wav, list):\n", - " wav = np.array(wav, dtype=np.float32)\n", - "\n", - " return wav\n", - "\n", - " except Exception as e:\n", - " print(f\"⚠️ Erreur lors de l'inférence directe : {e}\")\n", - " raise e\n", - "\n", - "\n", - "def text_to_speech_long(\n", - " text: str,\n", - " voice: str = \"female_fr\",\n", - " language: str = \"fr\",\n", - " output_path: Optional[str] = None,\n", - " enhance: bool = False,\n", - " use_gdrive: bool = False,\n", - " gdrive_folder: str = None,\n", - " max_chars_per_chunk: int = None,\n", - " show_progress: bool = True,\n", - " enable_text_splitting: bool = True\n", - ") -> dict:\n", - " \"\"\"\n", - " Génère un fichier audio long (> 1 heure) à partir de texte.\n", - " \"\"\"\n", - " import torch\n", - "\n", - " # Configuration\n", - " max_chars = max_chars_per_chunk or Config.MAX_CHARS_PER_CHUNK\n", - " voice_path = get_voice_path(voice)\n", - "\n", - " # Estimation initiale\n", - " estimated_duration = TextSplitter.estimate_audio_duration(text)\n", - " print(f\"\\n📝 Texte: {len(text):,} caractères\")\n", - " print(f\"⏱️ Durée estimée: {ProgressTracker._format_time(estimated_duration)}\")\n", - "\n", - " # Découper le texte\n", - " chunks = TextSplitter.split_for_long_audio(text, max_chars=max_chars)\n", - " print(f\"📦 Chunks: {len(chunks)}\")\n", - "\n", - " # Initialiser la 
progression\n", - " progress = None\n", - " if show_progress:\n", - " progress = ProgressTracker(len(chunks), \"🎙️ Synthèse\")\n", - "\n", - " # Générer l'audio chunk par chunk\n", - " audio_chunks = []\n", - "\n", - " for i, chunk in enumerate(chunks):\n", - " chunk_start = time.time()\n", - "\n", - " try:\n", - " wav = synthesize_chunk(\n", - " text=chunk,\n", - " voice_path=voice_path,\n", - " language=language,\n", - " enable_text_splitting=enable_text_splitting\n", - " )\n", - " audio_chunks.append(wav)\n", - "\n", - " except Exception as e:\n", - " print(f\"\\n⚠️ Erreur chunk {i+1}: {e}\")\n", - " # Continuer avec les autres chunks\n", - " continue\n", - "\n", - " # Libérer la mémoire GPU périodiquement\n", - " if _device_name.startswith(\"cuda\") and (i + 1) % 10 == 0:\n", - " torch.cuda.empty_cache()\n", - "\n", - " chunk_duration = time.time() - chunk_start\n", - " if progress:\n", - " progress.update(chunk_duration)\n", - "\n", - " if not audio_chunks:\n", - " raise RuntimeError(\"Aucun audio généré\")\n", - "\n", - " print(\"\\n🔗 Concaténation des chunks...\")\n", - "\n", - " # Concaténer avec crossfade\n", - " final_audio = AudioProcessor.concatenate_chunks(\n", - " audio_chunks,\n", - " Config.SAMPLE_RATE,\n", - " Config.CROSSFADE_DURATION\n", - " )\n", - "\n", - " # Libérer les chunks de la mémoire\n", - " del audio_chunks\n", - " gc.collect()\n", - " if _device_name.startswith(\"cuda\"):\n", - " torch.cuda.empty_cache()\n", - "\n", - " # Post-traitement\n", - " if enhance:\n", - " print(\"✨ Post-traitement...\")\n", - " final_audio = AudioProcessor.enhance(\n", - " final_audio,\n", - " Config.SAMPLE_RATE,\n", - " normalize=True,\n", - " warmth=True\n", - " )\n", - " else:\n", - " final_audio = AudioProcessor.normalize(final_audio)\n", - "\n", - " # Convertir en int16\n", - " final_audio = (final_audio * 32767).astype(np.int16)\n", - "\n", - " # Générer le nom de fichier\n", - " if output_path is None:\n", - " h = hashlib.md5(text[:100].encode()).hexdigest()[:8]\n", - " output_path = f\"tts_long_{voice}_{h}.wav\"\n", - "\n", - " # Dossier de sortie\n", - " if use_gdrive:\n", - " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", - " folder.mkdir(parents=True, exist_ok=True)\n", - " final_path = folder / Path(output_path).name\n", - " else:\n", - " final_path = Path(output_path)\n", - "\n", - " # Sauvegarder\n", - " print(f\"💾 Sauvegarde: {final_path}\")\n", - " with wave.open(str(final_path), \"wb\") as wav_file:\n", - " wav_file.setnchannels(1)\n", - " wav_file.setsampwidth(2)\n", - " wav_file.setframerate(Config.SAMPLE_RATE)\n", - " wav_file.writeframes(final_audio.tobytes())\n", - "\n", - " # Calculer la durée réelle\n", - " duration = len(final_audio) / Config.SAMPLE_RATE\n", - "\n", - " print(f\"\\n✅ Audio généré avec succès!\")\n", - " print(f\" 📁 Fichier: {final_path}\")\n", - " print(f\" ⏱️ Durée: {ProgressTracker._format_time(duration)}\")\n", - " print(f\" 📦 Chunks: {len(chunks)}\")\n", - " print(f\" 🎤 Voix: {voice}\")\n", - "\n", - " return {\n", - " 'path': str(final_path),\n", - " 'sample_rate': Config.SAMPLE_RATE,\n", - " 'duration_seconds': duration,\n", - " 'duration_formatted': ProgressTracker._format_time(duration),\n", - " 'audio_data': final_audio,\n", - " 'voice': voice,\n", - " 'language': language,\n", - " 'device': _device_name,\n", - " 'chunks_count': len(chunks),\n", - " 'text_length': len(text)\n", - " }\n", - "\n", - "\n", - "def text_to_speech(\n", - " text: str,\n", - " voice: str = \"female_fr\",\n", - " language: str = \"fr\",\n", - " 
output_path: Optional[str] = None,\n", - " enhance: bool = False,\n", - " use_gdrive: bool = False,\n", - " gdrive_folder: str = None,\n", - " enable_text_splitting: bool = True\n", - ") -> dict:\n", - " \"\"\"\n", - " Génère un fichier audio à partir de texte avec XTTS v2.\n", - " \"\"\"\n", - " # Basculer automatiquement vers la version long pour textes > 10000 chars\n", - " if len(text) > 10000:\n", - " print(\"📢 Texte long détecté - utilisation de text_to_speech_long()\")\n", - " return text_to_speech_long(\n", - " text=text,\n", - " voice=voice,\n", - " language=language,\n", - " output_path=output_path,\n", - " enhance=enhance,\n", - " use_gdrive=use_gdrive,\n", - " gdrive_folder=gdrive_folder,\n", - " enable_text_splitting=enable_text_splitting\n", - " )\n", - "\n", - " voice_path = get_voice_path(voice)\n", - "\n", - " # Générer l'audio avec enable_text_splitting\n", - " wav = synthesize_chunk(\n", - " text=text,\n", - " voice_path=voice_path,\n", - " language=language,\n", - " enable_text_splitting=enable_text_splitting\n", - " )\n", - "\n", - " # Post-traitement\n", - " if enhance:\n", - " audio = AudioProcessor.enhance(wav, Config.SAMPLE_RATE)\n", - " else:\n", - " audio = AudioProcessor.normalize(wav)\n", - "\n", - " audio = (audio * 32767).astype(np.int16)\n", - "\n", - " # Nom de fichier\n", - " if output_path is None:\n", - " h = hashlib.md5(text.encode()).hexdigest()[:8]\n", - " output_path = f\"tts_{voice}_{h}.wav\"\n", - "\n", - " # Dossier de sortie\n", - " if use_gdrive:\n", - " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", - " folder.mkdir(parents=True, exist_ok=True)\n", - " final_path = folder / Path(output_path).name\n", - " else:\n", - " final_path = Path(output_path)\n", - "\n", - " # Sauvegarder\n", - " with wave.open(str(final_path), \"wb\") as wav_file:\n", - " wav_file.setnchannels(1)\n", - " wav_file.setsampwidth(2)\n", - " wav_file.setframerate(Config.SAMPLE_RATE)\n", - " wav_file.writeframes(audio.tobytes())\n", - "\n", - " duration = len(audio) / Config.SAMPLE_RATE\n", - "\n", - " print(f\"✓ Audio généré: {final_path}\")\n", - " print(f\" Durée: {duration:.2f}s | Voix: {voice}\")\n", - "\n", - " return {\n", - " 'path': str(final_path),\n", - " 'sample_rate': Config.SAMPLE_RATE,\n", - " 'duration_seconds': duration,\n", - " 'audio_data': audio,\n", - " 'voice': voice,\n", - " 'language': language,\n", - " 'device': _device_name\n", - " }\n", - "\n", - "\n", - "# ==============================================================================\n", - "# UTILITIES\n", - "# ==============================================================================\n", - "\n", - "def preview_audio(result: dict) -> None:\n", - " \"\"\"Prévisualise l'audio dans le notebook.\"\"\"\n", - " from IPython.display import Audio, display\n", - "\n", - " audio = result['audio_data']\n", - " if audio.dtype == np.int16:\n", - " audio = audio.astype(np.float32) / 32768.0\n", - "\n", - " display(Audio(audio, rate=result['sample_rate']))\n", - "\n", - "\n", - "def list_voices() -> list:\n", - " \"\"\"Liste les voix disponibles.\"\"\"\n", - " return list(Config.PRESET_VOICES.keys())\n", - "\n", - "\n", - "def list_languages() -> list:\n", - " \"\"\"Liste les langues supportées.\"\"\"\n", - " return [\"en\", \"es\", \"fr\", \"de\", \"it\", \"pt\", \"pl\", \"tr\",\n", - " \"ru\", \"nl\", \"cs\", \"ar\", \"zh-cn\", \"ja\", \"hu\", \"ko\", \"hi\"]\n", - "\n", - "\n", - "def clear_cache():\n", - " \"\"\"Libère la mémoire.\"\"\"\n", - " global _tts_model\n", - " import torch\n", - "\n", - 
" _tts_model = None\n", - " gc.collect()\n", - "\n", - " if _device_name.startswith(\"cuda\"):\n", - " torch.cuda.empty_cache()\n", - "\n", - " print(\"✓ Cache vidé\")\n", - "\n", - "\n", - "def estimate_duration(text: str) -> dict:\n", - " \"\"\"\n", - " Estime la durée audio pour un texte.\n", - " \"\"\"\n", - " duration = TextSplitter.estimate_audio_duration(text)\n", - " chunks = len(TextSplitter.split_for_long_audio(text))\n", - "\n", - " return {\n", - " 'chars': len(text),\n", - " 'estimated_seconds': duration,\n", - " 'estimated_formatted': ProgressTracker._format_time(duration),\n", - " 'chunks_estimate': chunks\n", - " }\n", - "\n", - "\n", - "# ==============================================================================\n", - "# ALIASES\n", - "# ==============================================================================\n", - "\n", - "tts = text_to_speech\n", - "tts_long = text_to_speech_long\n", - "\n", - "\n", - "# ==============================================================================\n", - "# INITIALIZATION\n", - "# ==============================================================================\n", - "\n", - "def init():\n", - " \"\"\"Initialise le module.\"\"\"\n", - " detect_device()\n", - " print(\"✅ Module XTTS v2 Long Audio chargé\")\n", - " print(f\" Device: {_device_name}\")\n", - " print(f\" Voix: {list_voices()}\")\n", - " print(f\" enable_text_splitting: activé par défaut\")\n", - " # Add this line to explicitly set torchaudio backend\n", - " try:\n", - " import torchaudio\n", - " # This line is intentionally commented out as set_audio_backend is not available in all torchaudio versions.\n", - " # The `soundfile` library should be picked up automatically if torchcodec is not installed.\n", - " # torchaudio.set_audio_backend(\"soundfile\")\n", - " print(\"💡 torchaudio backend is expected to use 'soundfile' as torchcodec is no longer installed.\")\n", - " except ImportError:\n", - " print(\"⚠️ torchaudio not found.\")\n", - " except Exception as e:\n", - " print(f\"⚠️ Erreur lors de la configuration de torchaudio: {e}\")\n", - "\n", - "\n", - "# Auto-init\n", - "if __name__ != \"__main__\":\n", - " try:\n", - " detect_device()\n", - " except:\n", - " pass\n", - "\n", - "\n", - "# ==============================================================================\n", - "# EXAMPLE USAGE\n", - "# ==============================================================================\n", - "\n", - "if __name__ == \"__main__\":\n", - " # Installation si nécessaire\n", - " install_dependencies()\n", - "\n", - " # Initialisation\n", - " init()\n", - "\n", - " # Exemple avec texte court\n", - " print(\"\\n\" + \"=\"*60)\n", - " print(\"EXEMPLE 1: Texte court\")\n" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1.87G/1.87G [00:35<00:00, 52.2MiB/s]\n", + "4.37kiB [00:00, 5.36MiB/s]\n", + "361kiB [00:00, 98.7MiB/s]\n", + "100%|██████████| 32.0/32.0 [00:00<00:00, 63.3kiB/s]\n", + "100%|██████████| 7.75M/7.75M [00:00<00:00, 102MiB/s]\n" + ] }, { - "cell_type": "code", - "source": [ - "\n", - "text_to_speech_to_synthetise= \"Ce document présente les Manifold-Constrained Hyper-Connections (mHC), une architecture novatrice conçue par DeepSeek-AI pour stabiliser l'entraînement des grands modèles de langage. Bien que les Hyper-Connections (HC) classiques améliorent les performances en élargissant le flux résiduel, leur nature non contrainte provoque souvent une instabilité numérique et des problèmes de divergence du signal. 
Pour remédier à cela, les auteurs utilisent l'algorithme de Sinkhorn-Knopp afin de projeter les connexions sur une variété de matrices doublement stochastiques, préservant ainsi la propriété de mappage d'identité. Cette approche garantit une propagation saine du signal tout en optimisant l'efficacité matérielle grâce à la fusion de noyaux et à des stratégies de mémorisation sélective. Les résultats expérimentaux démontrent que mHC surpasse les méthodes existantes en termes de scalabilité et de capacités de raisonnement sur divers tests de référence. En intégrant ces contraintes géométriques rigoureuses, le cadre mHC offre une solution robuste pour l'évolution des architectures neuronales à grande échelle.\"\n", - "\n", - "voice_gender = 'female_fr'\n", - "# ['female_fr', 'male_fr']" - ], - "metadata": { - "id": "FREsMU-QLEc4" - }, - "execution_count": 4, - "outputs": [] + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Modèle chargé\n", + "✓ Audio généré: tts_female_fr_a22d596a.wav\n", + " Durée: 61.17s | Voix: female_fr\n" + ] }, { - "cell_type": "code", - "source": [ - "# -----------------------------------------------------------------------------\n", - "# CELLULE 3: Exemples d'utilisation\n", - "# -----------------------------------------------------------------------------\n", - "\n", - "# Montage Google Drive (optionnel)\n", - "# mount_gdrive()\n", - "\n", - "# Liste des voix disponibles\n", - "print(\"Voix disponibles:\", list_voices())\n", - "\n", - "# Génération simple\n", - "result = text_to_speech(text_to_speech_to_synthetise,voice=voice_gender)\n", - "\n", - "# Prévisualisation\n", - "preview_audio(result)" + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 250 - }, - "id": "2Any3vzyK8zF", - "outputId": "2d3ff69d-09bc-4334-f13f-7be141559de8" - }, - "execution_count": 5, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Voix disponibles: ['female_fr', 'male_fr']\n", - "📥 Téléchargement de la voix 'female_fr'...\n", - "🔄 Chargement du modèle XTTS v2...\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "100%|██████████| 1.87G/1.87G [00:35<00:00, 52.2MiB/s]\n", - "4.37kiB [00:00, 5.36MiB/s]\n", - "361kiB [00:00, 98.7MiB/s]\n", - "100%|██████████| 32.0/32.0 [00:00<00:00, 63.3kiB/s]\n", - "100%|██████████| 7.75M/7.75M [00:00<00:00, 102MiB/s]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "✓ Modèle chargé\n", - "✓ Audio généré: tts_female_fr_a22d596a.wav\n", - " Durée: 61.17s | Voix: female_fr\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - " \n", - " " - ] - }, - "metadata": {} - } + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# -----------------------------------------------------------------------------\n", + "# CELLULE 3: Exemples d'utilisation\n", + "# -----------------------------------------------------------------------------\n", + "\n", + "# Montage Google Drive (optionnel)\n", + "# mount_gdrive()\n", + "\n", + "# Liste des voix disponibles\n", + "print(\"Voix disponibles:\", list_voices())\n", + "\n", + "# Génération simple\n", + "result = text_to_speech(text_to_speech_to_synthetise, voice=voice_gender)\n", + "\n", + "# Prévisualisation\n", + "preview_audio(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + 
"base_uri": "https://localhost:8080/" }, + "id": "2565nagRK0eb", + "outputId": "38fb657a-44c2-4ce0-c39b-3cebe22149d2" + }, + "outputs": [ { - "cell_type": "code", - "source": [ - "\n", - "# -----------------------------------------------------------------------------\n", - "# CELLULE 3: Exemples d'utilisation\n", - "# -----------------------------------------------------------------------------\n", - "\n", - "# Montage Google Drive (optionnel)\n", - "# mount_gdrive()\n", - "\n", - "# Liste des voix disponibles\n", - "print(\"Voix disponibles:\", list_voices())\n", - "\n", - "# Génération simple\n", - "result = text_to_speech(text_to_speech_to_synthetise,voice=voice_gender)\n", - "\n", - "# Prévisualisation\n", - "preview_audio(result)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "2565nagRK0eb", - "outputId": "38fb657a-44c2-4ce0-c39b-3cebe22149d2" - }, - "execution_count": 6, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Voix disponibles: ['female_fr', 'male_fr']\n", - "✓ Audio généré: tts_female_fr_a22d596a.wav\n", - " Durée: 63.25s | Voix: female_fr\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - " \n", - " " - ] - }, - "metadata": {} - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Voix disponibles: ['female_fr', 'male_fr']\n", + "✓ Audio généré: tts_female_fr_a22d596a.wav\n", + " Durée: 63.25s | Voix: female_fr\n" + ] }, { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "Naxv1wHEp6NQ", - "outputId": "33d8bb87-0781-465a-b3d5-9b35f85de770", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "error", - "ename": "NameError", - "evalue": "name 'stop' is not defined", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipython-input-3957423419.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstop\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'stop' is not defined" - ] - } + "data": { + "text/html": [ + "\n", + " \n", + " " ], - "source": [ - "stop" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ulSK6K1op63B" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BRhVnXgSE7Yd" - }, - "outputs": [], - "source": [ - "# Lire le fichier\n", - "with open(\"mon_texte_long.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " texte_complet = f.read()\n", - "\n", - "# Lancer la génération\n", - "text_to_speech_long(\n", - " text=texte_complet,\n", - " voice=\"female_fr\",\n", - " language=\"fr\"\n", - ")" + "text/plain": [ + "" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4o0EdnBHp7la" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KaWW0-DIMy7R" - }, - "outputs": [], - "source": [] + }, + "metadata": {}, + "output_type": "display_data" } - ], - "metadata": { - "accelerator": "GPU", + ], + "source": [ + "# -----------------------------------------------------------------------------\n", + "# CELLULE 3: Exemples d'utilisation\n", + "# 
-----------------------------------------------------------------------------\n", + "\n", + "# Montage Google Drive (optionnel)\n", + "# mount_gdrive()\n", + "\n", + "# Liste des voix disponibles\n", + "print(\"Voix disponibles:\", list_voices())\n", + "\n", + "# Génération simple\n", + "result = text_to_speech(text_to_speech_to_synthetise, voice=voice_gender)\n", + "\n", + "# Prévisualisation\n", + "preview_audio(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { "colab": { - "gpuType": "T4", - "provenance": [], - "authorship_tag": "ABX9TyMWr0Dv6vrMdJNJnihMF5Pg", - "include_colab_link": true + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" + "id": "Naxv1wHEp6NQ", + "outputId": "33d8bb87-0781-465a-b3d5-9b35f85de770" + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'stop' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipython-input-3957423419.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstop\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'stop' is not defined" + ] } + ], + "source": [ + "stop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ulSK6K1op63B" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BRhVnXgSE7Yd" + }, + "outputs": [], + "source": [ + "# Lire le fichier\n", + "with open(\"mon_texte_long.txt\", \"r\", encoding=\"utf-8\") as f:\n", + " texte_complet = f.read()\n", + "\n", + "# Lancer la génération\n", + "text_to_speech_long(text=texte_complet, voice=\"female_fr\", language=\"fr\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4o0EdnBHp7la" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KaWW0-DIMy7R" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "authorship_tag": "ABX9TyMWr0Dv6vrMdJNJnihMF5Pg", + "gpuType": "T4", + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file