diff --git a/long_TTS_xtts_V6.ipynb b/long_TTS_xtts_V6.ipynb new file mode 100644 index 000000000000..7285e0566c00 --- /dev/null +++ b/long_TTS_xtts_V6.ipynb @@ -0,0 +1,1498 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "header" + }, + "source": [ + "# TTS XTTS v2 - Long Audio Generator v6\n", + "\n", + "**Version 4.0** - Compatible PyTorch 2.9+ (Colab 2026)\n", + "\n", + "Fonctionnalites:\n", + "- Generation audio longue duree (> 1 heure)\n", + "- Fix torchcodec/torchaudio pour PyTorch 2.9+\n", + "- Chunking intelligent par paragraphes\n", + "- Crossfade entre chunks\n", + "- Barre de progression avec ETA\n", + "- Support Google Drive\n", + "\n", + "**Auteur:** Bruno | **Corrections:** Gemini, Claude" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "install_deps", + "outputId": "55cd944b-c5fa-498c-fb32-78d151e5787e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/1.8 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m81.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for docopt (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "torch = 2.9.0+cu126 -> installation de torchcodec==0.9.* via https://download.pytorch.org/whl/cu126\n", + "torchcodec detecte: True\n", + "torchcodec version: 0.9.1+cu126\n" + ] + } + ], + "source": [ + "# Installation des dependances\n", + "# --------------------------------------------------------------\n", + "# Remarque importante (PyTorch>=2.9) :\n", + "# - Coqui TTS exige la bibliotheque `torchcodec` pour l'I/O audio. (cf. message d'erreur)\n", + "# - La version de torchcodec doit etre compatible avec votre version de torch.\n", + "#\n", + "# Sources (documentation officielle) :\n", + "# - coqui-tts: installer torch, torchaudio et (seulement pour torch>=2.9) torchcodec.\n", + "# - torchcodec: table de compatibilite torch <-> torchcodec + note CUDA/CPU.\n", + "\n", + "!pip install -q -U pip\n", + "!pip install -q numpy==2.0.2 scipy soundfile noisereduce\n", + "!pip install -q -U coqui-tts\n", + "\n", + "# Installer torchcodec dans une version compatible avec torch (et CUDA si detecte)\n", + "import sys, subprocess, re\n", + "\n", + "try:\n", + " import torch\n", + "except Exception as e:\n", + " raise RuntimeError(\n", + " \"PyTorch (torch) n'est pas importable. 
Installez d'abord torch/torchaudio, \"\n", + " \"puis relancez cette cellule.\"\n", + " ) from e\n", + "\n", + "\n", + "def _torch_major_minor(ver: str) -> str:\n", + " base = ver.split(\"+\")[0]\n", + " parts = base.split(\".\")\n", + " return \".\".join(parts[:2]) if len(parts) >= 2 else base\n", + "\n", + "\n", + "torch_ver = torch.__version__\n", + "mm = _torch_major_minor(torch_ver)\n", + "\n", + "# Mapping base sur la table de compatibilite officielle torchcodec.\n", + "if mm == \"2.10\":\n", + " torchcodec_spec = \"torchcodec==0.10.*\"\n", + "elif mm == \"2.9\":\n", + " torchcodec_spec = \"torchcodec==0.9.*\"\n", + "elif mm == \"2.8\":\n", + " torchcodec_spec = \"torchcodec==0.7.*\"\n", + "else:\n", + " torchcodec_spec = \"torchcodec\"\n", + "\n", + "# Si votre torch est un build CUDA (ex: 2.9.0+cu126), on tente d'installer torchcodec\n", + "# depuis l'index PyTorch correspondant. Sinon, on installe la version CPU depuis PyPI.\n", + "index_url = None\n", + "if \"+\" in torch_ver:\n", + " build = torch_ver.split(\"+\", 1)[1]\n", + " if build.startswith(\"cu\"):\n", + " index_url = f\"https://download.pytorch.org/whl/{build}\"\n", + "\n", + "print(\n", + " f\"torch = {torch_ver} -> installation de {torchcodec_spec}\"\n", + " + (f\" via {index_url}\" if index_url else \" (CPU PyPI)\")\n", + ")\n", + "\n", + "\n", + "def _pip_install_torchcodec():\n", + " cmd = [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"-U\", torchcodec_spec]\n", + " if index_url:\n", + " cmd += [\"--index-url\", index_url]\n", + " subprocess.check_call(cmd)\n", + "\n", + "\n", + "try:\n", + " _pip_install_torchcodec()\n", + "except Exception as e:\n", + " # Fallback : essayer sans index_url (CPU PyPI).\n", + " if index_url:\n", + " print(f\"⚠️ Echec avec l'index PyTorch ({index_url}). 
Tentative CPU via PyPI…\")\n", + " index_url = None\n", + " _pip_install_torchcodec()\n", + " else:\n", + " raise\n", + "\n", + "# Verification (metadonnees pip)\n", + "import importlib.util, importlib.metadata\n", + "\n", + "print(\"torchcodec detecte:\", importlib.util.find_spec(\"torchcodec\") is not None)\n", + "print(\"torchcodec version:\", importlib.metadata.version(\"torchcodec\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "main_module", + "outputId": "f17c6fea-cd4e-438a-f628-070f6bfde7b3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Patch torchaudio applique (backend: soundfile)\n", + "⚙️ Device: cuda (Tesla T4)\n", + "\n", + "============================================================\n", + "TTS XTTS v2 - Long Audio Generator v4\n", + "Compatible PyTorch 2.9+ (fix torchcodec)\n", + "============================================================\n", + "Voix disponibles: ['female_fr', 'male_fr']\n" + ] + } + ], + "source": [ + "# -*- coding: utf-8 -*-\n", + "\"\"\"\n", + "TTS XTTS v2 - Version Long Audio v4\n", + "====================================\n", + "\n", + "Module de synthese vocale haute qualite utilisant Coqui XTTS v2.\n", + "Compatible avec PyTorch 2.9+ (fix torchcodec/torchaudio).\n", + "\n", + "Auteur: Bruno\n", + "Date: Janvier 2026\n", + "Corrections: Gemini, Claude\n", + "\"\"\"\n", + "\n", + "# ==============================================================================\n", + "# IMPORTS STANDARDS (APRES LE FIX)\n", + "# ==============================================================================\n", + "\n", + "import os\n", + "import re\n", + "import gc\n", + "import wave\n", + "import time\n", + "import hashlib\n", + "import warnings\n", + "import inspect\n", + "from pathlib import Path\n", + "from typing import Optional, List\n", + "from dataclasses import dataclass\n", + "import numpy as np\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", + "\n", + "# ==============================================================================\n", + "# TORCHAUDIO FIX - Backend soundfile\n", + "# ==============================================================================\n", + "\n", + "\n", + "def _patch_torchaudio():\n", + " \"\"\"\n", + " Patch torchaudio.load pour utiliser le backend soundfile au lieu de torchcodec.\n", + " Resout l'erreur: \"Could not load libtorchcodec\" sur Colab avec PyTorch 2.9+.\n", + " \"\"\"\n", + " try:\n", + " import torchaudio\n", + "\n", + " # Verifier si deja patche\n", + " if hasattr(torchaudio, \"_original_load_patched\"):\n", + " return\n", + "\n", + " # Sauvegarder la fonction originale\n", + " _original_load = torchaudio.load\n", + "\n", + " def _patched_load(filepath, *args, **kwargs):\n", + " \"\"\"\n", + " Version patchee de torchaudio.load qui utilise soundfile comme backend.\n", + " \"\"\"\n", + " # Forcer le backend soundfile si non specifie\n", + " if \"backend\" not in kwargs:\n", + " kwargs[\"backend\"] = \"soundfile\"\n", + "\n", + " try:\n", + " return _original_load(filepath, *args, **kwargs)\n", + " except Exception as e:\n", + " # Si soundfile echoue, essayer sans specifier de backend\n", + " if \"backend\" in kwargs:\n", + " del kwargs[\"backend\"]\n", + " try:\n", + " return _original_load(filepath, *args, **kwargs)\n", + " except:\n", + " pass\n", + " raise e\n", + "\n", + " # Appliquer le patch\n", + " torchaudio.load = _patched_load\n", + " 
torchaudio._original_load_patched = True\n", + " print(\"✓ Patch torchaudio applique (backend: soundfile)\")\n", + "\n", + " except ImportError:\n", + " pass\n", + " except Exception as e:\n", + " print(f\"⚠️ Impossible de patcher torchaudio: {e}\")\n", + "\n", + "\n", + "# Appliquer le patch torchaudio\n", + "_patch_torchaudio()\n", + "\n", + "# ==============================================================================\n", + "# CONFIGURATION\n", + "# ==============================================================================\n", + "\n", + "\n", + "@dataclass\n", + "class TTSConfig:\n", + " \"\"\"Configuration globale du module TTS.\"\"\"\n", + "\n", + " MODEL_NAME: str = \"tts_models/multilingual/multi-dataset/xtts_v2\"\n", + " SAMPLE_RATE: int = 24000\n", + " DEFAULT_LANGUAGE: str = \"fr\"\n", + " GDRIVE_FOLDER: str = \"/content/drive/MyDrive/TTS_Output\"\n", + " MAX_CHARS_PER_CHUNK: int = 500\n", + " CROSSFADE_DURATION: float = 0.05\n", + " ENABLE_TEXT_SPLITTING: bool = True\n", + " PRESET_VOICES: dict = None\n", + "\n", + " def __post_init__(self):\n", + " self.PRESET_VOICES = {\n", + " \"female_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/female.wav\",\n", + " \"male_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/male.wav\",\n", + " }\n", + "\n", + "\n", + "Config = TTSConfig()\n", + "\n", + "# ==============================================================================\n", + "# DEVICE MANAGEMENT\n", + "# ==============================================================================\n", + "\n", + "_device = None\n", + "_device_name = \"cpu\"\n", + "\n", + "\n", + "def detect_device():\n", + " \"\"\"Detecte le meilleur device disponible.\"\"\"\n", + " global _device, _device_name\n", + " import torch\n", + "\n", + " # Essayer TPU\n", + " try:\n", + " import torch_xla.core.xla_model as xm\n", + "\n", + " _device = xm.xla_device()\n", + " _device_name = \"tpu\"\n", + " print(f\"⚙️ Device: TPU\")\n", + " return\n", + " except:\n", + " pass\n", + "\n", + " # Essayer CUDA\n", + " if torch.cuda.is_available():\n", + " _device = torch.device(\"cuda\")\n", + " _device_name = f\"cuda ({torch.cuda.get_device_name(0)})\"\n", + " print(f\"⚙️ Device: {_device_name}\")\n", + " return\n", + "\n", + " # Fallback CPU\n", + " _device = torch.device(\"cpu\")\n", + " _device_name = \"cpu\"\n", + " print(f\"⚙️ Device: CPU\")\n", + "\n", + "\n", + "# ==============================================================================\n", + "# TEXT SPLITTING UTILITIES\n", + "# ==============================================================================\n", + "\n", + "\n", + "class TextSplitter:\n", + " \"\"\"Utilitaire pour decouper intelligemment les textes longs.\"\"\"\n", + "\n", + " @staticmethod\n", + " def estimate_audio_duration(text: str, chars_per_second: float = 15.0) -> float:\n", + " \"\"\"Estime la duree audio en secondes.\"\"\"\n", + " return len(text) / chars_per_second\n", + "\n", + " @staticmethod\n", + " def split_into_sentences(text: str) -> List[str]:\n", + " \"\"\"Decoupe le texte en phrases.\"\"\"\n", + " pattern = r\"(?<=[.!?])\\s+\"\n", + " sentences = re.split(pattern, text)\n", + " return [s.strip() for s in sentences if s.strip()]\n", + "\n", + " @staticmethod\n", + " def split_into_paragraphs(text: str) -> List[str]:\n", + " \"\"\"Decoupe le texte en paragraphes.\"\"\"\n", + " paragraphs = re.split(r\"\\n\\s*\\n\", text)\n", + " return [p.strip() for p in paragraphs if p.strip()]\n", + "\n", + " @classmethod\n", + " def 
split_for_long_audio(\n", + " cls, text: str, max_chars: int = 500, preserve_sentences: bool = True\n", + " ) -> List[str]:\n", + " \"\"\"Decoupe le texte pour generation audio longue.\"\"\"\n", + " if len(text) <= max_chars:\n", + " return [text]\n", + "\n", + " chunks = []\n", + " if preserve_sentences:\n", + " sentences = cls.split_into_sentences(text)\n", + " current_chunk = \"\"\n", + "\n", + " for sentence in sentences:\n", + " if len(sentence) > max_chars:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = \"\"\n", + " # Decouper la phrase trop longue par mots\n", + " words = sentence.split()\n", + " sub_chunk = \"\"\n", + " for word in words:\n", + " if len(sub_chunk) + len(word) + 1 <= max_chars:\n", + " sub_chunk += \" \" + word if sub_chunk else word\n", + " else:\n", + " if sub_chunk:\n", + " chunks.append(sub_chunk.strip())\n", + " sub_chunk = word\n", + " if sub_chunk:\n", + " current_chunk = sub_chunk\n", + " elif len(current_chunk) + len(sentence) + 1 <= max_chars:\n", + " current_chunk += \" \" + sentence if current_chunk else sentence\n", + " else:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = sentence\n", + "\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " else:\n", + " for i in range(0, len(text), max_chars):\n", + " chunks.append(text[i : i + max_chars])\n", + "\n", + " return chunks\n", + "\n", + "\n", + "# ==============================================================================\n", + "# AUDIO PROCESSING\n", + "# ==============================================================================\n", + "\n", + "\n", + "class AudioProcessor:\n", + " \"\"\"Processeur audio pour post-traitement et concatenation.\"\"\"\n", + "\n", + " @staticmethod\n", + " def normalize(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:\n", + " \"\"\"Normalise l'audio au niveau cible.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target_linear = 10 ** (target_db / 20)\n", + " audio = audio * (target_linear / peak)\n", + " return np.clip(audio, -1.0, 1.0)\n", + "\n", + " @staticmethod\n", + " def crossfade(\n", + " audio1: np.ndarray, audio2: np.ndarray, sample_rate: int, duration: float = 0.05\n", + " ) -> np.ndarray:\n", + " \"\"\"Concatene deux segments audio avec crossfade.\"\"\"\n", + " if audio1.dtype == np.int16:\n", + " audio1 = audio1.astype(np.float32) / 32768.0\n", + " if audio2.dtype == np.int16:\n", + " audio2 = audio2.astype(np.float32) / 32768.0\n", + "\n", + " fade_samples = int(sample_rate * duration)\n", + " if len(audio1) < fade_samples or len(audio2) < fade_samples:\n", + " return np.concatenate([audio1, audio2])\n", + "\n", + " fade_out = np.linspace(1.0, 0.0, fade_samples)\n", + " fade_in = np.linspace(0.0, 1.0, fade_samples)\n", + " audio1_end = audio1[-fade_samples:] * fade_out\n", + " audio2_start = audio2[:fade_samples] * fade_in\n", + "\n", + " return np.concatenate(\n", + " [audio1[:-fade_samples], audio1_end + audio2_start, audio2[fade_samples:]]\n", + " )\n", + "\n", + " @classmethod\n", + " def concatenate_chunks(\n", + " cls,\n", + " audio_chunks: List[np.ndarray],\n", + " sample_rate: int,\n", + " crossfade_duration: float = 0.05,\n", + " ) -> np.ndarray:\n", + " \"\"\"Concatene plusieurs chunks audio avec crossfade.\"\"\"\n", + " if not audio_chunks:\n", + " return np.array([], dtype=np.float32)\n", + " if 
len(audio_chunks) == 1:\n", + " audio = audio_chunks[0]\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " return audio\n", + "\n", + " result = audio_chunks[0]\n", + " if result.dtype == np.int16:\n", + " result = result.astype(np.float32) / 32768.0\n", + " for chunk in audio_chunks[1:]:\n", + " result = cls.crossfade(result, chunk, sample_rate, crossfade_duration)\n", + " return result\n", + "\n", + " @staticmethod\n", + " def enhance(\n", + " audio: np.ndarray, sample_rate: int, normalize: bool = True, warmth: bool = True\n", + " ) -> np.ndarray:\n", + " \"\"\"Ameliore l'audio avec normalisation et warmth.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " # Ajouter de la chaleur (boost basses frequences)\n", + " if warmth:\n", + " try:\n", + " from scipy import signal\n", + "\n", + " nyquist = sample_rate / 2\n", + " cutoff = min(300, nyquist * 0.9) / nyquist\n", + " b, a = signal.butter(2, cutoff, btype=\"low\")\n", + " bass = signal.filtfilt(b, a, audio)\n", + " audio = audio + 0.15 * bass\n", + " except ImportError:\n", + " pass\n", + "\n", + " # Normaliser\n", + " if normalize:\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target = 10 ** (-3.0 / 20)\n", + " audio = audio * (target / peak)\n", + "\n", + " return np.clip(audio, -1.0, 1.0)\n", + "\n", + "\n", + "# ==============================================================================\n", + "# PROGRESS TRACKER\n", + "# ==============================================================================\n", + "\n", + "\n", + "class ProgressTracker:\n", + " \"\"\"Suivi de progression avec estimation du temps restant.\"\"\"\n", + "\n", + " def __init__(self, total: int, description: str = \"\"):\n", + " self.total = total\n", + " self.current = 0\n", + " self.description = description\n", + " self.start_time = time.time()\n", + " self.chunk_times = []\n", + "\n", + " def update(self, chunk_duration: float = None):\n", + " \"\"\"Met a jour la progression.\"\"\"\n", + " self.current += 1\n", + " if chunk_duration:\n", + " self.chunk_times.append(chunk_duration)\n", + " self._display()\n", + "\n", + " def _display(self):\n", + " \"\"\"Affiche la barre de progression.\"\"\"\n", + " elapsed = time.time() - self.start_time\n", + " percent = (self.current / self.total) * 100\n", + "\n", + " if self.chunk_times:\n", + " avg_time = np.mean(self.chunk_times)\n", + " remaining = avg_time * (self.total - self.current)\n", + " eta_str = self._format_time(remaining)\n", + " else:\n", + " eta_str = \"...\"\n", + "\n", + " bar_length = 30\n", + " filled = int(bar_length * self.current / self.total)\n", + " bar = \"█\" * filled + \"░\" * (bar_length - filled)\n", + " elapsed_str = self._format_time(elapsed)\n", + "\n", + " print(\n", + " f\"\\r{self.description} [{bar}] {self.current}/{self.total} \"\n", + " f\"({percent:.1f}%) | Temps: {elapsed_str} | ETA: {eta_str}\",\n", + " end=\"\",\n", + " )\n", + "\n", + " if self.current >= self.total:\n", + " print()\n", + "\n", + " @staticmethod\n", + " def _format_time(seconds: float) -> str:\n", + " \"\"\"Formate les secondes en HH:MM:SS.\"\"\"\n", + " hours = int(seconds // 3600)\n", + " minutes = int((seconds % 3600) // 60)\n", + " secs = int(seconds % 60)\n", + " if hours > 0:\n", + " return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n", + " return f\"{minutes:02d}:{secs:02d}\"\n", + "\n", + "\n", + "# ==============================================================================\n", + 
"# TTS ENGINE\n", + "# ==============================================================================\n", + "\n", + "_tts_model = None\n", + "_voices_cache = {}\n", + "os.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n", + "\n", + "\n", + "def get_model():\n", + " \"\"\"Charge le modele XTTS v2 avec cache.\"\"\"\n", + " global _tts_model\n", + "\n", + " if _tts_model is None:\n", + " print(\"🔄 Chargement du modele XTTS v2...\")\n", + "\n", + " from TTS.api import TTS\n", + "\n", + " _tts_model = TTS(Config.MODEL_NAME)\n", + "\n", + " # Deplacement sur GPU (selon la version, .to() peut etre sur le wrapper ou sur le sous-modele)\n", + " if _device is not None and _device_name.startswith(\"cuda\"):\n", + " try:\n", + " if hasattr(_tts_model, \"to\"):\n", + " _tts_model = _tts_model.to(_device)\n", + " elif hasattr(_tts_model, \"tts_model\") and hasattr(\n", + " _tts_model.tts_model, \"to\"\n", + " ):\n", + " _tts_model.tts_model = _tts_model.tts_model.to(_device)\n", + " except Exception as e:\n", + " print(f\"⚠️ Impossible de deplacer le modele sur CUDA: {e}\")\n", + "\n", + " print(\"✓ Modele charge\")\n", + "\n", + " return _tts_model\n", + "\n", + "\n", + "def get_voice_path(voice: str) -> str:\n", + " \"\"\"Obtient le chemin vers un fichier de voix.\"\"\"\n", + " global _voices_cache\n", + " import urllib.request\n", + "\n", + " if voice in _voices_cache:\n", + " return _voices_cache[voice]\n", + "\n", + " if os.path.isfile(voice):\n", + " _voices_cache[voice] = voice\n", + " return voice\n", + "\n", + " if voice in Config.PRESET_VOICES:\n", + " url = Config.PRESET_VOICES[voice]\n", + " path = f\"/tmp/{voice}.wav\"\n", + " if not os.path.exists(path):\n", + " print(f\"📥 Telechargement de la voix '{voice}'...\")\n", + " urllib.request.urlretrieve(url, path)\n", + " _voices_cache[voice] = path\n", + " return path\n", + "\n", + " raise FileNotFoundError(f\"Voix '{voice}' non trouvee\")\n", + "\n", + "\n", + "# ==============================================================================\n", + "# MAIN SYNTHESIS FUNCTIONS\n", + "# ==============================================================================\n", + "\n", + "\n", + "def _filter_kwargs(fn, kwargs: dict) -> dict:\n", + " \"\"\"Garde uniquement les kwargs acceptes par fn (compatibilite entre versions).\"\"\"\n", + " try:\n", + " sig = inspect.signature(fn)\n", + " return {k: v for k, v in kwargs.items() if k in sig.parameters}\n", + " except (TypeError, ValueError):\n", + " # Signature indisponible (ex: fonction C++) -> on ne filtre pas\n", + " return kwargs\n", + "\n", + "\n", + "def _get_conditioning_latents_compat(xtts_model, voice_path: str):\n", + " \"\"\"Compat: get_conditioning_latents() a change de signature selon les versions.\"\"\"\n", + " fn = getattr(xtts_model, \"get_conditioning_latents\", None)\n", + " if fn is None:\n", + " raise AttributeError(\n", + " \"Le modele XTTS ne fournit pas get_conditioning_latents().\"\n", + " )\n", + "\n", + " base_kwargs = {\"gpt_cond_len\": 30, \"max_ref_length\": 60}\n", + "\n", + " # Tentative par introspection\n", + " try:\n", + " sig = inspect.signature(fn)\n", + " params = sig.parameters\n", + "\n", + " if \"audio_path\" in params:\n", + " # Certaines versions veulent une liste, d'autres une str\n", + " try:\n", + " return fn(audio_path=[voice_path], **_filter_kwargs(fn, base_kwargs))\n", + " except TypeError:\n", + " return fn(audio_path=voice_path, **_filter_kwargs(fn, base_kwargs))\n", + "\n", + " if \"audio_paths\" in params:\n", + " return fn(audio_paths=[voice_path], 
**_filter_kwargs(fn, base_kwargs))\n", + "\n", + " if \"speaker_wav\" in params:\n", + " return fn(speaker_wav=voice_path, **_filter_kwargs(fn, base_kwargs))\n", + "\n", + " except (TypeError, ValueError):\n", + " pass\n", + "\n", + " # Fallback brut (plus permissif)\n", + " try:\n", + " return fn(audio_path=[voice_path], gpt_cond_len=30, max_ref_length=60)\n", + " except Exception:\n", + " try:\n", + " return fn(audio_path=voice_path, gpt_cond_len=30, max_ref_length=60)\n", + " except Exception:\n", + " return fn(voice_path)\n", + "\n", + "\n", + "def synthesize_chunk(\n", + " text: str, voice_path: str, language: str = \"fr\", enable_text_splitting: bool = True\n", + ") -> np.ndarray:\n", + " \"\"\"Synthetise un chunk de texte en audio via l'inference directe.\"\"\"\n", + " model_wrapper = get_model()\n", + "\n", + " # Acceder au modele XTTS directement (bypass SpeakerManager bug)\n", + " if hasattr(model_wrapper, \"synthesizer\"):\n", + " xtts_model = model_wrapper.synthesizer.tts_model\n", + " else:\n", + " xtts_model = model_wrapper.tts_model\n", + "\n", + " # Calculer les latents de conditionnement (compat multi-versions)\n", + " try:\n", + " gpt_cond_latent, speaker_embedding = _get_conditioning_latents_compat(\n", + " xtts_model, voice_path\n", + " )\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur calcul latents: {e}\")\n", + " raise e\n", + "\n", + " # Inference directe (filtrage des kwargs selon la signature)\n", + " try:\n", + " inference_kwargs = {\n", + " \"text\": text,\n", + " \"language\": language,\n", + " \"gpt_cond_latent\": gpt_cond_latent,\n", + " \"speaker_embedding\": speaker_embedding,\n", + " \"temperature\": 0.7,\n", + " \"length_penalty\": 1.0,\n", + " \"repetition_penalty\": 2.0,\n", + " \"top_k\": 50,\n", + " \"top_p\": 0.8,\n", + " \"enable_text_splitting\": enable_text_splitting,\n", + " }\n", + "\n", + " # Alias possibles selon versions\n", + " try:\n", + " sig = inspect.signature(xtts_model.inference)\n", + " params = sig.parameters\n", + " if \"speaker_embedding\" not in params and \"speaker_latents\" in params:\n", + " inference_kwargs[\"speaker_latents\"] = inference_kwargs.pop(\n", + " \"speaker_embedding\"\n", + " )\n", + " except (TypeError, ValueError):\n", + " pass\n", + "\n", + " out = xtts_model.inference(\n", + " **_filter_kwargs(xtts_model.inference, inference_kwargs)\n", + " )\n", + "\n", + " if isinstance(out, dict) and \"wav\" in out:\n", + " wav = out[\"wav\"]\n", + " else:\n", + " wav = out\n", + "\n", + " if hasattr(wav, \"cpu\"):\n", + " wav = wav.cpu().numpy()\n", + " if isinstance(wav, list):\n", + " wav = np.array(wav, dtype=np.float32)\n", + "\n", + " return wav\n", + "\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de l'inference directe : {e}\")\n", + " raise e\n", + "\n", + "\n", + "def text_to_speech_long(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " max_chars_per_chunk: int = None,\n", + " show_progress: bool = True,\n", + " enable_text_splitting: bool = True,\n", + ") -> dict:\n", + " \"\"\"Genere un fichier audio long (> 1 heure) a partir de texte.\"\"\"\n", + " import torch\n", + "\n", + " max_chars = max_chars_per_chunk or Config.MAX_CHARS_PER_CHUNK\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Estimation\n", + " estimated_duration = TextSplitter.estimate_audio_duration(text)\n", + " 
print(f\"\\n📝 Texte: {len(text):,} caracteres\")\n", + " print(f\"⏱️ Duree estimee: {ProgressTracker._format_time(estimated_duration)}\")\n", + "\n", + " # Decoupage\n", + " chunks = TextSplitter.split_for_long_audio(text, max_chars=max_chars)\n", + " print(f\"📦 Chunks: {len(chunks)}\")\n", + "\n", + " # Synthese\n", + " progress = ProgressTracker(len(chunks), \"🎙️ Synthese\") if show_progress else None\n", + " audio_chunks = []\n", + "\n", + " for i, chunk in enumerate(chunks):\n", + " chunk_start = time.time()\n", + " try:\n", + " wav = synthesize_chunk(\n", + " text=chunk,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + " audio_chunks.append(wav)\n", + " except Exception as e:\n", + " print(f\"\\n⚠️ Erreur chunk {i + 1}: {e}\")\n", + " continue\n", + "\n", + " # Nettoyage memoire periodique\n", + " if _device_name.startswith(\"cuda\") and (i + 1) % 10 == 0:\n", + " torch.cuda.empty_cache()\n", + "\n", + " if progress:\n", + " progress.update(time.time() - chunk_start)\n", + "\n", + " if not audio_chunks:\n", + " raise RuntimeError(\"Aucun audio genere\")\n", + "\n", + " # Concatenation\n", + " print(\"\\n🔗 Concatenation des chunks...\")\n", + " final_audio = AudioProcessor.concatenate_chunks(\n", + " audio_chunks, Config.SAMPLE_RATE, Config.CROSSFADE_DURATION\n", + " )\n", + "\n", + " # Nettoyage memoire\n", + " del audio_chunks\n", + " gc.collect()\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " print(\"✨ Post-traitement...\")\n", + " final_audio = AudioProcessor.enhance(\n", + " final_audio, Config.SAMPLE_RATE, normalize=True, warmth=True\n", + " )\n", + " else:\n", + " final_audio = AudioProcessor.normalize(final_audio)\n", + "\n", + " # Conversion en int16\n", + " final_audio = (final_audio * 32767).astype(np.int16)\n", + "\n", + " # Chemin de sortie\n", + " if output_path is None:\n", + " h = hashlib.md5(text[:100].encode()).hexdigest()[:8]\n", + " output_path = f\"tts_long_{voice}_{h}.wav\"\n", + "\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarde WAV\n", + " print(f\"💾 Sauvegarde: {final_path}\")\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(final_audio.tobytes())\n", + "\n", + " duration = len(final_audio) / Config.SAMPLE_RATE\n", + "\n", + " print(f\"\\n✅ Audio genere avec succes!\")\n", + " print(f\" 📁 Fichier: {final_path}\")\n", + " print(f\" ⏱️ Duree: {ProgressTracker._format_time(duration)}\")\n", + " print(f\" 📦 Chunks: {len(chunks)}\")\n", + " print(f\" 🎤 Voix: {voice}\")\n", + "\n", + " return {\n", + " \"path\": str(final_path),\n", + " \"sample_rate\": Config.SAMPLE_RATE,\n", + " \"duration_seconds\": duration,\n", + " \"duration_formatted\": ProgressTracker._format_time(duration),\n", + " \"audio_data\": final_audio,\n", + " \"voice\": voice,\n", + " \"language\": language,\n", + " \"device\": _device_name,\n", + " \"chunks_count\": len(chunks),\n", + " \"text_length\": len(text),\n", + " }\n", + "\n", + "\n", + "def text_to_speech(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " 
output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " enable_text_splitting: bool = True,\n", + ") -> dict:\n", + " \"\"\"Genere un fichier audio a partir de texte avec XTTS v2.\"\"\"\n", + " # Rediriger vers version longue si necessaire\n", + " if len(text) > 10000:\n", + " print(\"📢 Texte long detecte - utilisation de text_to_speech_long()\")\n", + " return text_to_speech_long(\n", + " text=text,\n", + " voice=voice,\n", + " language=language,\n", + " output_path=output_path,\n", + " enhance=enhance,\n", + " use_gdrive=use_gdrive,\n", + " gdrive_folder=gdrive_folder,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + "\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Synthese\n", + " wav = synthesize_chunk(\n", + " text=text,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " audio = AudioProcessor.enhance(wav, Config.SAMPLE_RATE)\n", + " else:\n", + " audio = AudioProcessor.normalize(wav)\n", + "\n", + " audio = (audio * 32767).astype(np.int16)\n", + "\n", + " # Chemin de sortie\n", + " if output_path is None:\n", + " h = hashlib.md5(text.encode()).hexdigest()[:8]\n", + " output_path = f\"tts_{voice}_{h}.wav\"\n", + "\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarde WAV\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(audio.tobytes())\n", + "\n", + " duration = len(audio) / Config.SAMPLE_RATE\n", + " print(f\"✓ Audio genere: {final_path}\")\n", + " print(f\" Duree: {duration:.2f}s | Voix: {voice}\")\n", + "\n", + " return {\n", + " \"path\": str(final_path),\n", + " \"sample_rate\": Config.SAMPLE_RATE,\n", + " \"duration_seconds\": duration,\n", + " \"audio_data\": audio,\n", + " \"voice\": voice,\n", + " \"language\": language,\n", + " \"device\": _device_name,\n", + " }\n", + "\n", + "\n", + "# ==============================================================================\n", + "# UTILITIES\n", + "# ==============================================================================\n", + "\n", + "\n", + "def preview_audio(result: dict) -> None:\n", + " \"\"\"Previsualise l'audio dans le notebook.\"\"\"\n", + " from IPython.display import Audio, display\n", + "\n", + " audio = result[\"audio_data\"]\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " display(Audio(audio, rate=result[\"sample_rate\"]))\n", + "\n", + "\n", + "def list_voices() -> list:\n", + " \"\"\"Liste les voix disponibles.\"\"\"\n", + " return list(Config.PRESET_VOICES.keys())\n", + "\n", + "\n", + "def list_languages() -> list:\n", + " \"\"\"Liste les langues supportees.\"\"\"\n", + " return [\n", + " \"en\",\n", + " \"es\",\n", + " \"fr\",\n", + " \"de\",\n", + " \"it\",\n", + " \"pt\",\n", + " \"pl\",\n", + " \"tr\",\n", + " \"ru\",\n", + " \"nl\",\n", + " \"cs\",\n", + " \"ar\",\n", + " \"zh-cn\",\n", + " \"ja\",\n", + " \"hu\",\n", + " \"ko\",\n", + " \"hi\",\n", + " ]\n", + "\n", + "\n", + "def clear_cache():\n", + " \"\"\"Vide le cache du modele.\"\"\"\n", 
+ " global _tts_model\n", + " import torch\n", + "\n", + " _tts_model = None\n", + " gc.collect()\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + " print(\"✓ Cache vide\")\n", + "\n", + "\n", + "def estimate_duration(text: str) -> dict:\n", + " \"\"\"Estime la duree audio pour un texte.\"\"\"\n", + " duration = TextSplitter.estimate_audio_duration(text)\n", + " chunks = len(TextSplitter.split_for_long_audio(text))\n", + " return {\n", + " \"chars\": len(text),\n", + " \"estimated_seconds\": duration,\n", + " \"estimated_formatted\": ProgressTracker._format_time(duration),\n", + " \"chunks_estimate\": chunks,\n", + " }\n", + "\n", + "\n", + "# Aliases\n", + "tts = text_to_speech\n", + "tts_long = text_to_speech_long\n", + "\n", + "# ==============================================================================\n", + "# INITIALIZATION\n", + "# ==============================================================================\n", + "\n", + "\n", + "def init():\n", + " \"\"\"Initialise le module.\"\"\"\n", + " detect_device()\n", + " print(\"✅ Module XTTS v2 Long Audio v4 charge\")\n", + " print(f\" Device: {_device_name}\")\n", + " print(f\" Voix disponibles: {list_voices()}\")\n", + " print(f\" enable_text_splitting: active par defaut\")\n", + " print(f\" Fix torchcodec: actif\")\n", + "\n", + "\n", + "# Auto-init\n", + "try:\n", + " detect_device()\n", + "except:\n", + " pass\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"TTS XTTS v2 - Long Audio Generator v4\")\n", + "print(\"Compatible PyTorch 2.9+ (fix torchcodec)\")\n", + "print(\"=\" * 60)\n", + "print(f\"Voix disponibles: {list_voices()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mount_drive", + "outputId": "483dc2dd-86ae-456f-86b5-e6f92e96f257" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "# Monter Google Drive (optionnel)\n", + "# Le notebook peut aussi s'executer hors Colab : dans ce cas on ignore simplement le montage.\n", + "\n", + "try:\n", + " from google.colab import drive # type: ignore\n", + "\n", + " drive.mount(\"/content/drive\")\n", + "except Exception as e:\n", + " print(\"ℹ️ Google Colab non detecte ou Drive indisponible -> montage ignore.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "init_module", + "outputId": "33a7cc61-9054-4601-f338-aaa653bef831" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "⚙️ Device: cuda (Tesla T4)\n", + "✅ Module XTTS v2 Long Audio v4 charge\n", + " Device: cuda (Tesla T4)\n", + " Voix disponibles: ['female_fr', 'male_fr']\n", + " enable_text_splitting: active par defaut\n", + " Fix torchcodec: actif\n" + ] + } + ], + "source": [ + "# Initialisation du module\n", + "init()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "id": "example_short", + "outputId": "577e3721-4fce-40f1-979b-79c5aef7cf2e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📥 Telechargement de la voix 'female_fr'...\n", + "🔄 Chargement du modele XTTS v2...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1.87G/1.87G [00:31<00:00, 59.9MiB/s]\n", + 
"4.37kiB [00:00, 7.24MiB/s]\n", + "361kiB [00:00, 107MiB/s]\n", + "100%|██████████| 32.0/32.0 [00:00<00:00, 91.7kiB/s]\n", + "100%|██████████| 7.75M/7.75M [00:00<00:00, 16.2MiB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Modele charge\n", + "✓ Audio genere: tts_female_fr_3f623356.wav\n", + " Duree: 9.28s | Voix: female_fr\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# ==============================================================================\n", + "# EXEMPLE 1: Texte court\n", + "# ==============================================================================\n", + "\n", + "text_court = \"\"\"\n", + "Bonjour! Ceci est un test de synthese vocale avec XTTS v2.\n", + "Le module est maintenant compatible avec PyTorch 2.9 et superieur.\n", + "\"\"\"\n", + "\n", + "# Choisir la voix\n", + "voice_gender = \"female_fr\" # ou \"male_fr\"\n", + "\n", + "# Generation\n", + "result = text_to_speech(text_court.strip(), voice=voice_gender, enhance=True)\n", + "\n", + "# Previsualisation\n", + "preview_audio(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VKh0LumVWdsK" + }, + "outputs": [], + "source": [ + "# 1. Importation de la bibliothèque principale\n", + "import ipywidgets as widgets\n", + "\n", + "# 2. Importation de la fonction d'affichage (optionnel mais recommandé pour la clarté)\n", + "from IPython.display import display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "umnfhCzn2WGG" + }, + "outputs": [], + "source": [ + "# ==========================================\n", + "# 3. INTERFACE UTILISATEUR\n", + "# ==========================================\n", + "\n", + "text_input = widgets.Textarea(\n", + " placeholder=\"Entrez votre texte ici...\",\n", + " value=\"texte\",\n", + " layout=widgets.Layout(width=\"100%\", height=\"150px\"),\n", + ")\n", + "button = widgets.Button(\n", + " description=\"Générer Audio\", button_style=\"success\", icon=\"check\"\n", + ")\n", + "progress_bar = widgets.IntProgress(\n", + " value=0, min=0, max=100, layout=widgets.Layout(width=\"100%\", visibility=\"hidden\")\n", + ")\n", + "output = widgets.Output()\n", + "\n", + "\n", + "def on_click(b):\n", + " with output:\n", + " clear_output()\n", + " if not text_input.value.strip():\n", + " print(\"❌ Le texte est vide.\")\n", + " return\n", + "\n", + " button.disabled = True\n", + " progress_bar.layout.visibility = \"visible\"\n", + " progress_bar.bar_style = \"info\"\n", + " progress_bar.value = 10\n", + "\n", + " try:\n", + " # 1. Drive\n", + " if not os.path.exists(\"/content/drive\"):\n", + " drive.mount(\"/content/drive\")\n", + " if not os.path.exists(DRIVE_FOLDER):\n", + " os.makedirs(DRIVE_FOLDER)\n", + " progress_bar.value = 20\n", + "\n", + " # 2. Gemini\n", + " print(\"🧠 Optimisation du texte (Gemini)...\")\n", + " res_ia = traiter_via_gemini_pour_elevenlabs(text_input.value)\n", + " titre = res_ia.get(\"titre\", \"Audio_Output\")\n", + " texte_final = res_ia.get(\"texte_optimise\", text_input.value)\n", + " progress_bar.value = 50\n", + "\n", + " # 3. 
ElevenLabs\n", + " print(f\"🎙️ Génération avec Voix ID: {VOICE_ID} ({MODEL_ID})...\")\n", + " nom_fichier = f\"{assainir_nom_fichier(titre)}.mp3\"\n", + " chemin_complet = os.path.join(DRIVE_FOLDER, nom_fichier)\n", + "\n", + " if generer_audio_elevenlabs(texte_final, chemin_complet):\n", + " progress_bar.value = 100\n", + " progress_bar.bar_style = \"success\"\n", + " print(f\"✅ Fichier sauvegardé : {chemin_complet}\")\n", + " display(Audio(chemin_complet))\n", + " else:\n", + " raise Exception(\"Erreur API ElevenLabs\")\n", + "\n", + " except Exception as e:\n", + " progress_bar.bar_style = \"danger\"\n", + " print(f\"❌ Erreur : {e}\")\n", + " finally:\n", + " button.disabled = False\n", + "\n", + "\n", + "button.on_click(on_click)\n", + "\n", + "# Affichage avec rappel de la config chargée\n", + "display(\n", + " widgets.VBox(\n", + " [\n", + " widgets.HTML(f\"
Générateur TTS - Config : {MODEL_ID}
\"),\n", + " widgets.Label(f\"Voix chargée : {VOICE_ID} (Guillaume)\"),\n", + " text_input,\n", + " button,\n", + " progress_bar,\n", + " output,\n", + " ]\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 96 + }, + "id": "custom_text", + "outputId": "2a865273-027a-4958-9da2-7985d9a507d6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Audio genere: tts_female_fr_a37ea6fe.wav\n", + " Duree: 1.34s | Voix: female_fr\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# ==============================================================================\n", + "# VOTRE TEXTE PERSONNALISE\n", + "# ==============================================================================\n", + "\n", + "# Entrez votre texte ici\n", + "text_to_synthesize = \"\"\"\n", + "Votre texte ici...\n", + "\"\"\"\n", + "\n", + "# Configuration\n", + "voice_gender = \"female_fr\" # \"female_fr\" ou \"male_fr\"\n", + "save_to_drive = False # True pour sauvegarder sur Google Drive\n", + "\n", + "# Generation\n", + "result = text_to_speech(\n", + " text_to_synthesize.strip(),\n", + " voice=voice_gender,\n", + " enhance=True,\n", + " use_gdrive=save_to_drive,\n", + ")\n", + "\n", + "# Previsualisation\n", + "preview_audio(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cleanup", + "outputId": "6a7d6774-2254-4715-a5be-ea378f6a3eee" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Cache vide\n" + ] + } + ], + "source": [ + "# Nettoyer le cache si necessaire (libere la memoire GPU)\n", + "clear_cache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Y9x2A2JKO4rd" + }, + "outputs": [], + "source": [ + "stop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 130 + }, + "id": "generate_long", + "outputId": "5f0391f1-1deb-448f-c71c-528ee69050aa" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔄 Chargement du modele XTTS v2...\n", + "✓ Modele charge\n", + "✓ Audio genere: /content/drive/MyDrive/TTS_Output/tts_female_fr_28bb1d44.wav\n", + " Duree: 54.82s | Voix: female_fr\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Generation du texte long\n", + "result_long = text_to_speech(\n", + " text_long.strip(),\n", + " voice=\"female_fr\",\n", + " enhance=True,\n", + " use_gdrive=True, # Mettre True pour sauvegarder sur Drive\n", + ")\n", + "\n", + "# Previsualisation\n", + "preview_audio(result_long)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "example_long", + "outputId": "1c25049d-6271-4864-d733-ef701a7b9a28" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📊 Estimation:\n", + " Caracteres: 956\n", + " Duree estimee: 01:03\n", + " Chunks estimes: 2\n" + ] + } + ], + "source": [ + "# ==============================================================================\n", + "# EXEMPLE 2: 
Texte long\n", + "# ==============================================================================\n", + "\n", + "text_long = \"\"\"\n", + "La synthese vocale, egalement appelee text-to-speech ou TTS, est une technologie\n", + "qui permet de convertir du texte ecrit en parole audible. Cette technologie a\n", + "considerablement evolue au fil des annees, passant de voix robotiques et\n", + "mecaniques a des voix naturelles et expressives.\n", + "\n", + "XTTS v2 est l'un des modeles les plus avances dans ce domaine. Developpe par\n", + "Coqui AI, il utilise des techniques d'apprentissage profond pour generer une\n", + "parole de haute qualite dans plusieurs langues. Le modele peut meme cloner\n", + "des voix a partir d'un court echantillon audio de reference.\n", + "\n", + "Les applications de la synthese vocale sont nombreuses: assistants virtuels,\n", + "livres audio, accessibilite pour les personnes malvoyantes, doublage video,\n", + "et bien d'autres encore. Avec les avancees recentes en intelligence artificielle,\n", + "la qualite de la synthese vocale continue de s'ameliorer, rendant la distinction\n", + "entre voix humaine et voix synthetique de plus en plus difficile.\n", + "\"\"\"\n", + "\n", + "# Estimation avant generation\n", + "estimation = estimate_duration(text_long)\n", + "print(f\"📊 Estimation:\")\n", + "print(f\" Caracteres: {estimation['chars']:,}\")\n", + "print(f\" Duree estimee: {estimation['estimated_formatted']}\")\n", + "print(f\" Chunks estimes: {estimation['chunks_estimate']}\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/long_TTS_xtts_v3.ipynb b/long_TTS_xtts_v3.ipynb new file mode 100644 index 000000000000..2509b7d90991 --- /dev/null +++ b/long_TTS_xtts_v3.ipynb @@ -0,0 +1,1203 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s8NfbT3sw2-z" + }, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "XYDOUW523oJP" + }, + "outputs": [], + "source": [ + "PROMPT = \"Ce document présente les Manifold-Constrained Hyper-Connections (mHC), une architecture novatrice conçue par DeepSeek-AI pour stabiliser l'entraînement des grands modèles de langage. Bien que les Hyper-Connections (HC) classiques améliorent les performances en élargissant le flux résiduel, leur nature non contrainte provoque souvent une instabilité numérique et des problèmes de divergence du signal. Pour remédier à cela, les auteurs utilisent l'algorithme de Sinkhorn-Knopp afin de projeter les connexions sur une variété de matrices doublement stochastiques, préservant ainsi la propriété de mappage d'identité. Cette approche garantit une propagation saine du signal tout en optimisant l'efficacité matérielle grâce à la fusion de noyaux et à des stratégies de mémorisation sélective. Les résultats expérimentaux démontrent que mHC surpasse les méthodes existantes en termes de scalabilité et de capacités de raisonnement sur divers tests de référence. 
En intégrant ces contraintes géométriques rigoureuses, le cadre mHC offre une solution robuste pour l'évolution des architectures neuronales à grande échelle.\"\n", + "\n", + "voice_gender = \"female_fr\"\n", + "# ['female_fr', 'male_fr']" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jIKtDA5hweJP", + "outputId": "8b9cbf18-4496-4c26-d2c8-50b97a3710d2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m862.8/862.8 kB\u001b[0m \u001b[31m57.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m345.1/345.1 kB\u001b[0m \u001b[31m25.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.2/56.2 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m997.3/997.3 kB\u001b[0m \u001b[31m62.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m648.4/648.4 kB\u001b[0m \u001b[31m55.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.5/163.5 kB\u001b[0m \u001b[31m18.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for docopt (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "# Installation des dépendances\n", + "!pip install -q scipy noisereduce\n", + "\n", + "# Installation du fork maintenu (supporte Python 3.12+)\n", + "!pip install -q coqui-tts\n", + "\n", + "\n", + "# Installation des dépendances\n", + "!pip install -q scipy noisereduce\n", + "!pip install -q numpy==2.0.2\n", + "\n", + "# Installation de soundfile pour le chargement audio (évite le bug torchcodec)\n", + "!pip install -q soundfile\n", + "\n", + "# Installation du fork maintenu (supporte Python 3.12+)\n", + "!pip install -q coqui-tts\n", + "\n", + "# Note: torchcodec n'est plus nécessaire - on utilise soundfile comme backend" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1FQhKQ1IE4iX", + "outputId": "68446e3c-ac8f-474b-a329-dfa6b8d6aec9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📦 Installation de FFmpeg...\n", + "✓ FFmpeg installé\n", + "⚙️ Device: cuda (Tesla T4\n", + "✅ Module XTTS v2 Long Audio chargé\n", + " Device: cuda (Tesla T4\n", + " Voix: ['female_fr', 'male_fr']\n", + " enable_text_splitting: activé par défaut\n", + "💡 torchaudio backend is expected to use 'soundfile' as torchcodec is no longer installed.\n", + "\n", + "============================================================\n", + "EXEMPLE 1: Texte court\n" + ] + } + ], + "source": [ + "import os\n", + "import re\n", + "import gc\n", + "import wave\n", + "import time\n", + "import hashlib\n", + "import warnings\n", + "from pathlib import Path\n", + "from typing import Optional, Union, List, Callable\n", + "from dataclasses import dataclass\n", + "from enum import Enum\n", + "\n", + "import numpy as np\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", + "\n", + "# ==============================================================================\n", + "# INSTALLATION (Colab)\n", + "# ==============================================================================\n", + "\n", + "\n", + "def install_dependencies():\n", + " \"\"\"Installe les dépendances si nécessaire (Colab).\"\"\"\n", + " import subprocess\n", + " import sys\n", + "\n", + " # Installer FFmpeg pour torchcodec\n", + " try:\n", + " print(\"📦 Installation de FFmpeg...\")\n", + " subprocess.check_call([\"apt-get\", \"update\", \"-qq\"])\n", + " subprocess.check_call([\"apt-get\", \"install\", \"-qq\", \"ffmpeg\"])\n", + " print(\"✓ FFmpeg installé\")\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de l'installation de FFmpeg: {e}\")\n", + "\n", + " packages = [\n", + " (\"scipy\", \"scipy\"),\n", + " (\"noisereduce\", \"noisereduce\"),\n", + " (\"TTS\", \"coqui-tts\"),\n", + " ]\n", + "\n", + " for module, package in packages:\n", + " try:\n", + " __import__(module)\n", + " except ImportError:\n", + " print(f\"📦 Installation de {package}...\")\n", + " subprocess.check_call(\n", + " [sys.executable, \"-m\", \"pip\", \"install\", \"-q\", package]\n", + " )\n", + "\n", + " # numpy compatible\n", + " # The previous attempt to install a specific numpy version was causing compatibility issues.\n", + " # Removing this line to allow torchcodec and other libraries to install a compatible numpy version.\n", + " # try:\n", + " # subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"-q\", \"numpy==2.0.2\"])\n", + " # except:\n", + " # pass\n", + "\n", + "\n", + "# 
==============================================================================\n", + "# CONFIGURATION\n", + "# ==============================================================================\n", + "\n", + "\n", + "@dataclass\n", + "class TTSConfig:\n", + " \"\"\"Configuration globale du module TTS.\"\"\"\n", + "\n", + " MODEL_NAME: str = \"tts_models/multilingual/multi-dataset/xtts_v2\"\n", + " SAMPLE_RATE: int = 24000\n", + " DEFAULT_LANGUAGE: str = \"fr\"\n", + " GDRIVE_FOLDER: str = \"/content/drive/MyDrive/TTS_Output\"\n", + "\n", + " # Configuration pour audio longs\n", + " MAX_CHARS_PER_CHUNK: int = 500 # Caractères max par chunk pour textes très longs\n", + " CROSSFADE_DURATION: float = 0.05 # Durée du crossfade en secondes\n", + " ENABLE_TEXT_SPLITTING: bool = True # Activer le split natif XTTS\n", + "\n", + " PRESET_VOICES: dict = None\n", + "\n", + " def __post_init__(self):\n", + " self.PRESET_VOICES = {\n", + " \"female_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/female.wav\",\n", + " \"male_fr\": \"https://huggingface.co/spaces/coqui/xtts/resolve/main/examples/male.wav\",\n", + " }\n", + "\n", + "\n", + "Config = TTSConfig()\n", + "\n", + "# ==============================================================================\n", + "# DEVICE MANAGEMENT\n", + "# ==============================================================================\n", + "\n", + "_device = None\n", + "_device_name = \"cpu\"\n", + "\n", + "\n", + "def detect_device():\n", + " \"\"\"Détecte le meilleur device disponible.\"\"\"\n", + " global _device, _device_name\n", + " import torch\n", + "\n", + " # Essayer TPU\n", + " try:\n", + " import torch_xla.core.xla_model as xm\n", + "\n", + " _device = xm.xla_device()\n", + " _device_name = \"tpu\"\n", + " print(f\"⚙️ Device: TPU\")\n", + " return\n", + " except:\n", + " pass\n", + "\n", + " # Essayer CUDA\n", + " if torch.cuda.is_available():\n", + " _device = torch.device(\"cuda\")\n", + " _device_name = f\"cuda ({torch.cuda.get_device_name(0)}\"\n", + " print(f\"⚙️ Device: {_device_name}\")\n", + " return\n", + "\n", + " # Fallback CPU\n", + " _device = torch.device(\"cpu\")\n", + " _device_name = \"cpu\"\n", + " print(f\"⚙️ Device: CPU\")\n", + "\n", + "\n", + "# ==============================================================================\n", + "# TEXT SPLITTING UTILITIES\n", + "# ==============================================================================\n", + "\n", + "\n", + "class TextSplitter:\n", + " \"\"\"\n", + " Utilitaire pour découper intelligemment les textes longs.\n", + " Préserve la cohérence des phrases et paragraphes.\n", + " \"\"\"\n", + "\n", + " @staticmethod\n", + " def estimate_audio_duration(text: str, chars_per_second: float = 15.0) -> float:\n", + " \"\"\"\n", + " Estime la durée audio pour un texte donné.\n", + " \"\"\"\n", + " return len(text) / chars_per_second\n", + "\n", + " @staticmethod\n", + " def split_into_sentences(text: str) -> List[str]:\n", + " \"\"\"Découpe le texte en phrases.\"\"\"\n", + " # Pattern pour fin de phrase\n", + " pattern = r\"(?<=[.!?])\\s+\"\n", + " sentences = re.split(pattern, text)\n", + " return [s.strip() for s in sentences if s.strip()]\n", + "\n", + " @staticmethod\n", + " def split_into_paragraphs(text: str) -> List[str]:\n", + " \"\"\"Découpe le texte en paragraphes.\"\"\"\n", + " paragraphs = re.split(r\"\\n\\s*\\n\", text)\n", + " return [p.strip() for p in paragraphs if p.strip()]\n", + "\n", + " @classmethod\n", + " def split_for_long_audio(\n", + " cls, text: str, 
max_chars: int = 500, preserve_sentences: bool = True\n", + " ) -> List[str]:\n", + " \"\"\"\n", + " Découpe un texte long en chunks optimaux pour la synthèse.\n", + " \"\"\"\n", + " # Si texte court, retourner tel quel\n", + " if len(text) <= max_chars:\n", + " return [text]\n", + "\n", + " chunks = []\n", + "\n", + " if preserve_sentences:\n", + " sentences = cls.split_into_sentences(text)\n", + " current_chunk = \"\"\n", + "\n", + " for sentence in sentences:\n", + " # Si la phrase seule dépasse max_chars, la découper\n", + " if len(sentence) > max_chars:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = \"\"\n", + " # Découper la phrase longue par mots\n", + " words = sentence.split()\n", + " sub_chunk = \"\"\n", + " for word in words:\n", + " if len(sub_chunk) + len(word) + 1 <= max_chars:\n", + " sub_chunk += \" \" + word if sub_chunk else word\n", + " else:\n", + " if sub_chunk:\n", + " chunks.append(sub_chunk.strip())\n", + " sub_chunk = word\n", + " if sub_chunk:\n", + " current_chunk = sub_chunk\n", + " elif len(current_chunk) + len(sentence) + 1 <= max_chars:\n", + " current_chunk += \" \" + sentence if current_chunk else sentence\n", + " else:\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " current_chunk = sentence\n", + "\n", + " if current_chunk:\n", + " chunks.append(current_chunk.strip())\n", + " else:\n", + " # Découpage simple par caractères\n", + " for i in range(0, len(text), max_chars):\n", + " chunks.append(text[i : i + max_chars])\n", + "\n", + " return chunks\n", + "\n", + "\n", + "# ==============================================================================\n", + "# AUDIO PROCESSING\n", + "# ==============================================================================\n", + "\n", + "\n", + "class AudioProcessor:\n", + " \"\"\"Processeur audio pour post-traitement et concaténation.\"\"\"\n", + "\n", + " @staticmethod\n", + " def normalize(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:\n", + " \"\"\"Normalise l'audio au niveau cible.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target_linear = 10 ** (target_db / 20)\n", + " audio = audio * (target_linear / peak)\n", + "\n", + " return np.clip(audio, -1.0, 1.0)\n", + "\n", + " @staticmethod\n", + " def crossfade(\n", + " audio1: np.ndarray, audio2: np.ndarray, sample_rate: int, duration: float = 0.05\n", + " ) -> np.ndarray:\n", + " \"\"\"\n", + " Concatène deux segments audio avec crossfade.\n", + " \"\"\"\n", + " # Convertir en float si nécessaire\n", + " if audio1.dtype == np.int16:\n", + " audio1 = audio1.astype(np.float32) / 32768.0\n", + " if audio2.dtype == np.int16:\n", + " audio2 = audio2.astype(np.float32) / 32768.0\n", + "\n", + " fade_samples = int(sample_rate * duration)\n", + "\n", + " # Si audio trop court pour crossfade, concaténer simplement\n", + " if len(audio1) < fade_samples or len(audio2) < fade_samples:\n", + " return np.concatenate([audio1, audio2])\n", + "\n", + " # Créer les courbes de fade\n", + " fade_out = np.linspace(1.0, 0.0, fade_samples)\n", + " fade_in = np.linspace(0.0, 1.0, fade_samples)\n", + "\n", + " # Appliquer le crossfade\n", + " audio1_end = audio1[-fade_samples:] * fade_out\n", + " audio2_start = audio2[:fade_samples] * fade_in\n", + "\n", + " # Assembler\n", + " result = np.concatenate(\n", + " [audio1[:-fade_samples], audio1_end + audio2_start, 
audio2[fade_samples:]]\n", + " )\n", + "\n", + " return result\n", + "\n", + " @classmethod\n", + " def concatenate_chunks(\n", + " cls,\n", + " audio_chunks: List[np.ndarray],\n", + " sample_rate: int,\n", + " crossfade_duration: float = 0.05,\n", + " ) -> np.ndarray:\n", + " \"\"\"\n", + " Concatène plusieurs chunks audio avec crossfade.\n", + " \"\"\"\n", + " if not audio_chunks:\n", + " return np.array([], dtype=np.float32)\n", + "\n", + " if len(audio_chunks) == 1:\n", + " audio = audio_chunks[0]\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + " return audio\n", + "\n", + " result = audio_chunks[0]\n", + " if result.dtype == np.int16:\n", + " result = result.astype(np.float32) / 32768.0\n", + "\n", + " for chunk in audio_chunks[1:]:\n", + " result = cls.crossfade(result, chunk, sample_rate, crossfade_duration)\n", + "\n", + " return result\n", + "\n", + " @staticmethod\n", + " def enhance(\n", + " audio: np.ndarray, sample_rate: int, normalize: bool = True, warmth: bool = True\n", + " ) -> np.ndarray:\n", + " \"\"\"Améliore la qualité audio.\"\"\"\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " if warmth:\n", + " try:\n", + " from scipy import signal\n", + "\n", + " nyquist = sample_rate / 2\n", + " cutoff = min(300, nyquist * 0.9) / nyquist\n", + " b, a = signal.butter(2, cutoff, btype=\"low\")\n", + " bass = signal.filtfilt(b, a, audio)\n", + " audio = audio + 0.15 * bass\n", + " except ImportError:\n", + " pass\n", + "\n", + " if normalize:\n", + " peak = np.max(np.abs(audio))\n", + " if peak > 0:\n", + " target = 10 ** (-3.0 / 20)\n", + " audio = audio * (target / peak)\n", + "\n", + " audio = np.clip(audio, -1.0, 1.0)\n", + " return audio\n", + "\n", + "\n", + "# ==============================================================================\n", + "# PROGRESS TRACKER\n", + "# ==============================================================================\n", + "\n", + "\n", + "class ProgressTracker:\n", + " \"\"\"Suivi de progression avec estimation du temps restant.\"\"\"\n", + "\n", + " def __init__(self, total: int, description: str = \"\"):\n", + " self.total = total\n", + " self.current = 0\n", + " self.description = description\n", + " self.start_time = time.time()\n", + " self.chunk_times = []\n", + "\n", + " def update(self, chunk_duration: float = None):\n", + " \"\"\"Met à jour la progression.\"\"\"\n", + " self.current += 1\n", + " if chunk_duration:\n", + " self.chunk_times.append(chunk_duration)\n", + " self._display()\n", + "\n", + " def _display(self):\n", + " \"\"\"Affiche la barre de progression.\"\"\"\n", + " elapsed = time.time() - self.start_time\n", + " percent = (self.current / self.total) * 100\n", + "\n", + " # Estimation temps restant\n", + " if self.chunk_times:\n", + " avg_time = np.mean(self.chunk_times)\n", + " remaining = avg_time * (self.total - self.current)\n", + " eta_str = self._format_time(remaining)\n", + " else:\n", + " eta_str = \"...\"\n", + "\n", + " # Barre de progression\n", + " bar_length = 30\n", + " filled = int(bar_length * self.current / self.total)\n", + " bar = \"█\" * filled + \"░\" * (bar_length - filled)\n", + "\n", + " elapsed_str = self._format_time(elapsed)\n", + "\n", + " print(\n", + " f\"\\r{self.description} [{bar}] {self.current}/{self.total} \"\n", + " f\"({percent:.1f}%) | Temps: {elapsed_str} | ETA: {eta_str}\",\n", + " end=\"\",\n", + " )\n", + "\n", + " if self.current >= self.total:\n", + " print() # Nouvelle ligne à 
la fin\n", + "\n", + " @staticmethod\n", + " def _format_time(seconds: float) -> str:\n", + " \"\"\"Formate un temps en secondes en HH:MM:SS.\"\"\"\n", + " hours = int(seconds // 3600)\n", + " minutes = int((seconds % 3600) // 60)\n", + " secs = int(seconds % 60)\n", + "\n", + " if hours > 0:\n", + " return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n", + " return f\"{minutes:02d}:{secs:02d}\"\n", + "\n", + "\n", + "# ==============================================================================\n", + "# TTS ENGINE\n", + "# ==============================================================================\n", + "\n", + "_tts_model = None\n", + "_voices_cache = {}\n", + "os.environ[\"COQUI_TOS_AGREED\"] = \"1\"\n", + "\n", + "\n", + "def get_model():\n", + " \"\"\"Charge le modèle XTTS v2 avec cache.\"\"\"\n", + " global _tts_model\n", + "\n", + " if _tts_model is None:\n", + " print(\"🔄 Chargement du modèle XTTS v2...\")\n", + " from TTS.api import TTS\n", + "\n", + " _tts_model = TTS(Config.MODEL_NAME)\n", + "\n", + " if _device is not None and _device_name.startswith(\"cuda\"):\n", + " _tts_model = _tts_model.to(_device)\n", + "\n", + " print(\"✓ Modèle chargé\")\n", + "\n", + " return _tts_model\n", + "\n", + "\n", + "def get_voice_path(voice: str) -> str:\n", + " \"\"\"Obtient le chemin vers un fichier de voix.\"\"\"\n", + " global _voices_cache\n", + " import urllib.request\n", + "\n", + " if voice in _voices_cache:\n", + " return _voices_cache[voice]\n", + "\n", + " if os.path.isfile(voice):\n", + " _voices_cache[voice] = voice\n", + " return voice\n", + "\n", + " if voice in Config.PRESET_VOICES:\n", + " url = Config.PRESET_VOICES[voice]\n", + " path = f\"/tmp/{voice}.wav\"\n", + "\n", + " if not os.path.exists(path):\n", + " print(f\"📥 Téléchargement de la voix '{voice}'...\")\n", + " urllib.request.urlretrieve(url, path)\n", + "\n", + " _voices_cache[voice] = path\n", + " return path\n", + "\n", + " raise FileNotFoundError(f\"Voix '{voice}' non trouvée\")\n", + "\n", + "\n", + "# ==============================================================================\n", + "# MAIN SYNTHESIS FUNCTIONS\n", + "# ==============================================================================\n", + "\n", + "\n", + "def synthesize_chunk(\n", + " text: str, voice_path: str, language: str = \"fr\", enable_text_splitting: bool = True\n", + ") -> np.ndarray:\n", + " \"\"\"\n", + " Synthétise un chunk de texte en audio via l'inférence directe (Low-Level).\n", + " Bypass total du SpeakerManager pour éviter le bug FileNotFoundError .pth\n", + " \"\"\"\n", + " model_wrapper = get_model()\n", + "\n", + " # 1. Accès \"chirurgical\" au modèle interne XTTS\n", + " # C'est lui qui fait le travail, sans la couche de gestion de fichiers buggée\n", + " if hasattr(model_wrapper, \"synthesizer\"):\n", + " xtts_model = model_wrapper.synthesizer.tts_model\n", + " else:\n", + " # Cas rare ou structure différente, on tente l'accès direct\n", + " xtts_model = model_wrapper.tts_model\n", + "\n", + " # 2. Calcul manuel des latents (Empreinte vocale)\n", + " # On transforme le fichier WAV en vecteurs mathématiques\n", + " try:\n", + " gpt_cond_latent, speaker_embedding = xtts_model.get_conditioning_latents(\n", + " audio_path=[voice_path], gpt_cond_len=30, max_ref_length=60\n", + " )\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur calcul latents: {e}\")\n", + " raise e\n", + "\n", + " # 3. 
Inférence directe\n", + " # On appelle la fonction de génération pure, sans passer par tts()\n", + " try:\n", + " out = xtts_model.inference(\n", + " text=text,\n", + " language=language,\n", + " gpt_cond_latent=gpt_cond_latent,\n", + " speaker_embedding=speaker_embedding,\n", + " temperature=0.7, # Paramètre standard pour la créativité\n", + " length_penalty=1.0, # Pénalité de longueur\n", + " repetition_penalty=2.0, # Évite les bégaiements\n", + " top_k=50,\n", + " top_p=0.8,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + "\n", + " # Le résultat est généralement dans un dictionnaire sous la clé 'wav'\n", + " if isinstance(out, dict) and \"wav\" in out:\n", + " wav = out[\"wav\"]\n", + " else:\n", + " wav = out\n", + "\n", + " # S'assurer que c'est bien un numpy array sur CPU\n", + " if hasattr(wav, \"cpu\"):\n", + " wav = wav.cpu().numpy()\n", + " if isinstance(wav, list):\n", + " wav = np.array(wav, dtype=np.float32)\n", + "\n", + " return wav\n", + "\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de l'inférence directe : {e}\")\n", + " raise e\n", + "\n", + "\n", + "def text_to_speech_long(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " max_chars_per_chunk: int = None,\n", + " show_progress: bool = True,\n", + " enable_text_splitting: bool = True,\n", + ") -> dict:\n", + " \"\"\"\n", + " Génère un fichier audio long (> 1 heure) à partir de texte.\n", + " \"\"\"\n", + " import torch\n", + "\n", + " # Configuration\n", + " max_chars = max_chars_per_chunk or Config.MAX_CHARS_PER_CHUNK\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Estimation initiale\n", + " estimated_duration = TextSplitter.estimate_audio_duration(text)\n", + " print(f\"\\n📝 Texte: {len(text):,} caractères\")\n", + " print(f\"⏱️ Durée estimée: {ProgressTracker._format_time(estimated_duration)}\")\n", + "\n", + " # Découper le texte\n", + " chunks = TextSplitter.split_for_long_audio(text, max_chars=max_chars)\n", + " print(f\"📦 Chunks: {len(chunks)}\")\n", + "\n", + " # Initialiser la progression\n", + " progress = None\n", + " if show_progress:\n", + " progress = ProgressTracker(len(chunks), \"🎙️ Synthèse\")\n", + "\n", + " # Générer l'audio chunk par chunk\n", + " audio_chunks = []\n", + "\n", + " for i, chunk in enumerate(chunks):\n", + " chunk_start = time.time()\n", + "\n", + " try:\n", + " wav = synthesize_chunk(\n", + " text=chunk,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + " audio_chunks.append(wav)\n", + "\n", + " except Exception as e:\n", + " print(f\"\\n⚠️ Erreur chunk {i + 1}: {e}\")\n", + " # Continuer avec les autres chunks\n", + " continue\n", + "\n", + " # Libérer la mémoire GPU périodiquement\n", + " if _device_name.startswith(\"cuda\") and (i + 1) % 10 == 0:\n", + " torch.cuda.empty_cache()\n", + "\n", + " chunk_duration = time.time() - chunk_start\n", + " if progress:\n", + " progress.update(chunk_duration)\n", + "\n", + " if not audio_chunks:\n", + " raise RuntimeError(\"Aucun audio généré\")\n", + "\n", + " print(\"\\n🔗 Concaténation des chunks...\")\n", + "\n", + " # Concaténer avec crossfade\n", + " final_audio = AudioProcessor.concatenate_chunks(\n", + " audio_chunks, Config.SAMPLE_RATE, Config.CROSSFADE_DURATION\n", + " )\n", + "\n", + " # Libérer les chunks de la mémoire\n", 
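+ "    # (Ordre de grandeur, à titre indicatif : pour ~1 h d'audio à 24 kHz en\n",
+ "    #  float32, les chunks cumulés représentent environ 24000 * 3600 * 4 octets,\n",
+ "    #  soit ~345 Mo ; on les libère donc avant le post-traitement.)\n",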
+ " del audio_chunks\n", + " gc.collect()\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " print(\"✨ Post-traitement...\")\n", + " final_audio = AudioProcessor.enhance(\n", + " final_audio, Config.SAMPLE_RATE, normalize=True, warmth=True\n", + " )\n", + " else:\n", + " final_audio = AudioProcessor.normalize(final_audio)\n", + "\n", + " # Convertir en int16\n", + " final_audio = (final_audio * 32767).astype(np.int16)\n", + "\n", + " # Générer le nom de fichier\n", + " if output_path is None:\n", + " h = hashlib.md5(text[:100].encode()).hexdigest()[:8]\n", + " output_path = f\"tts_long_{voice}_{h}.wav\"\n", + "\n", + " # Dossier de sortie\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarder\n", + " print(f\"💾 Sauvegarde: {final_path}\")\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(final_audio.tobytes())\n", + "\n", + " # Calculer la durée réelle\n", + " duration = len(final_audio) / Config.SAMPLE_RATE\n", + "\n", + " print(f\"\\n✅ Audio généré avec succès!\")\n", + " print(f\" 📁 Fichier: {final_path}\")\n", + " print(f\" ⏱️ Durée: {ProgressTracker._format_time(duration)}\")\n", + " print(f\" 📦 Chunks: {len(chunks)}\")\n", + " print(f\" 🎤 Voix: {voice}\")\n", + "\n", + " return {\n", + " \"path\": str(final_path),\n", + " \"sample_rate\": Config.SAMPLE_RATE,\n", + " \"duration_seconds\": duration,\n", + " \"duration_formatted\": ProgressTracker._format_time(duration),\n", + " \"audio_data\": final_audio,\n", + " \"voice\": voice,\n", + " \"language\": language,\n", + " \"device\": _device_name,\n", + " \"chunks_count\": len(chunks),\n", + " \"text_length\": len(text),\n", + " }\n", + "\n", + "\n", + "def text_to_speech(\n", + " text: str,\n", + " voice: str = \"female_fr\",\n", + " language: str = \"fr\",\n", + " output_path: Optional[str] = None,\n", + " enhance: bool = False,\n", + " use_gdrive: bool = False,\n", + " gdrive_folder: str = None,\n", + " enable_text_splitting: bool = True,\n", + ") -> dict:\n", + " \"\"\"\n", + " Génère un fichier audio à partir de texte avec XTTS v2.\n", + " \"\"\"\n", + " # Basculer automatiquement vers la version long pour textes > 10000 chars\n", + " if len(text) > 10000:\n", + " print(\"📢 Texte long détecté - utilisation de text_to_speech_long()\")\n", + " return text_to_speech_long(\n", + " text=text,\n", + " voice=voice,\n", + " language=language,\n", + " output_path=output_path,\n", + " enhance=enhance,\n", + " use_gdrive=use_gdrive,\n", + " gdrive_folder=gdrive_folder,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + "\n", + " voice_path = get_voice_path(voice)\n", + "\n", + " # Générer l'audio avec enable_text_splitting\n", + " wav = synthesize_chunk(\n", + " text=text,\n", + " voice_path=voice_path,\n", + " language=language,\n", + " enable_text_splitting=enable_text_splitting,\n", + " )\n", + "\n", + " # Post-traitement\n", + " if enhance:\n", + " audio = AudioProcessor.enhance(wav, Config.SAMPLE_RATE)\n", + " else:\n", + " audio = AudioProcessor.normalize(wav)\n", + "\n", + " audio = (audio * 32767).astype(np.int16)\n", + "\n", + " # Nom de fichier\n", + " if 
output_path is None:\n", + " h = hashlib.md5(text.encode()).hexdigest()[:8]\n", + " output_path = f\"tts_{voice}_{h}.wav\"\n", + "\n", + " # Dossier de sortie\n", + " if use_gdrive:\n", + " folder = Path(gdrive_folder or Config.GDRIVE_FOLDER)\n", + " folder.mkdir(parents=True, exist_ok=True)\n", + " final_path = folder / Path(output_path).name\n", + " else:\n", + " final_path = Path(output_path)\n", + "\n", + " # Sauvegarder\n", + " with wave.open(str(final_path), \"wb\") as wav_file:\n", + " wav_file.setnchannels(1)\n", + " wav_file.setsampwidth(2)\n", + " wav_file.setframerate(Config.SAMPLE_RATE)\n", + " wav_file.writeframes(audio.tobytes())\n", + "\n", + " duration = len(audio) / Config.SAMPLE_RATE\n", + "\n", + " print(f\"✓ Audio généré: {final_path}\")\n", + " print(f\" Durée: {duration:.2f}s | Voix: {voice}\")\n", + "\n", + " return {\n", + " \"path\": str(final_path),\n", + " \"sample_rate\": Config.SAMPLE_RATE,\n", + " \"duration_seconds\": duration,\n", + " \"audio_data\": audio,\n", + " \"voice\": voice,\n", + " \"language\": language,\n", + " \"device\": _device_name,\n", + " }\n", + "\n", + "\n", + "# ==============================================================================\n", + "# UTILITIES\n", + "# ==============================================================================\n", + "\n", + "\n", + "def preview_audio(result: dict) -> None:\n", + " \"\"\"Prévisualise l'audio dans le notebook.\"\"\"\n", + " from IPython.display import Audio, display\n", + "\n", + " audio = result[\"audio_data\"]\n", + " if audio.dtype == np.int16:\n", + " audio = audio.astype(np.float32) / 32768.0\n", + "\n", + " display(Audio(audio, rate=result[\"sample_rate\"]))\n", + "\n", + "\n", + "def list_voices() -> list:\n", + " \"\"\"Liste les voix disponibles.\"\"\"\n", + " return list(Config.PRESET_VOICES.keys())\n", + "\n", + "\n", + "def list_languages() -> list:\n", + " \"\"\"Liste les langues supportées.\"\"\"\n", + " return [\n", + " \"en\",\n", + " \"es\",\n", + " \"fr\",\n", + " \"de\",\n", + " \"it\",\n", + " \"pt\",\n", + " \"pl\",\n", + " \"tr\",\n", + " \"ru\",\n", + " \"nl\",\n", + " \"cs\",\n", + " \"ar\",\n", + " \"zh-cn\",\n", + " \"ja\",\n", + " \"hu\",\n", + " \"ko\",\n", + " \"hi\",\n", + " ]\n", + "\n", + "\n", + "def clear_cache():\n", + " \"\"\"Libère la mémoire.\"\"\"\n", + " global _tts_model\n", + " import torch\n", + "\n", + " _tts_model = None\n", + " gc.collect()\n", + "\n", + " if _device_name.startswith(\"cuda\"):\n", + " torch.cuda.empty_cache()\n", + "\n", + " print(\"✓ Cache vidé\")\n", + "\n", + "\n", + "def estimate_duration(text: str) -> dict:\n", + " \"\"\"\n", + " Estime la durée audio pour un texte.\n", + " \"\"\"\n", + " duration = TextSplitter.estimate_audio_duration(text)\n", + " chunks = len(TextSplitter.split_for_long_audio(text))\n", + "\n", + " return {\n", + " \"chars\": len(text),\n", + " \"estimated_seconds\": duration,\n", + " \"estimated_formatted\": ProgressTracker._format_time(duration),\n", + " \"chunks_estimate\": chunks,\n", + " }\n", + "\n", + "\n", + "# ==============================================================================\n", + "# ALIASES\n", + "# ==============================================================================\n", + "\n", + "tts = text_to_speech\n", + "tts_long = text_to_speech_long\n", + "\n", + "\n", + "# ==============================================================================\n", + "# INITIALIZATION\n", + "# ==============================================================================\n", + "\n", + "\n", 
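+ "# Ébauche optionnelle (hypothétique) : les cellules d'exemple appellent mount_gdrive()\n",
+ "# sans que la fonction apparaisse dans ce module ; si elle n'est pas déjà définie\n",
+ "# ailleurs dans le notebook, cette version minimale s'appuie sur google.colab.drive.mount.\n",
+ "if \"mount_gdrive\" not in globals():\n",
+ "\n",
+ "    def mount_gdrive(mount_point: str = \"/content/drive\") -> None:\n",
+ "        \"\"\"Monte Google Drive (utile uniquement avec use_gdrive=True).\"\"\"\n",
+ "        try:\n",
+ "            from google.colab import drive  # disponible uniquement sous Colab\n",
+ "\n",
+ "            drive.mount(mount_point)\n",
+ "            print(f\"✓ Google Drive monté: {mount_point}\")\n",
+ "        except ImportError:\n",
+ "            print(\"⚠️ google.colab indisponible : montage ignoré (exécution hors Colab).\")\n",
+ "\n",
+ "\n",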
+ "def init():\n", + " \"\"\"Initialise le module.\"\"\"\n", + " detect_device()\n", + " print(\"✅ Module XTTS v2 Long Audio chargé\")\n", + " print(f\" Device: {_device_name}\")\n", + " print(f\" Voix: {list_voices()}\")\n", + " print(f\" enable_text_splitting: activé par défaut\")\n", + " # Add this line to explicitly set torchaudio backend\n", + " try:\n", + " import torchaudio\n", + "\n", + " # This line is intentionally commented out as set_audio_backend is not available in all torchaudio versions.\n", + " # The `soundfile` library should be picked up automatically if torchcodec is not installed.\n", + " # torchaudio.set_audio_backend(\"soundfile\")\n", + " print(\n", + " \"💡 torchaudio backend is expected to use 'soundfile' as torchcodec is no longer installed.\"\n", + " )\n", + " except ImportError:\n", + " print(\"⚠️ torchaudio not found.\")\n", + " except Exception as e:\n", + " print(f\"⚠️ Erreur lors de la configuration de torchaudio: {e}\")\n", + "\n", + "\n", + "# Auto-init\n", + "if __name__ != \"__main__\":\n", + " try:\n", + " detect_device()\n", + " except:\n", + " pass\n", + "\n", + "\n", + "# ==============================================================================\n", + "# EXAMPLE USAGE\n", + "# ==============================================================================\n", + "\n", + "if __name__ == \"__main__\":\n", + " # Installation si nécessaire\n", + " install_dependencies()\n", + "\n", + " # Initialisation\n", + " init()\n", + "\n", + " # Exemple avec texte court\n", + " print(\"\\n\" + \"=\" * 60)\n", + " print(\"EXEMPLE 1: Texte court\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "FREsMU-QLEc4" + }, + "outputs": [], + "source": [ + "text_to_speech_to_synthetise = \"Ce document présente les Manifold-Constrained Hyper-Connections (mHC), une architecture novatrice conçue par DeepSeek-AI pour stabiliser l'entraînement des grands modèles de langage. Bien que les Hyper-Connections (HC) classiques améliorent les performances en élargissant le flux résiduel, leur nature non contrainte provoque souvent une instabilité numérique et des problèmes de divergence du signal. Pour remédier à cela, les auteurs utilisent l'algorithme de Sinkhorn-Knopp afin de projeter les connexions sur une variété de matrices doublement stochastiques, préservant ainsi la propriété de mappage d'identité. Cette approche garantit une propagation saine du signal tout en optimisant l'efficacité matérielle grâce à la fusion de noyaux et à des stratégies de mémorisation sélective. Les résultats expérimentaux démontrent que mHC surpasse les méthodes existantes en termes de scalabilité et de capacités de raisonnement sur divers tests de référence. 
En intégrant ces contraintes géométriques rigoureuses, le cadre mHC offre une solution robuste pour l'évolution des architectures neuronales à grande échelle.\"\n", + "\n", + "voice_gender = \"female_fr\"\n", + "# ['female_fr', 'male_fr']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 250 + }, + "id": "2Any3vzyK8zF", + "outputId": "2d3ff69d-09bc-4334-f13f-7be141559de8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Voix disponibles: ['female_fr', 'male_fr']\n", + "📥 Téléchargement de la voix 'female_fr'...\n", + "🔄 Chargement du modèle XTTS v2...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1.87G/1.87G [00:35<00:00, 52.2MiB/s]\n", + "4.37kiB [00:00, 5.36MiB/s]\n", + "361kiB [00:00, 98.7MiB/s]\n", + "100%|██████████| 32.0/32.0 [00:00<00:00, 63.3kiB/s]\n", + "100%|██████████| 7.75M/7.75M [00:00<00:00, 102MiB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Modèle chargé\n", + "✓ Audio généré: tts_female_fr_a22d596a.wav\n", + " Durée: 61.17s | Voix: female_fr\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# -----------------------------------------------------------------------------\n", + "# CELLULE 3: Exemples d'utilisation\n", + "# -----------------------------------------------------------------------------\n", + "\n", + "# Montage Google Drive (optionnel)\n", + "# mount_gdrive()\n", + "\n", + "# Liste des voix disponibles\n", + "print(\"Voix disponibles:\", list_voices())\n", + "\n", + "# Génération simple\n", + "result = text_to_speech(text_to_speech_to_synthetise, voice=voice_gender)\n", + "\n", + "# Prévisualisation\n", + "preview_audio(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2565nagRK0eb", + "outputId": "38fb657a-44c2-4ce0-c39b-3cebe22149d2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Voix disponibles: ['female_fr', 'male_fr']\n", + "✓ Audio généré: tts_female_fr_a22d596a.wav\n", + " Durée: 63.25s | Voix: female_fr\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# -----------------------------------------------------------------------------\n", + "# CELLULE 3: Exemples d'utilisation\n", + "# -----------------------------------------------------------------------------\n", + "\n", + "# Montage Google Drive (optionnel)\n", + "# mount_gdrive()\n", + "\n", + "# Liste des voix disponibles\n", + "print(\"Voix disponibles:\", list_voices())\n", + "\n", + "# Génération simple\n", + "result = text_to_speech(text_to_speech_to_synthetise, voice=voice_gender)\n", + "\n", + "# Prévisualisation\n", + "preview_audio(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Naxv1wHEp6NQ", + "outputId": "33d8bb87-0781-465a-b3d5-9b35f85de770" + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'stop' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + 
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/tmp/ipython-input-3957423419.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstop\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'stop' is not defined" + ] + } + ], + "source": [ + "stop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ulSK6K1op63B" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BRhVnXgSE7Yd" + }, + "outputs": [], + "source": [ + "# Lire le fichier\n", + "with open(\"mon_texte_long.txt\", \"r\", encoding=\"utf-8\") as f:\n", + " texte_complet = f.read()\n", + "\n", + "# Lancer la génération\n", + "text_to_speech_long(text=texte_complet, voice=\"female_fr\", language=\"fr\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4o0EdnBHp7la" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KaWW0-DIMy7R" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "authorship_tag": "ABX9TyMWr0Dv6vrMdJNJnihMF5Pg", + "gpuType": "T4", + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file