-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaudio_processor.py
More file actions
executable file
·128 lines (99 loc) · 4.75 KB
/
Copy pathaudio_processor.py
File metadata and controls
executable file
·128 lines (99 loc) · 4.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import subprocess
from pydub import AudioSegment
import shutil
import math
def separate_vocals(audio_path, output_dir, chunk_size=600):
    """Separate vocals from a WAV file with Demucs, chunking long inputs.

    Inputs no longer than ``chunk_size`` seconds are handed to Demucs in a
    single run. Longer inputs are cut into consecutive ``chunk_size``-second
    pieces, each piece is separated on its own, and the per-piece stems are
    concatenated back into two full-length files.

    Args:
        audio_path: Path to the input WAV file.
        output_dir: Directory that receives the Demucs output. In the chunked
            case a ``temp_chunks`` sub-directory is created here and removed
            again before returning.
        chunk_size: Maximum length, in seconds, of audio passed to a single
            Demucs run. Defaults to 600 (10 minutes).

    Returns:
        A ``(vocals_path, no_vocals_path)`` tuple of WAV file paths located
        under ``<output_dir>/htdemucs/<input name without extension>/``.

    Raises:
        ValueError: If ``chunk_size`` is shorter than one millisecond.
        subprocess.CalledProcessError: If a Demucs run exits with an error.
        FileNotFoundError: If Demucs did not produce the expected stem files.
    """
    # Validate first so that a bad argument fails before any heavy work.
    chunk_ms = int(chunk_size * 1000)
    if chunk_ms < 1:
        raise ValueError(f"chunk_size must be a positive number of seconds, got {chunk_size!r}")

    # Imported here rather than at module level, as in the original code.
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Demucs using device: {device}")

    # The whole file is loaded: its length decides whether chunking is
    # needed, and the chunks are sliced out of this in-memory segment.
    audio = AudioSegment.from_wav(audio_path)
    total_ms = len(audio)  # pydub reports segment length in milliseconds

    if total_ms <= chunk_ms:
        # Short enough to process in one go.
        return _run_demucs(audio_path, output_dir, device)

    duration_sec = total_ms / 1000.0
    print(f"Audio is too long ({duration_sec:.2f}s). Splitting into {chunk_size}s chunks...")

    chunks_dir = os.path.join(output_dir, "temp_chunks")
    os.makedirs(chunks_dir, exist_ok=True)
    # Every chunk shares one Demucs output root; Demucs is expected to create
    # a separate sub-folder per input file name, so the stems do not collide.
    chunk_out_dir = os.path.join(chunks_dir, "out")

    chunk_starts = range(0, total_ms, chunk_ms)
    num_chunks = len(chunk_starts)
    vocals_chunks = []
    no_vocals_chunks = []
    try:
        for i, start_ms in enumerate(chunk_starts):
            end_ms = min(start_ms + chunk_ms, total_ms)
            chunk_path = os.path.join(chunks_dir, f"chunk_{i}.wav")
            # export() returns the open output file handle; close it
            # explicitly instead of relying on garbage collection.
            audio[start_ms:end_ms].export(chunk_path, format="wav").close()

            print(f"Processing chunk {i+1}/{num_chunks}...")
            v_path, nv_path = _run_demucs(chunk_path, chunk_out_dir, device)
            vocals_chunks.append(AudioSegment.from_wav(v_path))
            no_vocals_chunks.append(AudioSegment.from_wav(nv_path))

            # The stems are now held in memory, so the on-disk copies can go
            # right away; this keeps temp disk usage bounded for long inputs.
            for used_path in (chunk_path, v_path, nv_path):
                os.remove(used_path)

        print("Concatenating chunks...")
        # sum() works because AudioSegment supports "0 + segment".
        full_vocals = sum(vocals_chunks)
        full_no_vocals = sum(no_vocals_chunks)

        # Mirror the folder layout of the non-chunked path so callers get the
        # same result location either way.
        track_name = os.path.splitext(os.path.basename(audio_path))[0]
        final_out_dir = os.path.join(output_dir, "htdemucs", track_name)
        os.makedirs(final_out_dir, exist_ok=True)
        final_vocals_path = os.path.join(final_out_dir, "vocals.wav")
        final_no_vocals_path = os.path.join(final_out_dir, "no_vocals.wav")
        full_vocals.export(final_vocals_path, format="wav").close()
        full_no_vocals.export(final_no_vocals_path, format="wav").close()
        return final_vocals_path, final_no_vocals_path
    finally:
        # Remove the temporary chunk tree whether or not processing succeeded.
        if os.path.exists(chunks_dir):
            shutil.rmtree(chunks_dir)
def _run_demucs(audio_path, output_dir, device):
    """Run the Demucs command-line tool on one file and locate its stems.

    Args:
        audio_path: Path of the audio file to separate.
        output_dir: Output root passed to Demucs via ``-o``; created if it
            does not exist yet.
        device: Device name passed to Demucs via ``-d``.

    Returns:
        A ``(vocals_path, no_vocals_path)`` tuple pointing at the files
        expected under ``<output_dir>/htdemucs/<input name without extension>/``.

    Raises:
        subprocess.CalledProcessError: If Demucs exits with a non-zero status
            (a message is printed before the error is re-raised).
        FileNotFoundError: If either expected stem file is missing afterwards.
    """
    os.makedirs(output_dir, exist_ok=True)

    model = "htdemucs"
    command = [
        "demucs",
        "-n", model,
        "--two-stems=vocals",
        "-d", device,
        "-o", output_dir,
        audio_path,
    ]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as err:
        print(f"Demucs failed: {err}")
        raise

    # The stems are looked up in a folder named after the model, then the
    # input file name without its extension.
    track_name, _ = os.path.splitext(os.path.basename(audio_path))
    stem_dir = os.path.join(output_dir, model, track_name)
    vocals_path = os.path.join(stem_dir, "vocals.wav")
    no_vocals_path = os.path.join(stem_dir, "no_vocals.wav")

    if not (os.path.exists(vocals_path) and os.path.exists(no_vocals_path)):
        raise FileNotFoundError(f"Demucs output files not found for {audio_path}")
    return vocals_path, no_vocals_path
def _level_to_dbfs(segment, target_dbfs):
    """Return ``segment`` with gain applied so its dBFS equals ``target_dbfs``.

    A segment whose level is ``-inf`` dBFS (silent or empty) is returned
    unchanged, because the gain needed to reach the target would be infinite.
    """
    current_dbfs = segment.dBFS
    if current_dbfs == float("-inf"):
        return segment
    return segment.apply_gain(target_dbfs - current_dbfs)


def merge_audio(vocals_path, background_path, output_path):
    """Mix a vocal track over a background track with level normalization.

    Both tracks are first brought to fixed average levels (background to
    -14 dBFS, vocals to -8 dBFS), then the vocals are overlaid on the
    background and the mix is written as a WAV file.

    Args:
        vocals_path: Path to the vocals WAV file.
        background_path: Path to the background (music) WAV file.
        output_path: Path where the merged WAV file is written.

    Note:
        The mix is produced by ``bg.overlay(vocals)``; per the pydub
        documentation the result has the length of the background track.
    """
    print(f"Merging {vocals_path} and {background_path} (with dynamic volume leveling)...")
    vocals = AudioSegment.from_wav(vocals_path)
    bg = AudioSegment.from_wav(background_path)

    # Dynamic volume leveling (normalization by dBFS).
    # 1. Bring the background music to -14 dBFS so it is neither too loud
    #    nor clipping.
    target_bg_dbfs = -14.0
    bg = _level_to_dbfs(bg, target_bg_dbfs)

    # 2. Bring the voice to -8 dBFS, i.e. 6 dB louder than the music.
    target_vocals_dbfs = -8.0
    vocals = _level_to_dbfs(vocals, target_vocals_dbfs)

    # Lay the voice over the music.
    combined = bg.overlay(vocals)
    # export() returns the open output file handle; close it explicitly.
    combined.export(output_path, format="wav").close()
    print(f"Merged audio saved to {output_path}")