-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvoice_analyzer.py
More file actions
executable file
·172 lines (138 loc) · 6.29 KB
/
Copy pathvoice_analyzer.py
File metadata and controls
executable file
·172 lines (138 loc) · 6.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import librosa
import numpy as np
import os
from pydub import AudioSegment
def analyze_speaker_pitch(audio_path, segments, speaker_id):
"""
Analyzes the pitch of a specific speaker in the audio file.
Returns the average pitch (F0) in Hz.
"""
print(f"Analyzing pitch for {speaker_id}...")
try:
# Load the full audio file
y, sr = librosa.load(audio_path, sr=None)
speaker_pitches = []
for seg in segments:
if seg.get("speaker") != speaker_id:
continue
# Extract segment
start_sample = int(seg["start"] * sr)
end_sample = int(seg["end"] * sr)
if end_sample - start_sample < 512: # Skip very short segments
continue
y_seg = y[start_sample:end_sample]
# Extract pitch (F0) using piptrack
# fmin=50, fmax=400 covers most human speech range
f0, voiced_flag, voiced_probs = librosa.pyin(
y_seg,
fmin=librosa.note_to_hz('C2'),
fmax=librosa.note_to_hz('C6')
)
# Filter out unvoiced parts (NaNs)
valid_f0 = f0[~np.isnan(f0)]
if len(valid_f0) > 0:
speaker_pitches.extend(valid_f0)
if not speaker_pitches:
print(f"Warning: No valid pitch data found for {speaker_id}")
return None
avg_pitch = np.mean(speaker_pitches)
print(f"Average pitch for {speaker_id}: {avg_pitch:.2f} Hz")
return avg_pitch
except Exception as e:
print(f"Error analyzing pitch: {e}")
return None
def detect_gender(pitch_hz):
"""
Detects gender based on average pitch.
Threshold is typically around 165-175 Hz.
"""
if pitch_hz is None:
return "MALE" # Default
# Simple threshold
if pitch_hz < 170:
return "MALE"
else:
return "FEMALE"
def get_pitch_adjustment(target_pitch, gender):
"""
Calculates the pitch shift (in Hz or semitones) needed for TTS.
Edge-TTS accepts pitch in Hz (e.g. +50Hz) or relative semitones (not supported directly by all voices, but we can try).
Actually, edge-tts supports +XHz or +Xst (semitones).
We will try to map the speaker's pitch relative to the average for their gender.
Avg Male: ~120 Hz
Avg Female: ~210 Hz
"""
if target_pitch is None:
return "+0Hz"
base_pitch = 120 if gender == "MALE" else 210
# Calculate difference
diff = target_pitch - base_pitch
# Dampen the effect so it's not too extreme
# e.g. if diff is +50Hz, we might only apply +25Hz shift to keep it natural
adjustment = int(diff * 0.5)
# Format for edge-tts
sign = "+" if adjustment >= 0 else ""
return f"{sign}{adjustment}Hz"
def get_speaker_sample(audio_path, segments, speaker, output_dir, target_duration_sec=10):
"""
Extracts a sample audio clip for the specified speaker by combining multiple segments
until a target duration (e.g., 10 seconds) is reached.
Returns the path to the sample file.
"""
speaker_segments = [s for s in segments if s.get("speaker") == speaker]
if not speaker_segments:
return None
# Sort segments by length (longest first) to get the best continuous speech chunks
speaker_segments.sort(key=lambda s: s["end"] - s["start"], reverse=True)
audio = AudioSegment.from_wav(audio_path)
combined_sample = AudioSegment.empty()
current_duration_ms = 0
target_duration_ms = target_duration_sec * 1000
for seg in speaker_segments:
start_time = seg["start"] * 1000
end_time = seg["end"] * 1000
chunk = audio[start_time:end_time]
# Add a tiny bit of silence between chunks to keep it natural
combined_sample += chunk + AudioSegment.silent(duration=100)
current_duration_ms += len(chunk)
if current_duration_ms >= target_duration_ms:
break
# Cap the audio length to prevent OOM in XTTS (max ~12 seconds)
max_duration = target_duration_ms + 2000
if len(combined_sample) > max_duration:
combined_sample = combined_sample[:max_duration]
sample_dir = os.path.join(output_dir, "speaker_samples")
os.makedirs(sample_dir, exist_ok=True)
sample_path = os.path.join(sample_dir, f"{speaker}_sample.wav")
combined_sample.export(sample_path, format="wav")
# Apply DeepFilterNet to clean the sample and remove demucs artifacts
try:
print(f"✨ Enhancing voice sample for {speaker} using DeepFilterNet...")
import subprocess
import sys
# Опциональная проверка на установленную библиотеку
try:
import df.enhance
except ImportError:
print("⚠️ ВНИМАНИЕ: DeepFilterNet не установлен! Очистка голоса пропущена.")
print("💡 Для идеального клонирования установите: pip install deepfilternet (Внимание: скачает ~3 ГБ PyTorch)")
return sample_path
clean_sample_path = os.path.join(sample_dir, f"{speaker}_sample_DeepFilterNet3.wav")
# Run deepFilter directly via Python to bypass PATH issues
cmd = [
sys.executable, "-c",
"import sys; from df.enhance import run; sys.argv=['deepFilter', sys.argv[1], '-o', sys.argv[2]]; sys.exit(run())",
sample_path, sample_dir
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
print(f"DeepFilterNet stderr: {result.stderr}")
raise Exception(f"Command failed with exit status {result.returncode}")
if os.path.exists(clean_sample_path):
print("✨ Voice sample successfully cleaned!")
return clean_sample_path
else:
print("⚠️ DeepFilterNet output file not found. Using original sample.")
except Exception as e:
print(f"⚠️ DeepFilterNet enhancement failed: {e}. Using original uncleaned sample.")
return sample_path