forked from LiuYunPlayer/DiffSingerForTuneLab
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDiffSingerSpeakerMix.cs
More file actions
80 lines (72 loc) · 3.41 KB
/
Copy pathDiffSingerSpeakerMix.cs
File metadata and controls
80 lines (72 loc) · 3.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
using System;
using System.Collections.Generic;
using System.Linq;
namespace DiffSingerForTuneLab;
// 说话人逐帧混合(模型无关,每会话每块算一次、acoustic/pitch/variance 三域共享):
// 忠实移植 OpenUtau DiffSingerSpeakerEmbedManager.PhraseSpeakerEmbedByFrame 的逐帧嵌入混合。
// 每条 mix:<suffix> 曲线 [0,100]·0.01 累积到该 suffix 的逐帧权重;逐帧标准化——
// Σ>1 时各 suffix 按和归一化,否则默认 suffix(part 级 KeySpeaker)补 1-Σ。
// 权重与模型无关、按 suffix 定义;各域用各自的 emb 解析器 ToEmbedding(预测器/声学的 speakers 表与 .emb 各异)。
// 无 mix 轨(或单说话人)时退化为「默认 suffix 恒权重 1」⇒ 等价旧的单 emb 逐帧广播。
/// <summary>
/// Per-frame speaker mixing weights, keyed by speaker suffix.
/// The weights themselves carry no embedding data: <see cref="ToEmbedding"/> combines them with
/// whatever embedding resolver the caller supplies, so one instance can be reused with several resolvers.
/// Per the original author's notes, the standardization step mirrors OpenUtau's
/// DiffSingerSpeakerEmbedManager.PhraseSpeakerEmbedByFrame.
/// </summary>
public sealed class DiffSingerSpeakerMix
{
    // One entry per suffix. For every frame, the weights across all entries sum to 1.
    readonly (string Suffix, float[] Weight)[] mEntries;

    /// <summary>Number of frames covered by each per-suffix weight array.</summary>
    public int FrameCount { get; }

    DiffSingerSpeakerMix((string Suffix, float[] Weight)[] entries, int frameCount)
    {
        mEntries = entries;
        FrameCount = frameCount;
    }

    /// <summary>
    /// Builds the per-frame weights from a default suffix and a set of sampled mix tracks.
    /// </summary>
    /// <param name="defaultSuffix">
    /// Suffix that always gets an entry (even with no tracks) and absorbs the remainder
    /// <c>1 - sum</c> on frames where the track weights add up to at most 1.
    /// </param>
    /// <param name="tracks">
    /// Mix tracks as (suffix, per-frame samples). Each sample is scaled by 0.01 and accumulated into the
    /// weight of its suffix; several tracks with the same suffix accumulate into the same weights.
    /// NaN samples, frames beyond the end of a sample array, and a null sample array contribute nothing.
    /// Non-finite samples (±Infinity, or values overflowing <see cref="float"/>) are ignored as well,
    /// because they would otherwise turn the normalized weights into NaN.
    /// </param>
    /// <param name="nFrames">Number of frames to produce; must not be negative.</param>
    /// <returns>A mix whose per-frame weights sum to 1 across all suffixes.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="defaultSuffix"/> or <paramref name="tracks"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="nFrames"/> is negative.</exception>
    /// <exception cref="ArgumentException">A track has a null suffix.</exception>
    public static DiffSingerSpeakerMix Create(
        string defaultSuffix, IReadOnlyList<(string Suffix, double[] Sampled)> tracks, int nFrames)
    {
        if (defaultSuffix is null)
        {
            throw new ArgumentNullException(nameof(defaultSuffix));
        }
        if (tracks is null)
        {
            throw new ArgumentNullException(nameof(tracks));
        }
        if (nFrames < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(nFrames), nFrames, "Frame count must not be negative.");
        }

        var bySuffix = new Dictionary<string, float[]>(StringComparer.Ordinal);

        // Returns the weight array of a suffix, creating a zero-filled one on first use.
        float[] WeightsFor(string suffix)
        {
            if (!bySuffix.TryGetValue(suffix, out var weights))
            {
                weights = new float[nFrames];
                bySuffix[suffix] = weights;
            }
            return weights;
        }

        // Registered first so the default suffix always has an entry, even without any mix track.
        var defaultWeight = WeightsFor(defaultSuffix);

        foreach (var (suffix, sampled) in tracks)
        {
            if (suffix is null)
            {
                throw new ArgumentException("Track suffix must not be null.", nameof(tracks));
            }

            // The suffix gets an entry even when its samples contribute nothing.
            var weights = WeightsFor(suffix);
            if (sampled is null)
            {
                continue;
            }

            // Frames past the end of the sample array are treated as "no value".
            int covered = Math.Min(nFrames, sampled.Length);
            for (int f = 0; f < covered; f++)
            {
                float contribution = (float)(sampled[f] * 0.01);

                // Skips NaN as well as ±Infinity / float overflow: an infinite sum would
                // normalize to inf/inf = NaN and break the "weights sum to 1" invariant.
                if (float.IsFinite(contribution))
                {
                    weights[f] += contribution;
                }
            }
        }

        // Per-frame standardization: scale down when the sum exceeds 1,
        // otherwise give the missing share to the default suffix.
        for (int f = 0; f < nFrames; f++)
        {
            float sum = 0;
            foreach (var weights in bySuffix.Values)
            {
                sum += weights[f];
            }

            if (sum > 1)
            {
                foreach (var weights in bySuffix.Values)
                {
                    weights[f] /= sum;
                }
            }
            else
            {
                defaultWeight[f] += 1 - sum;
            }
        }

        return new DiffSingerSpeakerMix(
            bySuffix.Select(kv => (kv.Key, kv.Value)).ToArray(), nFrames);
    }

    /// <summary>
    /// Mixes the per-frame speaker embedding as the weighted sum of the per-suffix embeddings.
    /// </summary>
    /// <param name="resolveEmb">
    /// Maps a suffix to its embedding vector. It is called once for every suffix of the mix,
    /// including suffixes whose weight is zero on every frame.
    /// </param>
    /// <param name="hidden">Number of embedding components written per frame; must not be negative.</param>
    /// <returns>
    /// A flat array of length <c>FrameCount * hidden</c>, laid out frame by frame:
    /// component <c>i</c> of frame <c>f</c> is at index <c>f * hidden + i</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="resolveEmb"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="hidden"/> is negative.</exception>
    /// <exception cref="ArgumentException">
    /// A suffix with a non-zero weight resolved to a null embedding or to one shorter than <paramref name="hidden"/>.
    /// </exception>
    /// <exception cref="OverflowException"><c>FrameCount * hidden</c> does not fit in an <see cref="int"/>.</exception>
    public float[] ToEmbedding(Func<string, float[]> resolveEmb, int hidden)
    {
        if (resolveEmb is null)
        {
            throw new ArgumentNullException(nameof(resolveEmb));
        }
        if (hidden < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(hidden), hidden, "Embedding size must not be negative.");
        }

        // checked: a wrapped product would allocate a wrongly sized buffer instead of failing.
        var spk = new float[checked(FrameCount * hidden)];

        foreach (var (suffix, weight) in mEntries)
        {
            var emb = resolveEmb(suffix);

            // Validated lazily (only once a non-zero weight needs the embedding) so that a suffix
            // that never contributes does not fail just because it has no usable embedding.
            bool usable = hidden == 0 || (emb != null && emb.Length >= hidden);

            for (int f = 0; f < FrameCount; f++)
            {
                float w = weight[f];
                if (w == 0)
                {
                    continue; // exact zero: nothing to add for this frame
                }
                if (!usable)
                {
                    throw new ArgumentException(
                        $"Embedding resolved for suffix '{suffix}' is null or shorter than {hidden} elements.",
                        nameof(resolveEmb));
                }

                int offset = f * hidden;
                for (int i = 0; i < hidden; i++)
                {
                    spk[offset + i] += w * emb[i];
                }
            }
        }

        return spk;
    }
}