DiffSingerForTuneLab/DiffSingerPitch.cs at master · LiuYunPlayer/DiffSingerForTuneLab · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using TuneLab.SDK;

namespace DiffSingerForTuneLab;

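// Pitch predictor (dspitch): a faithful port of OpenUtau's DsPitch.Process.
//   Linguistic encoder (word mode with word_div/word_dur, or phoneme mode with ph_dur, decided by the
//   encoder's actual inputs) + pitch model;
//   builds note_midi / note_dur (frames) / note_rest from the notes (head/tail padding, rests inserted into
//   gaps, slur inheritance, tone filling for rest groups);
//   the base pitch is all 60 and retake is all true ⇒ the user's pitch is not consumed; the natural pitch
//   contour (per-frame MIDI) is predicted in full purely from the notes.
//   PEXP (expr) is fed a neutral 1.0 (full expressiveness) at this stage; steps is shared with the acoustic
//   model for now (OpenUtau has a separate DiffSingerStepsPitch; to be unified later).
// Once the caller (Render) has the predicted contour, it uses it in place of the rectangular note-step fallback
// in free regions (where the user pitch is NaN); where the user has drawn pitch, the user's values override it,
// and PITD/vibrato are always layered on top (see the agreed design: fill f0 in free regions + echo it back,
// retake all true and merge afterwards). No dspitch ⇒ returns null so the caller can degrade.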
public static class DiffSingerPitch
{
    /// <summary>
    /// Predicts a pitch contour for a phrase with the voicebank's pitch predictor (dspitch):
    /// runs the linguistic encoder, builds the note sequence, then runs the "pitch" model.
    /// </summary>
    /// <remarks>
    /// The "pitch" input is filled with 60 and "retake" with true for every frame, so the model
    /// predicts the whole contour from the notes instead of keeping any supplied pitch.
    /// </remarks>
    /// <param name="v">Predictor that owns the linguistic and pitch models; null yields a null result.</param>
    /// <param name="phones">Body phonemes, excluding the head/tail padding tokens.</param>
    /// <param name="notes">Notes the phrase is built from.</param>
    /// <param name="phDur">
    /// Padded per-token durations in frames: one entry per phoneme plus head and tail
    /// (length = phones.Count + 2). Their sum is used as the total frame count.
    /// </param>
    /// <param name="renderStart">
    /// Render start in seconds (phones[0].StartTime - head * frameSec, per the original author's note).
    /// </param>
    /// <param name="frameSec">Length of one frame in seconds.</param>
    /// <param name="mix">Speaker mix turned into the "spk_embed" input when the model declares it.</param>
    /// <param name="cfg">Voicebank configuration; chooses between the "steps" and "speedup" inputs.</param>
    /// <param name="steps">Step count forwarded to the acceleration input.</param>
    /// <param name="tensorCache">Passed through to <c>DiffSingerTensorCache.Run</c>.</param>
    /// <returns>
    /// The first output of the pitch model as a flat array (per-frame MIDI pitch, one value per frame,
    /// according to the original author's note), or null when there is no predictor, no "pitch" model,
    /// no phonemes or no notes.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="phDur"/> does not have phones.Count + 2 entries.
    /// </exception>
    public static float[]? Predict(
        DiffSingerPredictor? v, IReadOnlyList<PhonemeSpan> phones,
        IReadOnlyList<SynthesisNoteSnapshot> notes, int[] phDur,
        double renderStart, double frameSec, DiffSingerSpeakerMix mix, VoicebankConfig cfg, int steps, bool tensorCache)
    {
        if (v is null || !v.HasModel("pitch") || phones.Count == 0 || notes.Count == 0)
            return null;

        int hidden = v.HiddenSize;
        int nTokens = phones.Count + 2;
        int totalFrames = phDur.Sum();
        int head = DiffSingerFrames.HeadFrames, tail = DiffSingerFrames.TailFrames;

        // Fail fast: a wrong-sized phDur would otherwise only surface as a tensor-shape error,
        // and in word mode only after the linguistic model has already been run.
        if (phDur.Length != nTokens)
            throw new ArgumentException(
                $"phDur must have {nTokens} entries (phones plus head/tail padding) but has {phDur.Length}.",
                nameof(phDur));

        // ph_dur is fed as int64 to the pitch model and, in phoneme mode, to the encoder too: convert once.
        var phDurLong = Array.ConvertAll(phDur, x => (long)x);

        // Head and tail padding both use the "SP" token.
        long padToken = (long)v.PhonemeToken("SP");
        var tokens = phones.Select(p => (long)v.PhonemeToken(p.Symbol))
            .Prepend(padToken).Append(padToken).ToArray();

        // -- Linguistic encoder: word mode takes word_div/word_dur, otherwise phoneme mode takes ph_dur. --
        var lingInputs = new List<NamedOnnxValue> { NvL("tokens", tokens, nTokens) };
        if (v.LinguisticUsesWordBoundary)
        {
            var isVowel = phones.Select(p => v.IsVowel(p.Symbol)).ToArray();
            var (wordDiv, wordDur) = DiffSingerFrames.PaddedWordDivAndDur(isVowel, phDur);
            lingInputs.Add(NvL("word_div", wordDiv, wordDiv.Length));
            lingInputs.Add(NvL("word_dur", wordDur, wordDur.Length));
        }
        else
        {
            lingInputs.Add(NvL("ph_dur", phDurLong, nTokens));
        }
        if (v.Linguistic.InputMetadata.ContainsKey("languages"))
        {
            // The padding tokens get language id 0.
            var langs = phones.Select(p => v.LangId(PhonemeLanguage(p.Symbol))).Prepend(0L).Append(0L).ToArray();
            lingInputs.Add(NvL("languages", langs, nTokens));
        }
        var lingOut = DiffSingerTensorCache.Run(v.Linguistic, v.LinguisticHash, lingInputs, tensorCache);
        var enc = lingOut.First(o => o.Name == "encoder_out").AsTensor<float>();
        // Copy the encoder output into a tensor of our own before feeding it to the pitch model.
        var encDense = new DenseTensor<float>(enc.ToArray(), enc.Dimensions.ToArray());

        // -- Note sequence: head padding + each note (rests inserted into gaps) + tail padding. --
        var (noteMidi, noteDur, noteRest) = BuildNotes(v, phones, notes, renderStart, frameSec, totalFrames, head, tail);

        // -- Pitch model: pitch all 60 and retake all true, i.e. predict every frame. --
        var model = v.Model("pitch");
        var pitch = new float[totalFrames];
        Array.Fill(pitch, 60f);
        var retake = new bool[totalFrames];
        Array.Fill(retake, true);

        var inputs = new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor("encoder_out", encDense),
            NvF("note_midi", noteMidi, noteMidi.Length),
            NvL("note_dur", Array.ConvertAll(noteDur, x => (long)x), noteDur.Length),
            NvL("ph_dur", phDurLong, nTokens),
            NvF("pitch", pitch, totalFrames),
            NamedOnnxValue.CreateFromTensor("retake", new DenseTensor<bool>(retake, new[] { 1, totalFrames })),
        };

        AddAccel(inputs, model, cfg, steps);

        // Expressiveness (PEXP): a constant 1.0 for now; an editable PEXP track may come later.
        if (model.InputMetadata.ContainsKey("expr"))
        {
            var expr = new float[totalFrames];
            Array.Fill(expr, 1f);
            inputs.Add(NvF("expr", expr, totalFrames));
        }
        if (model.InputMetadata.ContainsKey("spk_embed"))
        {
            // NOTE(review): this shape needs ToEmbedding to return totalFrames * hidden values, yet
            // totalFrames is not passed to it -- confirm against DiffSingerSpeakerMix.ToEmbedding.
            var spk = mix.ToEmbedding(v.GetEmbedding, hidden);
            inputs.Add(NamedOnnxValue.CreateFromTensor("spk_embed",
                new DenseTensor<float>(spk, new[] { 1, totalFrames, hidden })));
        }
        if (model.InputMetadata.ContainsKey("note_rest"))
            inputs.Add(NamedOnnxValue.CreateFromTensor("note_rest",
                new DenseTensor<bool>(noteRest, new[] { 1, noteRest.Length })));

        var outputs = DiffSingerTensorCache.Run(model, v.ModelHash("pitch"), inputs, tensorCache);
        return outputs.First().AsTensor<float>().ToArray();
    }

    /// <summary>
    /// Builds the note-level inputs of the pitch model: head padding (rest), then every note with a
    /// rest entry inserted into each gap, then tail padding (rest).
    /// </summary>
    /// <remarks>
    /// <para>note_rest: a slur note inherits the rest flag of the entry before it; any other note is a
    /// rest unless one of its phonemes is a vowel other than "AP"/"SP".</para>
    /// <para>note_midi: rest entries take their tone from the nearest non-rest entry (all rests ⇒ 60).</para>
    /// <para>Overlap handling (an extension over OpenUtau, per the original author's note): when a note
    /// overlaps the next one, its effective end is clipped to the next note's start so that note_dur
    /// follows the same truncated timeline as the phonemizer/acoustic side. A note whose successor
    /// starts at or before its own start collapses to a duration of 0.</para>
    /// </remarks>
    /// <param name="v">Predictor used to classify phonemes as vowels.</param>
    /// <param name="phones">Body phonemes; each one names the note it belongs to via NoteIndex.</param>
    /// <param name="notes">Notes of the phrase; must not be empty.</param>
    /// <param name="renderStart">Render start in seconds; the head padding spans from here to the first note.</param>
    /// <param name="frameSec">Length of one frame in seconds.</param>
    /// <param name="totalFrames">Frame count the returned durations are fitted to.</param>
    /// <param name="head">Not read by this method; kept so the signature stays unchanged.</param>
    /// <param name="tail">Tail padding length in frames.</param>
    /// <returns>Per-entry tone, duration in frames, and rest flag.</returns>
    static (float[] midi, int[] durFrames, bool[] rest) BuildNotes(
        DiffSingerPredictor v, IReadOnlyList<PhonemeSpan> phones,
        IReadOnlyList<SynthesisNoteSnapshot> notes,
        double renderStart, double frameSec, int totalFrames, int head, int tail)
    {
        // Indices of notes that carry a real vowel, collected in one pass so the phoneme list
        // is not rescanned for every note.
        var voicedNotes = phones
            .Where(p => p.Symbol != "AP" && p.Symbol != "SP" && v.IsVowel(p.Symbol))
            .Select(p => p.NoteIndex)
            .ToHashSet();

        // Head + tail + one entry per note; gap rests may add more.
        var durSec = new List<double>(notes.Count + 2);
        var midiList = new List<float>(notes.Count + 2);
        var restList = new List<bool>(notes.Count + 2);

        // Head padding: everything before the first note starts.
        durSec.Add(Math.Max(0, notes[0].StartTime - renderStart));
        midiList.Add(notes[0].Pitch);
        restList.Add(true);

        double prevEnd = notes[0].StartTime;
        for (int i = 0; i < notes.Count; i++)
        {
            var note = notes[i];
            // Clip to the next note's start when the two overlap.
            double effectiveEnd = i + 1 < notes.Count
                ? Math.Min(note.EndTime, notes[i + 1].StartTime)
                : note.EndTime;
            double gap = note.StartTime - prevEnd;
            if (gap > 0)
            {
                // Silence between the previous note and this one becomes its own rest entry.
                durSec.Add(gap);
                midiList.Add(note.Pitch);
                restList.Add(true);
            }
            durSec.Add(Math.Max(0, effectiveEnd - note.StartTime));
            midiList.Add(note.Pitch);
            // A slur note has no phonemes of its own, so it copies the rest flag of the preceding entry;
            // otherwise the note is a rest exactly when none of its phonemes is a real vowel.
            if (DiffSingerPhonemizer.IsSlur(note.Lyric))
            {
                restList.Add(restList[^1]);
            }
            else
            {
                restList.Add(!voicedNotes.Contains(i));
            }
            prevEnd = effectiveEnd;
        }

        // Tail padding.
        durSec.Add(tail * frameSec);
        midiList.Add(notes[^1].Pitch);
        restList.Add(true);

        var midi = midiList.ToArray();
        var rest = restList.ToArray();
        FillRestTones(midi, rest);

        var durFrames = DiffSingerFrames.FitDurationSum(
            DiffSingerFrames.DurationsToFrames(durSec, frameSec), totalFrames);
        return (midi, durFrames, rest);
    }

    /// <summary>
    /// Overwrites the tone of every run of consecutive rest entries with the tone of the nearest
    /// non-rest entry, in place.
    /// </summary>
    /// <remarks>
    /// A leading run takes the tone that follows it, a trailing run takes the tone that precedes it,
    /// and a run in the middle is split at its midpoint between the two neighbours (the earlier half
    /// gets the extra entry when the run length is odd). If every entry is a rest, all tones become 60.
    /// </remarks>
    /// <param name="midi">Tones, modified in place.</param>
    /// <param name="rest">Rest flag for each entry of <paramref name="midi"/>.</param>
    static void FillRestTones(float[] midi, bool[] rest)
    {
        int count = rest.Length;
        if (Array.TrueForAll(rest, flag => flag))
        {
            Array.Fill(midi, 60f);
            return;
        }

        int cursor = 0;
        while (cursor < count)
        {
            if (!rest[cursor])
            {
                cursor++;
                continue;
            }

            // [runStart, runEnd) is a maximal run of rests.
            int runStart = cursor;
            int runEnd = runStart + 1;
            while (runEnd < count && rest[runEnd])
            {
                runEnd++;
            }
            cursor = runEnd;

            if (runStart == 0)
            {
                // Leading run: borrow the tone of the first non-rest entry.
                Array.Fill(midi, midi[runEnd], 0, runEnd);
            }
            else if (runEnd == count)
            {
                // Trailing run: borrow the tone of the last non-rest entry.
                Array.Fill(midi, midi[runStart - 1], runStart, count - runStart);
            }
            else
            {
                // Interior run: first part follows the previous tone, the remainder the next one.
                int split = (runStart + runEnd + 1) / 2;
                Array.Fill(midi, midi[runStart - 1], runStart, split - runStart);
                Array.Fill(midi, midi[runEnd], split, runEnd - split);
            }
        }
    }

    /// <summary>
    /// Appends the acceleration input the model expects, if it declares one: "steps" when the
    /// voicebank uses continuous acceleration, otherwise "speedup".
    /// </summary>
    /// <remarks>
    /// The speedup value starts at 1000 / steps (with steps treated as at least 1, and the result as
    /// at least 1) and is lowered until it divides 1000 evenly.
    /// </remarks>
    /// <param name="inputs">Input list to append to.</param>
    /// <param name="model">Session whose declared inputs decide whether anything is added.</param>
    /// <param name="cfg">Voicebank configuration providing UseContinuousAcceleration.</param>
    /// <param name="steps">Requested step count.</param>
    static void AddAccel(List<NamedOnnxValue> inputs, InferenceSession model, VoicebankConfig cfg, int steps)
    {
        // Adds a one-element int64 tensor under the given input name.
        void AddScalar(string name, long value)
        {
            var tensor = new DenseTensor<long>(new long[] { value }, new int[] { 1 });
            inputs.Add(NamedOnnxValue.CreateFromTensor(name, tensor));
        }

        if (cfg.UseContinuousAcceleration)
        {
            if (model.InputMetadata.ContainsKey("steps"))
            {
                AddScalar("steps", steps);
            }
            return;
        }

        if (!model.InputMetadata.ContainsKey("speedup"))
        {
            return;
        }

        long factor = Math.Max(1, 1000 / Math.Max(1, steps));
        while (factor > 1 && 1000 % factor != 0)
        {
            factor--;
        }
        AddScalar("speedup", factor);
    }

    /// <summary>
    /// Returns the text before the first '/' of a phoneme symbol, or an empty string when the
    /// symbol has no '/' or starts with one.
    /// </summary>
    /// <param name="phoneme">Phoneme symbol, e.g. one of the form "prefix/name".</param>
    static string PhonemeLanguage(string phoneme)
    {
        int separatorAt = phoneme.IndexOf('/');
        if (separatorAt <= 0)
        {
            return string.Empty;
        }

        return phoneme.Substring(0, separatorAt);
    }

    /// <summary>Wraps <paramref name="data"/> as a named int64 tensor of shape [1, n].</summary>
    /// <param name="name">Input name.</param>
    /// <param name="data">Tensor contents.</param>
    /// <param name="n">Size of the second dimension.</param>
    static NamedOnnxValue NvL(string name, long[] data, int n)
    {
        int[] shape = { 1, n };
        var tensor = new DenseTensor<long>(data, shape);
        return NamedOnnxValue.CreateFromTensor(name, tensor);
    }
    /// <summary>Wraps <paramref name="data"/> as a named float tensor of shape [1, n].</summary>
    /// <param name="name">Input name.</param>
    /// <param name="data">Tensor contents.</param>
    /// <param name="n">Size of the second dimension.</param>
    static NamedOnnxValue NvF(string name, float[] data, int n)
    {
        int[] shape = { 1, n };
        var tensor = new DenseTensor<float>(data, shape);
        return NamedOnnxValue.CreateFromTensor(name, tensor);
    }
}