forked from LiuYunPlayer/DiffSingerForTuneLab
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDiffSingerSynthesisSession.cs
More file actions
1060 lines (936 loc) · 47.2 KB
/
Copy pathDiffSingerSynthesisSession.cs
File metadata and controls
1060 lines (936 loc) · 47.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using TuneLab.Foundation;
using TuneLab.SDK;
using static DiffSingerForTuneLab.DiffSingerDeclarations;
namespace DiffSingerForTuneLab;
// 一条 part 的合成会话。
// 关键设计:
// · 区段式重渲染:修改某个音素时,仅以该音素为中心前后各扩展一个音素作为「重渲染区段」,
// 渲染后将新 mel 通过频谱过渡拼贴到原序列的 mel 谱上,避免整个序列被改变。
// · pitch 锁定:用户修改 pitch 曲线后,自动音高预测(dspitch)被锁定不再重新生成,
// 仅缓存的预测作为 NaN 自由区的回退。除非用户选择 Retake 或 RedrawPitch。
// · dur 忠于 UI:音素时间线由用户界面(note 时长 / 钉死音素)决定,首次渲染后只有显式请求才重新 phonemize。
public sealed class DiffSingerSynthesisSession : ISynthesisSession
{
readonly VoicebankConfig mConfig;
readonly ISynthesisContext mContext;
readonly string mVoiceId;
readonly DiffSingerModelCache mModelCache;
readonly int mSamplingSteps;
readonly bool mTensorCache; // Master switch for tensor caching (engine setting tensor_cache).
readonly int mCacheMaxSizeMb; // Cache size cap in MB; 0 = unlimited (engine setting cache_max_size_mb).
// Declaration-derived data reused at run time (fixed per session, computed once in the constructor from the voicebank config):
// the readback track set; SynthesizedParameters aggregates the per-piece curves by these keys.
readonly OrderedMap<PropertyKey, AutomationConfig> mReadbackConfigs;
readonly IDisposable mNotesSubscription;
readonly List<ILiveAutomation> mSubscribedAutomations = new(); // Fixed tracks whose RangeModified is subscribed (variance/gender/speed); unsubscribed in Dispose.
readonly Dictionary<string, ILiveAutomation> mMixSubscriptions = new(); // Subscribed speaker-mix tracks (dynamic set, keyed by mix:suffix, follows the part properties).
readonly Dictionary<ILiveNote, Action> mNoteHandlers = new();
readonly List<Piece> mPieces = new();
bool mNeedResegment;
// Cache-valid flag (Render sets it to true after a successful render).
bool mHasValidCache;
// Affected time range (from OnRangeModified or note edits); used when stitching new output into cached output.
double mAffectedStartTime = double.NaN;
double mAffectedEndTime = double.NaN;
// Awaited by SynthesizeNext before rendering and released afterwards.
// NOTE(review): declared without an initializer; it must be assigned in a constructor before SynthesizeNext runs.
readonly SemaphoreSlim mRenderSemaphore;
// Builds the session for one part: stores the engine settings, derives the readback declarations,
// creates the render gate, and subscribes to note, part-property, pitch and automation change notifications.
// Every subscription made here is undone in Dispose.
public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext context,
    string voiceId, DiffSingerModelCache modelCache, int samplingSteps, bool tensorCache, int cacheMaxSizeMb)
{
    mConfig = config;
    mContext = context;
    mVoiceId = voiceId;
    mModelCache = modelCache;
    mSamplingSteps = samplingSteps;
    mTensorCache = tensorCache;
    mCacheMaxSizeMb = cacheMaxSizeMb;

    // Binary semaphore: at most one render of this session runs at a time.
    // The field is readonly and has no initializer, so it has to be created here;
    // otherwise SynthesizeNext would dereference null when it awaits the semaphore.
    mRenderSemaphore = new SemaphoreSlim(1, 1);

    // Declaration-derived data is computed once from the voicebank config
    // (same DiffSingerDeclarations helpers the engine declares with: single source of truth).
    mReadbackConfigs = BuildReadbackConfigs(config);

    mNotesSubscription = NotifiableExtensions.WhenAny(context.Notes, SubscribeNote, UnsubscribeNote);
    context.Notes.ItemAdded += OnNotesStructureChanged;
    context.Notes.ItemRemoved += OnNotesStructureChanged;
    context.PartProperties.Modified += MarkAllDirtyAndResegment;
    context.Pitch.RangeModified += OnRangeModified;
    context.PitchDeviation.RangeModified += OnRangeModified;
    context.Committed += OnCommitted;

    // Fixed tracks (variance / gender / speed): per the original note, the host fills the voice's
    // automation configs before the session is created, so TryGetAutomation is expected to hit here.
    // These tracks do not depend on part properties, so subscribing once is enough.
    foreach (var key in BuildFixedAutomationConfigs(config).Keys)
    {
        if (context.TryGetAutomation(key.Id, out var automation))
        {
            automation.RangeModified += OnRangeModified;
            mSubscribedAutomations.Add(automation);
        }
    }

    // Speaker-mix tracks are a dynamic set (they follow the speaker_mix part property):
    // sync once now to cover a reopened project; later changes go through SyncMixSubscriptions.
    SyncMixSubscriptions();

    mNeedResegment = true;
}
// Lyric the session reports as its default.
public string DefaultLyric
{
    get { return "a"; }
}
// Scheduling: returns the time bounds of the first dirty piece inside the window, or null when there is none.
// This is a peek: it only reads piece state (apart from a pending re-segmentation done by FindNextDirtyPiece).
public SynthesisRange? GetNextSegment(double startTime, double endTime)
{
    var next = FindNextDirtyPiece(startTime, endTime);
    if (next is null)
        return null;
    return new SynthesisRange(next.StartTime, next.EndTime);
}
// Returns the first piece (in mPieces order) that is dirty, not failed, not currently synthesizing,
// and not entirely outside [startTime, endTime]; null when no piece qualifies.
// Runs a pending re-segmentation first so the piece list is current.
Piece? FindNextDirtyPiece(double startTime, double endTime)
{
    if (mNeedResegment)
    {
        Resegment();
    }

    return mPieces.FirstOrDefault(candidate =>
    {
        bool skip = !candidate.Dirty || candidate.Failed || candidate.Synthesizing;
        if (skip)
            return false;
        bool outsideWindow = candidate.EndTime < startTime || candidate.StartTime > endTime;
        return !outsideWindow;
    });
}
// Renders the first dirty piece that overlaps [startTime, endTime] and publishes its audio,
// phonemes and readback curves. Does nothing when no such piece exists.
// The render runs on a thread-pool worker against a frozen snapshot; renders of this session are
// serialized through mRenderSemaphore. Render failures are recorded on the piece (Failed / Error) and logged.
// Cancellation while waiting for the semaphore propagates as OperationCanceledException after the piece
// has been put back in the dirty queue; cancellation observed inside Render re-queues the piece silently.
public async Task SynthesizeNext(double startTime, double endTime, CancellationToken cancellation = default)
{
    if (FindNextDirtyPiece(startTime, endTime) is not { } piece)
        return;

    var snapshot = mContext.GetSnapshot(piece.Notes, piece.Notes[0].StartTime.Value, piece.Notes.Max(n => n.EndTime.Value));
    piece.Dirty = false;
    piece.Synthesizing = true;
    piece.Progress = 0;
    StatusChanged?.Invoke();
    var report = new Progress<double>(p => { piece.Progress = p; StatusChanged?.Invoke(); });

    // Tracks whether the semaphore is actually held so that `finally` never releases a slot it did not take.
    bool acquired = false;
    try
    {
        // The wait is inside the try: if it is cancelled, the piece flags are still restored below
        // instead of leaving the piece permanently marked as synthesizing.
        await mRenderSemaphore.WaitAsync(cancellation);
        acquired = true;

        // Offload: the worker only reads the frozen snapshot while running ONNX; models are loaded lazily
        // through the engine-level cache. After the render, the worker also enforces the cache size cap
        // (only when caching is on and a cap is configured; best effort).
        var rendered = await Task.Run(() =>
        {
            var result = Render(snapshot, piece.Notes, piece, report, cancellation);
            if (mTensorCache && mCacheMaxSizeMb > 0)
                DiffSingerTensorCache.EnforceSizeLimit(mCacheMaxSizeMb);
            return result;
        }, CancellationToken.None);

        if (rendered == null)
        {
            // Render returns null when it observes cancellation. Dirty was cleared above, so without
            // this the piece would never be picked up again by FindNextDirtyPiece.
            if (cancellation.IsCancellationRequested)
                piece.Dirty = true;
        }
        else if (mPieces.Contains(piece))
        {
            int rate = rendered.SampleRate;

            // Refine the transition range with the rendered phoneme boundaries so the crossfades
            // happen inside the neighbouring phonemes.
            double stitchStart = mAffectedStartTime;
            double stitchEnd = mAffectedEndTime;
            RefineStitchRange(rendered.Phonemes,
                ref stitchStart, ref stitchEnd, piece.FrameSec);

            // Range-based audio stitching: only the affected time range is replaced, the rest keeps the old audio.
            var stitchedAudio = StitchAudio(rendered.Audio, piece.CachedAudio,
                rendered.StartTime, rate, stitchStart, stitchEnd);

            // Keep the stitched audio for the next stitch.
            piece.CachedAudio = stitchedAudio;
            piece.Segment?.Dispose();
            piece.Segment = mContext.CreateAudioSegment((long)(rendered.StartTime * rate), stitchedAudio.Length, rate);
            piece.Segment.Write(0, stitchedAudio);
            piece.Segment.Commit();
            piece.Phonemes = rendered.Phonemes;

            // Readback curves are stitched by range too, so unmodified ranges keep the cached curve values.
            piece.PitchReadback = StitchPoints(rendered.PitchReadback, piece.CachedPitchReadback,
                mAffectedStartTime, mAffectedEndTime);
            piece.CachedPitchReadback = piece.PitchReadback;

            if (rendered.VarianceReadback.Count > 0)
            {
                var stitchedVar = new Dictionary<string, IReadOnlyList<Point>>();
                foreach (var kvp in rendered.VarianceReadback)
                {
                    piece.CachedVarianceReadback.TryGetValue(kvp.Key, out var oldVar);
                    stitchedVar[kvp.Key] = StitchPoints(kvp.Value, oldVar,
                        mAffectedStartTime, mAffectedEndTime);
                }
                piece.VarianceReadback = stitchedVar;
                piece.CachedVarianceReadback = stitchedVar;
            }
        }
    }
    catch (OperationCanceledException) when (cancellation.IsCancellationRequested)
    {
        // A cancelled wait is not a render failure: re-queue the piece and let the caller see the cancellation.
        piece.Dirty = true;
        throw;
    }
    catch (Exception ex)
    {
        piece.Failed = true;
        piece.Error = ex.Message;
        TuneLabContext.Global.GetLogger().Warning($"DiffSinger:合成失败 [{piece.StartTime:F2}s]:{ex}");
    }
    finally
    {
        if (acquired)
            mRenderSemaphore.Release();
        piece.Synthesizing = false;
        StatusChanged?.Invoke();
    }
}
// Audio stitching.
// Replaces the affected range of the old audio (widened by 3 hops on each side) with the new audio and
// crossfades linearly at both edges. Returns newAudio unchanged when there is no old audio (first render),
// when the affected range is unknown (NaN), when the two buffers differ in length, or when the range is empty.
//   newAudio / oldAudio : sample buffers that start at renderStartSec
//   renderStartSec      : time of sample 0, in seconds
//   sampleRate          : samples per second
//   affectedStart / End : affected range, in seconds
// NOTE(review): the hop size is fixed at 512 here, while Render reads the real value from the model; confirm they agree.
static float[] StitchAudio(float[] newAudio, float[]? oldAudio,
    double renderStartSec, int sampleRate,
    double affectedStart, double affectedEnd)
{
    if (oldAudio == null || oldAudio.Length == 0)
        return newAudio;
    if (double.IsNaN(affectedStart) || double.IsNaN(affectedEnd))
        return newAudio; // no explicit affected range: replace everything
    if (oldAudio.Length != newAudio.Length)
        return newAudio; // different lengths cannot be stitched

    const int hop = 512;
    const int fadeSamples = 3 * hop;
    int length = newAudio.Length;

    // Sample positions are computed and clamped as doubles before the cast: converting an out-of-range
    // double to int is unspecified in C#, which made a range reaching past the buffer (or an infinite bound)
    // collapse into "replace everything" instead of being clamped to the buffer.
    double rawStart = Math.Truncate((affectedStart - renderStartSec) * sampleRate) - fadeSamples;
    double rawEnd = Math.Truncate((affectedEnd - renderStartSec) * sampleRate) + fadeSamples;
    if (double.IsNaN(rawStart) || double.IsNaN(rawEnd))
        return newAudio;

    int startSample = (int)Math.Clamp(rawStart, 0, length);
    int endSample = (int)Math.Clamp(rawEnd, 0, length);
    if (startSample >= endSample)
        return newAudio;

    var result = (float[])oldAudio.Clone();

    // Take the new audio for the whole (widened) affected range.
    int copyLen = endSample - startSample;
    Array.Copy(newAudio, startSample, result, startSample, copyLen);

    // Crossfade both edges. fadeLen is at most half the range, so the two ramps never overlap and every
    // index below lies inside [startSample, endSample), which is inside the buffers.
    int fadeLen = Math.Min(fadeSamples, copyLen / 2);
    for (int i = 0; i < fadeLen; i++)
    {
        float t = (float)(i + 1) / (fadeLen + 1);
        int front = startSample + i;      // fades old -> new moving forward
        int back = endSample - 1 - i;     // fades old -> new moving backward
        result[front] = oldAudio[front] * (1 - t) + newAudio[front] * t;
        result[back] = oldAudio[back] * (1 - t) + newAudio[back] * t;
    }
    return result;
}
// —— Inference chain (worker thread, reads only the frozen snapshot) ——
// Result of one Render call: the waveform with its start time and sample rate, the produced phonemes,
// the pitch and variance readback curves, and the mel data together with its tensor shape.
sealed record RenderResultEx(float[] Audio, double StartTime, int SampleRate,
List<SynthesizedPhoneme> Phonemes, List<Point> PitchReadback,
Dictionary<string, IReadOnlyList<Point>> VarianceReadback,
int[] MelDims, float[] Mel);
// Runs the whole inference chain for one piece and returns the rendered result.
// Pipeline: phonemize (dsdur predictor, or one vowel per note as fallback) -> frame layout -> speaker mix
// -> pitch (user curve, falling back to the dspitch prediction or the note pitch) -> variance prediction
// -> acoustic model (mel) -> vocoder (waveform).
//   snapshot     : frozen data the render reads from
//   origins      : live notes, indexed by PhonemeSpan.NoteIndex to attach phonemes to their note
//   piece        : piece being rendered; receives FrameSec and has RedrawPitchRequested cleared
//   progress     : optional progress sink (0..1)
//   cancellation : checked between stages
// Returns null when there are no notes, no phonemes, or cancellation is observed at a checkpoint.
// Side effects: consumes a pending Retake render mode and sets mHasValidCache on success.
RenderResultEx? Render(SynthesisSnapshot snapshot, IReadOnlyList<ILiveNote> origins, Piece piece,
    IProgress<double>? progress, CancellationToken cancellation)
{
    var notes = snapshot.Notes;
    if (notes.Count == 0 || cancellation.IsCancellationRequested)
        return null;

    var models = mModelCache.GetOrLoad(mVoiceId, mConfig);
    int hop = models.HopSize, sr = models.SampleRate, hidden = models.HiddenSize;
    double frameSec = (double)hop / sr;
    int head = DiffSingerFrames.HeadFrames;
    piece.FrameSec = frameSec;

    string partLang = snapshot.PartProperties.GetString(KeyLanguage, string.Empty);
    string speaker = snapshot.PartProperties.GetString(KeySpeaker, mConfig.Speakers.Count > 0 ? mConfig.Speakers[0] : string.Empty);
    var noteLang = notes.Select(nt => nt.Properties.GetString(KeyLanguage, partLang)).ToArray();

    // Render mode: Retake is session-level and consumed once; RedrawPitch is a per-piece flag.
    bool isRetake = false;
    if (!mRenderModeConsumed && mRenderMode == RenderMode.Retake)
    {
        isRetake = true;
        mRenderModeConsumed = true;
        mRenderMode = RenderMode.Normal;
    }
    bool isRedrawPitch = piece.RedrawPitchRequested;
    piece.RedrawPitchRequested = false;
    bool pieceHasNoCache = piece.CachedPhones == null;
    bool needFullPredict = isRetake || !mHasValidCache || pieceHasNoCache;
    // NOTE(review): needFullPredict / needPitchPredict are computed but nothing below consults them,
    // so every render currently re-runs all predictors. Confirm whether prediction should be gated on them.
    bool needPitchPredict = needFullPredict || isRedrawPitch;

    // —— Phonemization: use the dsdur predictor when the voicebank has one ——
    var durPred = models.GetPredictor("dsdur");
    var phones = durPred != null
        ? DiffSingerPhonemizer.Phonemize(durPred, notes, noteLang, speaker, hop, sr, mTensorCache)
        : FallbackPhonemes(models, notes, noteLang); // no duration predictor: one vowel per note
    if (phones.Count == 0)
        return null;
    progress?.Report(0.2);
    if (cancellation.IsCancellationRequested)
        return null;

    // —— Frame layout ——
    var phoneDurSec = phones.Select(p => Math.Max(0, p.EndTime - p.StartTime)).ToArray();
    var durations = DiffSingerFrames.PaddedPhoneFrames(phoneDurSec, frameSec);
    int nTokens = durations.Length;
    int nFrames = durations.Sum();
    double renderStart = phones[0].StartTime - head * frameSec;

    // Per-frame times (frame centres) and the per-frame speaker mix shared by the acoustic, pitch and
    // variance stages. Every mix:<suffix> candidate is tried; snapshot.TryGetAutomation only succeeds for
    // the tracks present in the snapshot, so the others are skipped and take no part in the mix.
    var frameTimes = new double[nFrames];
    for (int f = 0; f < nFrames; f++) frameTimes[f] = renderStart + (f + 0.5) * frameSec;
    var mixTracks = new List<(string Suffix, double[] Sampled)>();
    foreach (var (key, suffix) in SpeakerMixTracks(mConfig))
        if (snapshot.TryGetAutomation(key, out var mixAuto))
            mixTracks.Add((suffix, mixAuto.Evaluator.Evaluate(frameTimes)));
    var speakerMix = DiffSingerSpeakerMix.Create(Suffix(speaker), mixTracks, nFrames);

    // Tokens / languages: first and last token are the SP padding, phonemes sit in between.
    var tokens = new long[nTokens];
    var langs = new long[nTokens];
    tokens[0] = AcousticToken(models, "SP");
    tokens[nTokens - 1] = AcousticToken(models, "SP");
    for (int i = 0; i < phones.Count; i++)
    {
        tokens[i + 1] = AcousticToken(models, phones[i].Symbol);
        langs[i + 1] = models.TryGetLanguage(PhonemeLang(phones[i].Symbol), out var lid) ? lid : 0;
    }

    // Per-frame note pitch fallback, looked up by note time range so that notes which produced
    // no phoneme (per the original comment: -/+ sustain notes) still contribute their pitch.
    var framePitch = new double[nFrames];
    {
        int ni = 0;
        for (int f = 0; f < nFrames; f++)
        {
            double t = frameTimes[f];
            // Advance to the last note whose start is not after this frame.
            while (ni < notes.Count - 1 && t >= notes[ni + 1].StartTime)
                ni++;
            framePitch[f] = notes[ni].Pitch;
        }
    }

    // —— dspitch pitch prediction ——
    // Where the user drew pitch (non-NaN) the user value is used; NaN frames use the prediction,
    // or the rectangular note pitch when no prediction is available. The deviation curve is added on top.
    var predictedPitch = DiffSingerPitch.Predict(
        models.GetPredictor("dspitch"), phones, notes, durations,
        renderStart, frameSec, speakerMix, mConfig, mSamplingSteps, mTensorCache);
    progress?.Report(0.28);
    if (cancellation.IsCancellationRequested)
        return null;

    // Per-frame f0 (Hz) and semitone curve.
    var pitchCurve = snapshot.Pitch.Evaluator.Evaluate(frameTimes);
    var deviation = snapshot.PitchDeviation.Evaluator.Evaluate(frameTimes);
    var f0 = new float[nFrames];
    var semis = new float[nFrames];
    var pitchReadback = new List<Point>(nFrames);

    // Frames of the leading and trailing SP tokens are left out of the pitch readback.
    int spHeadFrames = durations[0];   // frames of the head SP (token 0)
    int spTailFrames = durations[^1];  // frames of the tail SP (last token)
    for (int f = 0; f < nFrames; f++)
    {
        double fallback;
        if (predictedPitch != null)
            fallback = f < predictedPitch.Length ? predictedPitch[f] : predictedPitch[^1];
        else
            fallback = framePitch[f];
        double semitone = (double.IsNaN(pitchCurve[f]) ? fallback : pitchCurve[f]) + deviation[f];
        semis[f] = (float)semitone;
        f0[f] = DiffSingerFrames.ToneToFreq(semitone);

        bool isHeadOrTailSp = f < spHeadFrames || f >= nFrames - spTailFrames;
        if (!isHeadOrTailSp)
        {
            pitchReadback.Add(new Point(frameTimes[f], semitone));
        }
    }
    progress?.Report(0.3);
    if (cancellation.IsCancellationRequested)
        return null;

    // —— Variance prediction (baseline; combined with the user delta below) ——
    var varCurves = DiffSingerVariance.Predict(
        models.GetPredictor("dsvariance"), phones.Select(p => p.Symbol).ToList(),
        durations, semis, speakerMix, mConfig, mSamplingSteps, mTensorCache);
    progress?.Report(0.45);
    if (cancellation.IsCancellationRequested)
        return null;

    // —— Acoustic inputs: an input is only added when the model declares it ——
    var ac = models.Acoustic;
    var inputs = new List<NamedOnnxValue>();
    void AddL(string name, long[] data, int[] dims)
    { if (ac.InputMetadata.ContainsKey(name)) inputs.Add(NamedOnnxValue.CreateFromTensor(name, new DenseTensor<long>(data, dims))); }
    void AddF(string name, float[] data, int[] dims)
    { if (ac.InputMetadata.ContainsKey(name)) inputs.Add(NamedOnnxValue.CreateFromTensor(name, new DenseTensor<float>(data, dims))); }
    AddL("tokens", tokens, new[] { 1, nTokens });
    AddL("languages", langs, new[] { 1, nTokens });
    AddL("durations", durations.Select(x => (long)x).ToArray(), new[] { 1, nTokens });
    AddF("f0", f0, new[] { 1, nFrames });

    // —— Variance: prediction combined with the user delta is what the acoustic model receives ——
    // The readback shows that same combined value.
    var varReadback = new Dictionary<string, IReadOnlyList<Point>>();
    foreach (var spec in Variances)
    {
        float[]? predicted = varCurves[spec.Key];
        double[]? user = snapshot.TryGetAutomation(spec.Key, out var auto)
            ? auto.Evaluator.Evaluate(frameTimes)
            : null;
        float[] combined = CombineVariance(spec, predicted, user, nFrames);
        AddF(spec.Key, combined, new[] { 1, nFrames }); // AddF already skips inputs the model does not declare
        if (spec.Use(mConfig) && spec.Predict(mConfig) && predicted != null)
            varReadback[spec.Key] = BuildReadbackSegment(spec, combined, frameTimes, nFrames, spHeadFrames, spTailFrames);
    }

    // —— gender / velocity ——
    AddF("gender", BuildCurveInput(snapshot, KeyGender, GenderBaseline, GenderConvert(), frameTimes, nFrames), new[] { 1, nFrames });
    AddF("velocity", BuildCurveInput(snapshot, KeySpeed, SpeedBaseline, SpeedConvert, frameTimes, nFrames), new[] { 1, nFrames });
    if (ac.InputMetadata.ContainsKey("spk_embed"))
    {
        var spk = speakerMix.ToEmbedding(models.GetSpeakerEmbeddingBySuffix, hidden);
        inputs.Add(NamedOnnxValue.CreateFromTensor("spk_embed", new DenseTensor<float>(spk, new[] { 1, nFrames, hidden })));
    }
    if (mConfig.UseContinuousAcceleration)
    {
        if (ac.InputMetadata.ContainsKey("depth") && mConfig.UseVariableDepth)
            inputs.Add(NamedOnnxValue.CreateFromTensor("depth", new DenseTensor<float>(new[] { (float)models.MaxDepth }, new[] { 1 })));
        if (ac.InputMetadata.ContainsKey("steps"))
            inputs.Add(NamedOnnxValue.CreateFromTensor("steps", new DenseTensor<long>(new[] { (long)mSamplingSteps }, new[] { 1 })));
    }
    else if (ac.InputMetadata.ContainsKey("speedup"))
    {
        // Largest divisor of 1000 that does not exceed 1000 / steps.
        long speedup = Math.Max(1, 1000 / Math.Max(1, mSamplingSteps));
        while (1000 % speedup != 0 && speedup > 1) speedup--;
        inputs.Add(NamedOnnxValue.CreateFromTensor("speedup", new DenseTensor<long>(new[] { speedup }, new[] { 1 })));
    }

    var melOut = DiffSingerTensorCache.Run(ac, models.AcousticHash, inputs, mTensorCache);
    var mel = melOut.First(v => v.Name == "mel").AsTensor<float>();
    // Materialize the acoustic output once: its shape and data feed the vocoder tensor below
    // and are returned in the result.
    int[] melDims = mel.Dimensions.ToArray();
    float[] finalMel = mel.ToArray();
    progress?.Report(0.75);
    if (cancellation.IsCancellationRequested)
        return null;

    // —— Vocoder (the tensor is created with the original mel shape) ——
    var voc = models.Vocoder;
    var melShape = new int[melDims.Length];
    Array.Copy(melDims, melShape, melDims.Length);
    var vInputs = new List<NamedOnnxValue>
    {
        NamedOnnxValue.CreateFromTensor("mel", new DenseTensor<float>(finalMel, melShape))
    };
    if (voc.InputMetadata.ContainsKey("f0"))
        vInputs.Add(NamedOnnxValue.CreateFromTensor("f0", new DenseTensor<float>(f0, new[] { 1, nFrames })));
    var wavOut = DiffSingerTensorCache.Run(voc, models.VocoderHash, vInputs, mTensorCache);
    var audio = wavOut.First(v => v.Name == "waveform").AsTensor<float>().ToArray();
    progress?.Report(1.0);

    // —— Phoneme output ——
    var phonemes = phones.Select(p => new SynthesizedPhoneme
    {
        Symbol = p.Symbol,
        StartTime = p.StartTime,
        EndTime = p.EndTime,
        Note = origins[p.NoteIndex],
        StretchWeight = p.IsVowel ? 1 : 0,
    }).ToList();

    // Mark the cache as valid (stays set after the first successful render).
    mHasValidCache = true;
    return new RenderResultEx(audio, renderStart, sr, phonemes, pitchReadback, varReadback,
        melDims, finalMel);
}
// Refines the stitch range with the rendered phoneme boundaries.
// Example from the original design note: editing C in [A B C D] re-renders [B C D], and the transitions
// are placed inside B and D.
// Front edge: start of the phoneme before the one containing stitchStart, plus a 3-frame margin.
// Back edge : end of the phoneme after the one containing stitchEnd, minus a 3-frame margin.
// When the boundary phoneme is too short for the margin, half the margin is used instead.
// Both values are left untouched when they are NaN on entry or when no phoneme contains them;
// both are set to NaN when the refined range is empty.
static void RefineStitchRange(IReadOnlyList<SynthesizedPhoneme> newPhonemes,
    ref double stitchStart, ref double stitchEnd, double frameSec)
{
    if (double.IsNaN(stitchStart) || double.IsNaN(stitchEnd))
        return;

    double margin = 3 * frameSec;

    // Front edge: first phoneme whose [StartTime, EndTime] contains stitchStart.
    int startIdx = -1;
    for (int i = 0; i < newPhonemes.Count; i++)
    {
        if (newPhonemes[i].StartTime <= stitchStart && newPhonemes[i].EndTime >= stitchStart)
        {
            startIdx = i;
            break;
        }
    }
    if (startIdx >= 0)
    {
        // The boundary phoneme is the previous one; the first phoneme has no predecessor and is used itself.
        var boundary = newPhonemes[Math.Max(startIdx - 1, 0)];
        stitchStart = boundary.StartTime + margin;
        // Stay inside the boundary phoneme.
        if (stitchStart > boundary.EndTime - margin)
            stitchStart = boundary.StartTime + margin * 0.5;
    }

    // Back edge: last phoneme whose [StartTime, EndTime] contains stitchEnd.
    int endIdx = -1;
    for (int i = newPhonemes.Count - 1; i >= 0; i--)
    {
        if (newPhonemes[i].StartTime <= stitchEnd && newPhonemes[i].EndTime >= stitchEnd)
        {
            endIdx = i;
            break;
        }
    }
    if (endIdx >= 0)
    {
        // The boundary phoneme is the next one; the last phoneme has no successor and is used itself.
        var boundary = newPhonemes[Math.Min(endIdx + 1, newPhonemes.Count - 1)];
        stitchEnd = boundary.EndTime - margin;
        if (stitchEnd < boundary.StartTime + margin)
            stitchEnd = boundary.EndTime - margin * 0.5;
    }

    if (stitchStart >= stitchEnd)
    {
        stitchStart = double.NaN;
        stitchEnd = double.NaN;
    }
}
// Range-based mel stitching.
// Copies the frames of newMel that fall inside the affected time range into a copy of oldMel and blends
// 3 frames linearly at each edge. newMel is returned unchanged when oldMel is missing or has a different
// length, when the range is NaN, when the range lies outside the frame times, or when the frame range is empty.
// Both mel buffers are indexed as frame * numMelBins + bin.
static float[] StitchMel(float[] newMel, float[]? oldMel, double[] frameTimes,
    int nFrames, int numMelBins, double affectedStart, double affectedEnd)
{
    if (oldMel is null || oldMel.Length != newMel.Length)
        return newMel;
    if (double.IsNaN(affectedStart) || double.IsNaN(affectedEnd))
        return newMel;
    // Affected range entirely outside the rendered frames.
    if (affectedEnd <= frameTimes[0] || affectedStart >= frameTimes[^1])
        return newMel;

    const int Fade = 3;

    // Lowest and highest frame whose time lies inside the affected range.
    int lowest = nFrames - 1;
    int highest = 0;
    for (int frame = 0; frame < nFrames; frame++)
    {
        double time = frameTimes[frame];
        bool inside = time >= affectedStart && time <= affectedEnd;
        if (!inside)
            continue;
        lowest = Math.Min(lowest, frame);
        highest = Math.Max(highest, frame);
    }

    // Widen by the fade length on both sides, clamped to the frame range.
    int first = Math.Max(0, lowest - Fade);
    int last = Math.Min(nFrames - 1, highest + Fade);
    if (first >= last)
        return newMel;

    var stitched = (float[])oldMel.Clone();

    // Core of the range: new mel as-is.
    for (int frame = first + Fade; frame <= last - Fade; frame++)
    {
        int offset = frame * numMelBins;
        for (int bin = 0; bin < numMelBins; bin++)
            stitched[offset + bin] = newMel[offset + bin];
    }

    // Front ramp, then back ramp. The back ramp is applied second so it wins where the two overlap.
    for (int step = 0; step < Fade; step++)
    {
        if (first + step >= nFrames)
            break;
        Blend(first + step, (float)(step + 1) / (Fade + 1));
    }
    for (int step = 0; step < Fade; step++)
    {
        if (last - step < 0)
            break;
        Blend(last - step, (float)(step + 1) / (Fade + 1));
    }
    return stitched;

    // Writes old * (1 - t) + new * t for every bin of one frame.
    void Blend(int frame, float t)
    {
        for (int bin = 0; bin < numMelBins; bin++)
        {
            int idx = frame * numMelBins + bin;
            stitched[idx] = oldMel[idx] * (1 - t) + newMel[idx] * t;
        }
    }
}
// Point-list stitching.
// Replaces the points of oldPoints that lie inside [affectedStart, affectedEnd] with the points of
// newPoints in that range, blending linearly over up to 3 points at each edge so the curve has no step.
// newPoints is returned unchanged when there is no old list, the range is NaN, the two lists differ in
// length or in their first X (by more than 0.001), or the old list has no point in the range.
// oldPoints is returned unchanged when the new list is empty or has no point in the range.
static IReadOnlyList<Point> StitchPoints(IReadOnlyList<Point> newPoints, IReadOnlyList<Point>? oldPoints,
    double affectedStart, double affectedEnd)
{
    if (oldPoints == null || oldPoints.Count == 0)
        return newPoints;
    if (double.IsNaN(affectedStart) || double.IsNaN(affectedEnd))
        return newPoints;
    if (newPoints.Count == 0)
        return oldPoints;
    if (newPoints.Count != oldPoints.Count)
        return newPoints;
    if (Math.Abs(newPoints[0].X - oldPoints[0].X) > 0.001)
        return newPoints;

    // [oldStart, oldEnd) and [newStart, newEnd): index ranges of the points inside the affected range.
    int oldStart = 0, oldEnd = oldPoints.Count;
    while (oldStart < oldPoints.Count && oldPoints[oldStart].X < affectedStart) oldStart++;
    while (oldEnd > 0 && oldPoints[oldEnd - 1].X > affectedEnd) oldEnd--;
    if (oldStart >= oldEnd)
        return newPoints;

    int newStart = 0, newEnd = newPoints.Count;
    while (newStart < newPoints.Count && newPoints[newStart].X < affectedStart) newStart++;
    while (newEnd > 0 && newPoints[newEnd - 1].X > affectedEnd) newEnd--;
    if (newStart >= newEnd)
        return oldPoints;

    const int fadeCount = 3;
    int span = newEnd - newStart;
    // Each ramp takes at most half of the range. With a fixed 3-point ramp on both sides, a range shorter
    // than 6 points had its ramps overlap and the same X was emitted twice.
    int fade = Math.Min(fadeCount, span / 2);
    int lastOld = oldPoints.Count - 1;

    var result = new List<Point>(oldStart + span + (oldPoints.Count - oldEnd));

    // Old points before the range.
    for (int i = 0; i < oldStart; i++) result.Add(oldPoints[i]);

    // Front ramp: old -> new.
    for (int i = 0; i < fade; i++)
    {
        float t = (float)(i + 1) / (fade + 1);
        var fresh = newPoints[newStart + i];
        // The old index is clamped: the two ranges can differ by a point when the X grids are slightly offset.
        var stale = oldPoints[Math.Min(oldStart + i, lastOld)];
        result.Add(new Point(fresh.X, stale.Y * (1 - t) + fresh.Y * t));
    }

    // Core of the range: new points as-is.
    for (int i = newStart + fade; i < newEnd - fade; i++)
        result.Add(newPoints[i]);

    // Back ramp: new -> old, emitted in increasing time order.
    for (int i = newEnd - fade; i < newEnd; i++)
    {
        float t = (float)(newEnd - i) / (fade + 1);
        var fresh = newPoints[i];
        // Clamped rather than skipped, so no point of the range is dropped from the result.
        var stale = oldPoints[Math.Clamp(oldEnd - (newEnd - i), 0, lastOld)];
        result.Add(new Point(fresh.X, stale.Y * (1 - t) + fresh.Y * t));
    }

    // Old points after the range.
    for (int i = oldEnd; i < oldPoints.Count; i++) result.Add(oldPoints[i]);
    return result;
}
// Fallback used when there is no duration predictor: one vowel phoneme per note, spanning the note's
// whole duration. The symbol comes from PickVowelSymbol for the note's language.
static List<PhonemeSpan> FallbackPhonemes(VoiceModels models, IReadOnlyList<SynthesisNoteSnapshot> notes, string[] noteLang)
{
    var spans = new List<PhonemeSpan>(notes.Count);
    int noteIndex = 0;
    foreach (var note in notes)
    {
        string vowel = PickVowelSymbol(models, noteLang[noteIndex]);
        spans.Add(new PhonemeSpan(vowel, note.StartTime, note.EndTime, noteIndex, true));
        noteIndex++;
    }
    return spans;
}
// Looks up the token id of a phoneme symbol; unknown symbols map to 0.
static long AcousticToken(VoiceModels models, string symbol)
{
    if (models.TryGetPhoneme(symbol, out var id))
        return id;
    return 0;
}
// Extracts the language prefix of a phoneme symbol: the text before the first '/'.
// Returns an empty string when there is no '/' or when the symbol starts with it.
static string PhonemeLang(string phoneme)
{
    int separator = phoneme.IndexOf('/');
    if (separator <= 0)
        return string.Empty;
    return phoneme.Substring(0, separator);
}
// Combines the predicted curve with the user curve, frame by frame, through spec.Delta and clamps the
// result to [spec.AcousticMin, spec.AcousticMax].
//   predicted : prediction per frame; null means "no prediction" and 0 is used as the baseline.
//               When shorter than n, its last value is repeated.
//   user      : user curve per frame; null or NaN frames use spec.Neutral.
static float[] CombineVariance(VarianceSpec spec, float[]? predicted, double[]? user, int n)
{
    var combined = new float[n];
    for (int frame = 0; frame < n; frame++)
    {
        float baseline;
        if (predicted == null)
            baseline = 0f;
        else if (frame < predicted.Length)
            baseline = predicted[frame];
        else
            baseline = predicted[^1];

        double userValue = spec.Neutral;
        if (user != null && !double.IsNaN(user[frame]))
            userValue = user[frame];

        combined[frame] = (float)Math.Clamp(spec.Delta(baseline, (float)userValue), spec.AcousticMin, spec.AcousticMax);
    }
    return combined;
}
// Turns a user curve into a per-frame acoustic input: the track named by key is sampled at frameTimes,
// frames without a value (no such track, or NaN) use the neutral value, and every frame goes through convert.
// The result is not clamped.
static float[] BuildCurveInput(SynthesisSnapshot snapshot, string key, double neutral,
    Func<double, double> convert, double[] frameTimes, int n)
{
    double[]? sampled = null;
    if (snapshot.TryGetAutomation(key, out var auto))
    {
        sampled = auto.Evaluator.Evaluate(frameTimes);
    }

    var values = new float[n];
    for (int frame = 0; frame < n; frame++)
    {
        double raw;
        if (sampled != null && !double.IsNaN(sampled[frame]))
            raw = sampled[frame];
        else
            raw = neutral;
        values[frame] = (float)convert(raw);
    }
    return values;
}
// Gender curve conversion (the original comment attributes the formula to OpenUtau's DiffSingerRenderer).
// Returns a function mapping a UI value x to -x * scale, where the scale is 12 / KeyShiftMax / 100 for
// negative x and -12 / KeyShiftMin / 100 otherwise. A bound of 0 gives scale 0 (no shift in that direction).
// The scales are computed once per call from the current voicebank config.
Func<double, double> GenderConvert()
{
    // 12.0 forces floating-point division whatever the declared type of KeyShiftMax / KeyShiftMin is
    // (not visible here): with integral operands, 12 / range / 100 would truncate to 0.
    double posScale = mConfig.KeyShiftMax == 0 ? 0 : 12.0 / mConfig.KeyShiftMax / 100;
    double negScale = mConfig.KeyShiftMin == 0 ? 0 : -12.0 / mConfig.KeyShiftMin / 100;
    return x => x < 0 ? -x * posScale : -x * negScale;
}
// Velocity curve conversion (the original comment attributes the formula to OpenUtau's DiffSingerRenderer):
// logarithmic scale where 100 maps to 1 and every +100 doubles the result.
static double SpeedConvert(double x)
{
    double doublings = (x - 100) / 100;
    return Math.Pow(2, doublings);
}
// Builds a readback curve: one (frame time, value) point per frame, clamped to
// [spec.AcousticMin, spec.AcousticMax]. finalValues shorter than n repeats its last value.
// Inside the leading headSpFrames frames the value ramps linearly from spec.AcousticMin up to the real
// value; inside the trailing tailSpFrames frames it ramps back down to spec.AcousticMin.
static List<Point> BuildReadbackSegment(VarianceSpec spec, float[] finalValues, double[] frameTimes, int n,
    int headSpFrames = 0, int tailSpFrames = 0)
{
    float floor = (float)spec.AcousticMin; // value the ramps fade towards
    int tailBegin = n - tailSpFrames;
    var curve = new List<Point>(n);

    for (int frame = 0; frame < n; frame++)
    {
        float value = frame < finalValues.Length ? finalValues[frame] : finalValues[^1];

        // Leading ramp: weight reaches 1 on the last head frame.
        if (frame < headSpFrames)
            value = Mix(value, (float)(frame + 1) / headSpFrames);

        // Trailing ramp: weight is 1 on the first tail frame and shrinks towards the end.
        if (frame >= tailBegin)
            value = Mix(value, (float)(n - frame) / tailSpFrames);

        curve.Add(new Point(frameTimes[frame], Math.Clamp(value, spec.AcousticMin, spec.AcousticMax)));
    }
    return curve;

    // Weighted mix of the real value (weight t) and the floor (weight 1 - t).
    float Mix(float x, float t) => x * t + floor * (1 - t);
}
// Single-phoneme fallback: prefers "<lang>/a" (or plain "a" when lang is empty), then plain "a",
// and finally "SP" when the voicebank knows neither.
static string PickVowelSymbol(VoiceModels models, string lang)
{
    string preferred = string.IsNullOrEmpty(lang) ? "a" : $"{lang}/a";
    foreach (string candidate in new[] { preferred, "a" })
    {
        if (models.TryGetPhoneme(candidate, out _))
            return candidate;
    }
    return "SP";
}
// —— Outputs ——
// Pitch readback curves, one per piece, skipping pieces whose curve is empty. A new list is built on every read.
public IReadOnlyList<IReadOnlyList<Point>> SynthesizedPitch
{
    get
    {
        var curves = new List<IReadOnlyList<Point>>();
        foreach (var piece in mPieces)
        {
            if (piece.PitchReadback.Count > 0)
                curves.Add(piece.PitchReadback);
        }
        return curves;
    }
}
// Readback output: for every declared readback track, collects the non-empty curve each piece holds under
// that track's id. Every piece contributes its whole curve as one segment. Tracks for which no piece has
// a curve are left out of the map. A new map is built on every read.
public IReadOnlyMap<string, SynthesizedParameter> SynthesizedParameters
{
    get
    {
        var map = new Map<string, SynthesizedParameter>();
        foreach (var kvp in mReadbackConfigs)
        {
            var segments = new List<IReadOnlyList<Point>>();
            foreach (var piece in mPieces)
            {
                if (piece.VarianceReadback.TryGetValue(kvp.Key.Id, out var segment) && segment.Count > 0)
                    segments.Add(segment);
            }
            if (segments.Count > 0)
                map.Add(kvp.Key.Id, new SynthesizedParameter { Segments = segments });
        }
        return map;
    }
}
// All phonemes of all pieces, concatenated in piece order. A new list is built on every read.
public IReadOnlyList<SynthesizedPhoneme> Phonemes
{
    get
    {
        var all = new List<SynthesizedPhoneme>();
        foreach (var piece in mPieces)
        {
            foreach (var phoneme in piece.Phonemes)
                all.Add(phoneme);
        }
        return all;
    }
}
// Reports one status entry per piece. Precedence: Failed, then Synthesizing, then Pending (dirty or no
// audio segment yet), otherwise Synthesized. The message carries the error text for failed pieces and a
// translated "Synthesizing" label for pieces in progress; progress is only reported while synthesizing.
public IReadOnlyList<SynthesisStatusSegment> GetStatus()
{
    return mPieces.Select(piece =>
    {
        SynthesisSegmentStatus status;
        if (piece.Failed)
            status = SynthesisSegmentStatus.Failed;
        else if (piece.Synthesizing)
            status = SynthesisSegmentStatus.Synthesizing;
        else if (piece.Dirty || piece.Segment == null)
            status = SynthesisSegmentStatus.Pending;
        else
            status = SynthesisSegmentStatus.Synthesized;

        return new SynthesisStatusSegment
        {
            StartTime = piece.StartTime,
            EndTime = piece.EndTime,
            Status = status,
            Message = piece.Failed ? piece.Error : piece.Synthesizing ? L.Tr("Synthesizing") : null,
            Progress = piece.Synthesizing ? piece.Progress : 0,
        };
    }).ToList();
}
// Parameterless notification event. In the code visible here it is raised at the end of Resegment,
// after the piece list has been rebuilt. Presumably it is also raised on other status transitions
// elsewhere in the file — confirm before relying on that.
public event Action? StatusChanged;
// Tears the session down: disposes the notes subscription, detaches every handler this session
// attached to the context and to automation tracks, then disposes and drops all pieces.
// NOTE(review): the per-note handlers installed by SubscribeNote are not detached explicitly here;
// presumably disposing mNotesSubscription takes care of that — confirm.
public void Dispose()
{
    mNotesSubscription.Dispose();

    // Context-level handlers.
    mContext.Notes.ItemAdded -= OnNotesStructureChanged;
    mContext.Notes.ItemRemoved -= OnNotesStructureChanged;
    mContext.PartProperties.Modified -= MarkAllDirtyAndResegment;
    mContext.Pitch.RangeModified -= OnRangeModified;
    mContext.PitchDeviation.RangeModified -= OnRangeModified;

    // Automation tracks, then the speaker-mix tracks.
    foreach (var track in mSubscribedAutomations)
    {
        track.RangeModified -= OnRangeModified;
    }
    foreach (var mixTrack in mMixSubscriptions.Values)
    {
        mixTrack.RangeModified -= OnRangeModified;
    }

    mContext.Committed -= OnCommitted;

    // Release whatever each piece still holds, then forget the pieces.
    foreach (var piece in mPieces)
    {
        if (piece.Segment is { } segment)
        {
            segment.Dispose();
        }
    }
    mPieces.Clear();

    // Model sessions are owned by the engine-level cache and shared across sessions, so they are
    // not released here (the engine's Destroy releases them all).
}
// —— Segmentation (data thread; split at gaps between notes; a piece whose note set is unchanged
//    keeps its cache and state) — see §5.9, overlap pitfalls ——
// Rebuilds mPieces from the current notes:
//   1. Walk the notes in context order and start a new group whenever a note starts strictly after
//      the furthest end time reached by the current group; otherwise the note joins that group.
//   2. For each group, reuse the existing piece holding exactly the same note sequence (only its
//      time span is refreshed); otherwise create a fresh piece marked dirty.
//   3. Dispose the segments of pieces that were not reused, swap in the new list and raise StatusChanged.
void Resegment()
{
    mNeedResegment = false;

    var clusters = new List<List<ILiveNote>>();
    List<ILiveNote>? open = null;
    double reach = 0;
    foreach (var note in mContext.Notes)
    {
        // Deliberately a negated '>' rather than '<=': the two differ when a time is NaN.
        if (open != null && !(note.StartTime.Value > reach))
        {
            reach = Math.Max(reach, note.EndTime.Value);
        }
        else
        {
            open = new List<ILiveNote>();
            clusters.Add(open);
            reach = note.EndTime.Value;
        }
        open.Add(note);
    }

    var rebuilt = new List<Piece>(clusters.Count);
    foreach (var members in clusters)
    {
        double end = members.Max(member => member.EndTime.Value);

        if (mPieces.FirstOrDefault(candidate => candidate.Notes.SequenceEqual(members)) is { } kept)
        {
            // Same notes as before: take the piece out of the old list so it survives the cleanup below.
            mPieces.Remove(kept);
            kept.StartTime = members[0].StartTime.Value;
            kept.EndTime = end;
            rebuilt.Add(kept);
            continue;
        }

        rebuilt.Add(new Piece
        {
            Notes = members,
            StartTime = members[0].StartTime.Value,
            EndTime = end,
            Dirty = true,
        });
    }

    // Whatever is still in mPieces was not reused.
    foreach (var stale in mPieces)
    {
        stale.Segment?.Dispose();
    }
    mPieces.Clear();
    mPieces.AddRange(rebuilt);

    StatusChanged?.Invoke();
}
// Bundles the delegates SubscribeNote attaches to one note so that UnsubscribeNote can detach the
// very same instances. OnDur is attached to StartTime, EndTime and Phonemes; OnPitch, OnLyric and
// OnProps are attached to Pitch, Lyric and Properties respectively.
sealed record NoteHandlers(Action OnDur, Action OnPitch, Action OnLyric, Action OnProps);
// Attaches change handlers to a note and records them in mNoteHandlers so UnsubscribeNote can
// detach the same delegate instances later. Every handler first stores the note's current time span
// as the affected range and marks the piece containing the note dirty; they differ only in which
// caches are dropped and whether a resegment is requested:
//   start / end / phonemes, properties -> drop phones + variance, request resegment
//   lyric                              -> drop phones + pitch + variance, request resegment
//   pitch                              -> keep every cache, no resegment
// NOTE(review): a note-pitch edit leaves the cached pitch prediction and variance curves untouched —
// confirm this is intended.
void SubscribeNote(ILiveNote note)
{
    Action MakeHandler(bool clearPhones, bool clearPitch, bool clearVariance, bool resegment) => () =>
    {
        SetAffectedRange(note.StartTime.Value, note.EndTime.Value);
        MarkPieceDirty(note, clearPhones, clearPitch, clearVariance);
        if (resegment)
        {
            mNeedResegment = true;
        }
    };

    var onDur = MakeHandler(clearPhones: true, clearPitch: false, clearVariance: true, resegment: true);
    var onPitch = MakeHandler(clearPhones: false, clearPitch: false, clearVariance: false, resegment: false);
    var onLyric = MakeHandler(clearPhones: true, clearPitch: true, clearVariance: true, resegment: true);
    var onProps = MakeHandler(clearPhones: true, clearPitch: false, clearVariance: true, resegment: true);

    note.StartTime.Modified += onDur;
    note.EndTime.Modified += onDur;
    note.Phonemes.Modified += onDur;
    note.Pitch.Modified += onPitch;
    note.Lyric.Modified += onLyric;
    note.Properties.Modified += onProps;

    mNoteHandlers[note] = new NoteHandlers(onDur, onPitch, onLyric, onProps);
}
// Marks the first piece (in mPieces order) that contains the given note as dirty, clears its failed
// flag, and drops the caches selected by the flags. Does nothing when no piece contains the note.
void MarkPieceDirty(ILiveNote note, bool clearPhones, bool clearPitch, bool clearVariance)
{
    var owner = mPieces.FirstOrDefault(candidate => candidate.Notes.Contains(note));
    if (owner == null)
    {
        return;
    }

    owner.Dirty = true;
    owner.Failed = false;

    if (clearPhones)
    {
        owner.CachedPhones = null;
    }
    if (clearPitch)
    {
        owner.CachedPitchPrediction = null;
    }
    if (clearVariance)
    {
        owner.CachedVarianceCurves = default;
    }
}
// Records [start, end] widened by a fixed 0.1 margin on both sides as the affected range.
// The previously stored range is overwritten, not merged.
// NOTE(review): the margin is presumably in the same unit as the note times (seconds?) — confirm.
void SetAffectedRange(double start, double end)
{
    const double margin = 0.1;

    mAffectedStartTime = start - margin;
    mAffectedEndTime = end + margin;
}
// Detaches the handlers SubscribeNote attached to this note and forgets them.
// Does nothing when no handlers are recorded for the note.
void UnsubscribeNote(ILiveNote note)
{
    if (!mNoteHandlers.Remove(note, out var handlers))
    {
        return;
    }

    // The duration handler is shared by three sources.
    note.StartTime.Modified -= handlers.OnDur;
    note.EndTime.Modified -= handlers.OnDur;
    note.Phonemes.Modified -= handlers.OnDur;

    note.Pitch.Modified -= handlers.OnPitch;
    note.Lyric.Modified -= handlers.OnLyric;
    note.Properties.Modified -= handlers.OnProps;
}
// Handler for notes being added to or removed from the context: only flags that the pieces must be
// rebuilt; the note argument itself is not used.
void OnNotesStructureChanged(ILiveNote note) => mNeedResegment = true;
// Handler for part-property changes: invalidates every piece, requests a resegment and refreshes
// the speaker-mix track subscriptions.
void MarkAllDirtyAndResegment()
{
    foreach (var piece in mPieces)
    {
        Invalidate(piece);
    }

    mNeedResegment = true;

    // A part-property change may have added or removed speaker-mix tracks: subscribe to the newly
    // appeared ones and unsubscribe from the vanished ones, so that later curve edits (RangeModified)
    // can mark pieces dirty.
    // Ordering is safe: the host's OnPartPropertiesModified (subscribed when the part is constructed)
    // runs before this session's handler (subscribed when the session is constructed). By then it has
    // already run RebuildAutomationConfigs and filled Voice.AutomationConfigs, so TryGetAutomation
    // hits for every selected track at this point.
    SyncMixSubscriptions();

    static void Invalidate(Piece piece)
    {
        piece.Dirty = true;
        piece.Failed = false;

        // Part properties (language etc.) changed: drop the phoneme and variance caches so that
        // G2P is run again with the new settings.
        piece.CachedPhones = null;
        piece.CachedVarianceCurves = default;

        // The pitch cache is kept on purpose.
    }
}
// 同步说话人混合轨订阅到当前 part 属性已选集:遍历全量去重 speaker 表(无需枚举 part 属性,live 视图也不支持),
// 逐个 TryGetAutomation——命中(= 已声明 = 已选)且未订则订、不命中且已订则退。幂等,可反复调。
void SyncMixSubscriptions()
{
foreach (var (key, _) in SpeakerMixTracks(mConfig)) // key = mix:<suffix>
{
bool live = mContext.TryGetAutomation(key, out var automation);
bool subscribed = mMixSubscriptions.ContainsKey(key);
if (live && !subscribed)
{
automation!.RangeModified += OnRangeModified;
mMixSubscriptions[key] = automation;
}
else if (!live && subscribed)