From 2831b91ddba7cbfa4401e922b7bf016b27e21ab2 Mon Sep 17 00:00:00 2001 From: tachengP <2638591622@qq.com> Date: Fri, 19 Jun 2026 18:44:19 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E9=87=8D=E6=B8=B2=E6=9F=93=E6=9C=BA?= =?UTF-8?q?=E5=88=B6=E6=9B=B4=E6=96=B0=EF=BC=8Cdsdur=E3=80=81dspitch?= =?UTF-8?q?=E6=8F=90=E6=9D=83?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DiffSingerDeclarations.cs | 14 +- DiffSingerPredictor.cs | 17 +- DiffSingerSynthesisSession.cs | 610 ++++++++++++++++++++++++++++------ DiffSingerVariance.cs | 15 +- DiffSingerVoiceEngine.cs | 119 ++++++- 5 files changed, 664 insertions(+), 111 deletions(-) diff --git a/DiffSingerDeclarations.cs b/DiffSingerDeclarations.cs index 51c427d..04dbbac 100644 --- a/DiffSingerDeclarations.cs +++ b/DiffSingerDeclarations.cs @@ -91,7 +91,7 @@ public static ObjectConfig BuildPartConfig(VoicebankConfig config) }); if (HasLanguageChoice(config)) - properties.Add(KeyLanguage, LanguageCombo(config, config.Languages[0])); + properties.Add(KeyLanguage, LanguageCombo(config, string.Empty)); return new ObjectConfig { Properties = properties }; } @@ -102,7 +102,7 @@ public static ObjectConfig BuildNoteConfig(VoicebankConfig config, INoteProperty var properties = new OrderedMap(); if (HasLanguageChoice(config)) { - var partDefault = context.PartProperties.GetString(KeyLanguage, config.Languages[0]); + var partDefault = context.PartProperties.GetString(KeyLanguage, string.Empty); properties.Add(KeyLanguage, LanguageCombo(config, partDefault)); } return new ObjectConfig { Properties = properties }; @@ -113,15 +113,15 @@ public static ObjectConfig BuildNoteConfig(VoicebankConfig config, INoteProperty static ComboBoxConfig LanguageCombo(VoicebankConfig config, string defaultValue) => new() { DisplayText = L.Tr("Language"), - Options = ToOptions(config.Languages), + Options = LanguageOptions(config.Languages), DefaultOption = PropertyValue.Create(defaultValue), }; - static List 
ToOptions(IReadOnlyList values) + static List LanguageOptions(IReadOnlyList languages) { - var options = new List(values.Count); - foreach (var value in values) - options.Add(value); // 隐式转换:string → ComboBoxOption(值即显示文本) + var options = new List { new(PropertyValue.Create(string.Empty), "default") }; + foreach (var lang in languages) + options.Add(lang); return options; } diff --git a/DiffSingerPredictor.cs b/DiffSingerPredictor.cs index ac469a9..706fec0 100644 --- a/DiffSingerPredictor.cs +++ b/DiffSingerPredictor.cs @@ -109,6 +109,7 @@ public float[] GetEmbedding(string acousticSpeaker) } // —— 词典加载 —— + // 策略:先加载 dsdict.yaml 作为默认底库,再叠加载入语种特定文件(后面覆盖前面)。 Dictionary GetEntries(string lang) { lock (mLock) @@ -117,7 +118,19 @@ Dictionary GetEntries(string lang) return cached; var map = new Dictionary(StringComparer.Ordinal); - foreach (var file in new[] { $"dsdict-{lang}.yaml", $"dsdict-zh-{lang}.yaml", "dsdict.yaml" }) + + // 1. 加载默认底库 dsdict.yaml(总是存在) + var defaultPath = Path.Combine(mDir, "dsdict.yaml"); + if (File.Exists(defaultPath)) + { + var root = DeserializeDsDict(defaultPath); + foreach (var e in root.entries) + if (!string.IsNullOrEmpty(e.grapheme)) + map[e.grapheme] = e.phonemes.ToArray(); + } + + // 2. 
叠加载入语种特定文件(若存在则覆盖/补充) + foreach (var file in new[] { $"dsdict-{lang}.yaml", $"dsdict-zh-{lang}.yaml" }) { var path = Path.Combine(mDir, file); if (!File.Exists(path)) continue; @@ -125,8 +138,8 @@ Dictionary GetEntries(string lang) foreach (var e in root.entries) if (!string.IsNullOrEmpty(e.grapheme)) map[e.grapheme] = e.phonemes.ToArray(); - break; } + mEntryCache[lang] = map; return map; } diff --git a/DiffSingerSynthesisSession.cs b/DiffSingerSynthesisSession.cs index 82d7cb5..4cfe592 100644 --- a/DiffSingerSynthesisSession.cs +++ b/DiffSingerSynthesisSession.cs @@ -11,12 +11,13 @@ namespace DiffSingerForTuneLab; -// 一条 part 的合成会话。本阶段实现「声明面」:四个声明方法是选中声库能力集(VoicebankConfig)的纯函数—— -// 据 use_*_embed 暴露可编辑曲线、据 predict_* 暴露只读回显轨、据 speakers/languages 暴露 part/note 属性。 -// 调度与 6 级合成管线、产物发布为后续阶段:GetNextSegment 暂报「无待合成」,故宿主不驱动 SynthesizeNext, -// 会话呈现属性面板与轨但不产音——诚实的中间态。 -// 声明面(轨集合/属性面板)已上移到 DiffSingerVoiceEngine(经 DiffSingerDeclarations);本会话仅承载运行时: -// 调度、6 级推理管线、产物发布。轨 key 与 variance/gender/speed 规格复用 DiffSingerDeclarations(using static 引入)。 +// 一条 part 的合成会话。 +// 关键设计: +// · 区段式重渲染:修改某个音素时,仅以该音素为中心前后各扩展一个音素作为「重渲染区段」, +// 渲染后将新 mel 通过频谱过渡拼贴到原序列的 mel 谱上,避免整个序列被改变。 +// · pitch 锁定:用户修改 pitch 曲线后,自动音高预测(dspitch)被锁定不再重新生成, +// 仅缓存的预测作为 NaN 自由区的回退。除非用户选择 Retake 或 RedrawPitch。 +// · dur 忠于 UI:音素时间线由用户界面(note 时长 / 钉死音素)决定,首次渲染后只有显式请求才重新 phonemize。 public sealed class DiffSingerSynthesisSession : ISynthesisSession { readonly VoicebankConfig mConfig; @@ -25,18 +26,28 @@ public sealed class DiffSingerSynthesisSession : ISynthesisSession readonly DiffSingerModelCache mModelCache; readonly int mSamplingSteps; - // 运行时复用的声明派生物(每会话固定,构造期据声库能力集算一次): - // 可编辑轨集合(构造期订阅其区间编辑)+ 回显轨集合(产物 SynthesizedParameters 按其 key 聚合)。 + internal string VoiceId => mVoiceId; + + enum RenderMode { Normal, Retake } + volatile RenderMode mRenderMode; + volatile bool mRenderModeConsumed; + readonly OrderedMap mAutomationConfigs; readonly OrderedMap mReadbackConfigs; - // —— 调度状态(数据线程;按 note 间隙分块,账本式托管失效与产物)—— 
readonly IDisposable mNotesSubscription; - readonly List mSubscribedAutomations = new(); // 已订阅 RangeModified 的可编辑轨(Dispose 退订) - readonly Dictionary mNoteHandlers = new(); + readonly List mSubscribedAutomations = new(); + readonly Dictionary mNoteHandlers = new(); readonly List mPieces = new(); bool mNeedResegment; + // 缓存有效标志 + bool mHasValidCache; + + // 受影响的 time range(来自 OnRangeModified 或 note 修改),供 mel 拼贴使用 + double mAffectedStartTime = double.NaN; + double mAffectedEndTime = double.NaN; + public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext context, string voiceId, DiffSingerModelCache modelCache, int samplingSteps) { @@ -46,11 +57,9 @@ public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext cont mModelCache = modelCache; mSamplingSteps = samplingSteps; - // 声明派生物据声库能力集算一次(与引擎声明同一套 DiffSingerDeclarations,单一真相源)。 mAutomationConfigs = BuildAutomationConfigs(config); mReadbackConfigs = BuildReadbackConfigs(config); - // 变更接线(handler 只做廉价标脏;重活延迟到 Committed 重分块)——见 §5.9。 mNotesSubscription = NotifiableExtensions.WhenAny(context.Notes, SubscribeNote, UnsubscribeNote); context.Notes.ItemAdded += OnNotesStructureChanged; context.Notes.ItemRemoved += OnNotesStructureChanged; @@ -59,8 +68,6 @@ public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext cont context.PitchDeviation.RangeModified += OnRangeModified; context.Committed += OnCommitted; - // 可编辑轨(variance / gender / speed)区间编辑订阅:SDK 把声明上移到引擎后,宿主在「建会话之前」即 - // RefreshDeclarations 填好 Voice.AutomationConfigs(见 MidiPart 时序),故构造期 TryGetAutomation 即命中、直接订阅。 foreach (var key in mAutomationConfigs.Keys) if (context.TryGetAutomation(key, out var automation)) { @@ -71,25 +78,82 @@ public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext cont mNeedResegment = true; } - // 新建 note 的默认歌词:中性占位,待词典 G2P 阶段按声库词典择一有效词细化。 public string DefaultLyric => "a"; - // —— 调度:窗内第一个脏块的纯值边界(peek 廉价、确定性)—— + // 根据区间找出应处理的 piece 集合(NaN 表示全曲) + IEnumerable 
PiecesInRange(double start, double end) + { + if (double.IsNaN(start) || double.IsNaN(end)) + return mPieces; + return mPieces.Where(p => p.StartTime < end && p.EndTime > start); + } + + internal void RequestRetake() + { + mHasValidCache = false; + ClearPieceCaches(PiecesInRange(mAffectedStartTime, mAffectedEndTime)); + StatusChanged?.Invoke(); + } + + internal void RequestRetakeScoped(double scopeStart, double scopeEnd) + { + mHasValidCache = false; + SetAffectedRange(scopeStart, scopeEnd); + ClearPieceCaches(PiecesInRange(scopeStart, scopeEnd)); + StatusChanged?.Invoke(); + } + + internal void RequestRedrawPitch() + { + foreach (var piece in PiecesInRange(mAffectedStartTime, mAffectedEndTime)) + { + if (piece.CachedPhones == null) continue; + piece.Dirty = true; piece.Failed = false; + piece.CachedPitchPrediction = null; + piece.RedrawPitchRequested = true; + } + StatusChanged?.Invoke(); + } + + internal void RequestRedrawPitchScoped(double scopeStart, double scopeEnd) + { + SetAffectedRange(scopeStart, scopeEnd); + foreach (var piece in PiecesInRange(scopeStart, scopeEnd)) + { + if (piece.CachedPhones == null) continue; + piece.Dirty = true; piece.Failed = false; + piece.CachedPitchPrediction = null; + piece.RedrawPitchRequested = true; + } + StatusChanged?.Invoke(); + } + + void ClearPieceCaches(IEnumerable pieces) + { + foreach (var piece in pieces) + { + piece.Dirty = true; piece.Failed = false; + piece.CachedPitchPrediction = null; + piece.CachedVarianceCurves = default; + piece.CachedPhones = null; + piece.CachedMel = null; piece.CachedMelDims = null; + piece.CachedAudio = null; + piece.CachedPitchReadback = null; + piece.CachedVarianceReadback = new Dictionary>(); + piece.RedrawPitchRequested = false; + } + } + public SynthesisSegment? GetNextSegment(double startTime, double endTime) => FindNextDirtyPiece(startTime, endTime) is { } p ? 
new SynthesisSegment(p.StartTime, p.EndTime) : null; - // peek 与 commit 共用同一查找(确定性 + 同调度 tick 无编辑 ⇒ commit 重算得到 peek 报出的同一块)。 Piece? FindNextDirtyPiece(double startTime, double endTime) { - if (mNeedResegment) - Resegment(); - + if (mNeedResegment) Resegment(); foreach (var piece in mPieces) { - if (!piece.Dirty || piece.Failed || piece.Synthesizing) - continue; - if (piece.EndTime < startTime || piece.StartTime > endTime) - continue; + if (!piece.Dirty || piece.Failed || piece.Synthesizing) continue; + if (piece.EndTime < startTime || piece.StartTime > endTime) continue; return piece; } return null; @@ -100,7 +164,6 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can if (FindNextDirtyPiece(segment.StartTime, segment.EndTime) is not { } piece) return; - // 同步前缀(数据线程):物化不可变快照(本块 note 全集 + 按 note 范围开窗)。 var snapshot = mContext.GetSnapshot(piece.Notes, piece.Notes[0].StartTime.Value, piece.Notes.Max(n => n.EndTime.Value)); piece.Dirty = false; piece.Synthesizing = true; @@ -110,18 +173,41 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can var report = new Progress(p => { piece.Progress = p; StatusChanged?.Invoke(); }); try { - // offload:worker 只读冻结快照跑 ONNX(绝不碰活视图);模型懒加载经引擎级缓存(首载触发原生加载)。 - var rendered = await Task.Run(() => Render(snapshot, piece.Notes, report, cancellation), CancellationToken.None); + var rendered = await Task.Run(() => Render(snapshot, piece.Notes, piece, report, cancellation), CancellationToken.None); if (rendered != null && mPieces.Contains(piece)) { int rate = rendered.SampleRate; + + // 区段式音频拼贴:仅替换受影响的 time range,其余保持旧音频不变 + var stitchedAudio = StitchAudio(rendered.Audio, piece.CachedAudio, + rendered.StartTime, rate, mAffectedStartTime, mAffectedEndTime); + + // 缓存旧音频供下次拼贴 + piece.CachedAudio = stitchedAudio; + piece.Segment?.Dispose(); - piece.Segment = mContext.CreateAudioSegment((long)(rendered.StartTime * rate), rendered.Audio.Length, rate); - piece.Segment.Write(0, 
rendered.Audio); + piece.Segment = mContext.CreateAudioSegment((long)(rendered.StartTime * rate), stitchedAudio.Length, rate); + piece.Segment.Write(0, stitchedAudio); piece.Segment.Commit(); piece.Phonemes = rendered.Phonemes; - piece.PitchReadback = rendered.PitchReadback; - piece.VarianceReadback = rendered.VarianceReadback; + + // 回显曲线也做区间拼贴:未修改区间的 pitch/tension 与缓存的旧曲线一致 + piece.PitchReadback = StitchPoints(rendered.PitchReadback, piece.CachedPitchReadback, + mAffectedStartTime, mAffectedEndTime); + piece.CachedPitchReadback = piece.PitchReadback; + + if (rendered.VarianceReadback.Count > 0) + { + var stitchedVar = new Dictionary>(); + foreach (var kvp in rendered.VarianceReadback) + { + piece.CachedVarianceReadback.TryGetValue(kvp.Key, out var oldVar); + stitchedVar[kvp.Key] = StitchPoints(kvp.Value, oldVar, + mAffectedStartTime, mAffectedEndTime); + } + piece.VarianceReadback = stitchedVar; + piece.CachedVarianceReadback = stitchedVar; + } } } catch (Exception ex) @@ -137,11 +223,67 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can } } - // 推理链(worker,只读冻结快照):忠实移植 OpenUtau phonemizer + renderer(见记忆 openutau-is-authority)。 - // phonemizer(dsdur) → 音素时间线;renderer 加 head/tail SP padding、tokens[SP..SP]、durations[8..8]、 - // f0(Hz over totalFrames)、variance 预测+用户 delta 合成喂声学(纯预测产回显轨)、spk by frame、depth/steps。 - // gender/velocity 走用户曲线 + OpenUtau GENC/VELC convert;pitch 自由区走 dspitch 预测轮廓、已画处用户值覆盖。 - RenderResult? Render(SynthesisSnapshot snapshot, IReadOnlyList origins, + // —— 音频拼贴 —— + // 将新音频的 affected 区间(前后各扩展 3 帧过渡)替换到旧音频中。 + // 若旧音频不存在(首次渲染)则直接返回新音频。 + static float[] StitchAudio(float[] newAudio, float[]? 
oldAudio, + double renderStartSec, int sampleRate, + double affectedStart, double affectedEnd) + { + if (oldAudio == null || oldAudio.Length == 0) + return newAudio; + if (double.IsNaN(affectedStart) || double.IsNaN(affectedEnd)) + return newAudio; // 无明确 affected 区间时全量替换 + if (oldAudio.Length != newAudio.Length) + return newAudio; // 长度不同无法拼贴 + + // 计算 affected 区间的采样点范围(前后扩展 3 帧 = 3 * hop_size 采样点) + int hop = 512; // DiffSinger 标准 hop_size + int fadeSamples = 3 * hop; + int startSample = Math.Max(0, (int)((affectedStart - renderStartSec) * sampleRate) - fadeSamples); + int endSample = Math.Min(newAudio.Length, (int)((affectedEnd - renderStartSec) * sampleRate) + fadeSamples); + + if (startSample >= endSample) + return newAudio; + + var result = new float[oldAudio.Length]; + Array.Copy(oldAudio, result, oldAudio.Length); + + // 拷贝 affected 区间的新音频 + int copyLen = endSample - startSample; + Array.Copy(newAudio, startSample, result, startSample, copyLen); + + // 前过渡区:第一帧(线性渐入) + int fadeLen = Math.Min(fadeSamples, copyLen / 2); + for (int i = 0; i < fadeLen; i++) + { + float t = (float)(i + 1) / (fadeLen + 1); + int idx = startSample + i; + if (idx >= 0 && idx < result.Length) + result[idx] = oldAudio[idx] * (1 - t) + newAudio[idx] * t; + } + + // 后过渡区:最后一帧(线性渐出) + for (int i = 0; i < fadeLen; i++) + { + float t = (float)(i + 1) / (fadeLen + 1); + int idx = endSample - 1 - i; + if (idx >= 0 && idx < result.Length) + result[idx] = oldAudio[idx] * (1 - t) + newAudio[idx] * t; + } + + return result; + } + + // —— 推理链(worker,只读冻结快照)—— + + // 推理结果 + sealed record RenderResultEx(float[] Audio, double StartTime, int SampleRate, + List Phonemes, List PitchReadback, + Dictionary> VarianceReadback, + int[] MelDims, float[] Mel); + + RenderResultEx? Render(SynthesisSnapshot snapshot, IReadOnlyList origins, Piece piece, IProgress? 
progress, CancellationToken cancellation) { var notes = snapshot.Notes; @@ -152,30 +294,59 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can int hop = models.HopSize, sr = models.SampleRate, hidden = models.HiddenSize; double frameSec = (double)hop / sr; int head = DiffSingerFrames.HeadFrames; + int numMelBins = models.NumMelBins; - string partLang = snapshot.PartProperties.GetString(KeyLanguage, mConfig.Languages.Count > 0 ? mConfig.Languages[0] : string.Empty); + string partLang = snapshot.PartProperties.GetString(KeyLanguage, string.Empty); string speaker = snapshot.PartProperties.GetString(KeySpeaker, mConfig.Speakers.Count > 0 ? mConfig.Speakers[0] : string.Empty); var noteLang = notes.Select(nt => nt.Properties.GetString(KeyLanguage, partLang)).ToArray(); - // —— Phonemizer:歌词 → 音素时间线(绝对秒、含前置辅音越界)—— + // 渲染模式(仅 Retake 用会话级,RedrawPitch 由 piece 级标记驱动) + bool isRetake = false; + if (!mRenderModeConsumed && mRenderMode == RenderMode.Retake) + { + isRetake = true; + mRenderModeConsumed = true; + mRenderMode = RenderMode.Normal; + } + bool isRedrawPitch = piece.RedrawPitchRequested; + piece.RedrawPitchRequested = false; + + bool pieceHasNoCache = piece.CachedPhones == null; + bool needFullPredict = isRetake || !mHasValidCache || pieceHasNoCache; + bool needPitchPredict = needFullPredict || isRedrawPitch; + + // —— 模型优先级:dsdur/dspitch 提级模型 —— var durPred = models.GetPredictor("dsdur"); - var phones = durPred != null - ? DiffSingerPhonemizer.Phonemize(durPred, notes, noteLang, speaker, hop, sr) - : FallbackPhonemes(models, notes, noteLang); // 无 dur 预测器:每 note 一元音兜底 + var pitchPred = models.GetPredictor("dspitch"); + var varPred = models.GetPredictor("dsvariance"); + + // —— Phonemizer(needFullPredict 时才重新运行,否则复用缓存)—— + List phones; + if (needFullPredict) + { + phones = durPred != null + ? 
DiffSingerPhonemizer.Phonemize(durPred, notes, noteLang, speaker, hop, sr) + : FallbackPhonemes(models, notes, noteLang); + piece.CachedPhones = phones; + } + else + { + phones = piece.CachedPhones ?? new List(); + } if (phones.Count == 0) return null; progress?.Report(0.2); if (cancellation.IsCancellationRequested) return null; - // —— 帧布局:[head SP][...phones...][tail SP],累积取整 → durations(len=phones+2)—— + // —— 帧布局 —— var phoneDurSec = phones.Select(p => Math.Max(0, p.EndTime - p.StartTime)).ToArray(); var durations = DiffSingerFrames.PaddedPhoneFrames(phoneDurSec, frameSec); - int nTokens = durations.Length; // phones + 2 + int nTokens = durations.Length; int nFrames = durations.Sum(); double renderStart = phones[0].StartTime - head * frameSec; - // 逐帧时刻 + 说话人逐帧混合(mix: 曲线,acoustic/pitch/variance 三域共享;不画时退化为默认 speaker 恒权重)。 + // 逐帧时刻 + 说话人逐帧混合 var frameTimes = new double[nFrames]; for (int f = 0; f < nFrames; f++) frameTimes[f] = renderStart + (f + 0.5) * frameSec; var mixTracks = new List<(string Suffix, double[] Sampled)>(); @@ -184,7 +355,7 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can mixTracks.Add((suffix, mixAuto.Evaluator.Evaluate(frameTimes))); var speakerMix = DiffSingerSpeakerMix.Create(Suffix(speaker), mixTracks, nFrames); - // tokens/languages:声学表,前后加 SP。 + // tokens/languages var tokens = new long[nTokens]; var langs = new long[nTokens]; tokens[0] = AcousticToken(models, "SP"); @@ -195,7 +366,7 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can langs[i + 1] = models.TryGetLanguage(PhonemeLang(phones[i].Symbol), out var lid) ? 
lid : 0; } - // 逐帧 note 音高回退(head→首 note,phone i→其 note,tail→末 note)。 + // 逐帧 note 音高回退 var framePitch = new double[nFrames]; int fi = 0; for (int seg = 0; seg < nTokens; seg++) @@ -207,44 +378,74 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can for (int k = 0; k < durations[seg]; k++) framePitch[fi++] = pitch; } - // —— dspitch 自然音高预测(纯从音符、retake 全 true、不吃用户音高):替代自由区的矩形 note-step 兜底 —— - // 用户已画处(Pitch 非 NaN)用户值覆盖;NaN 自由区用预测轮廓(无 dspitch ⇒ 仍用矩形 framePitch);PITD/vibrato 叠加在上。 - var predictedPitch = DiffSingerPitch.Predict( - models.GetPredictor("dspitch"), phones, notes, durations, - renderStart, frameSec, speakerMix, mConfig, mSamplingSteps); + // —— 自动音高预测(仅在 needPitchPredict 时跑;否则复用缓存)—— + float[]? predictedPitch; + if (needPitchPredict) + { + predictedPitch = DiffSingerPitch.Predict(pitchPred, phones, notes, durations, + renderStart, frameSec, speakerMix, mConfig, mSamplingSteps); + piece.CachedPitchPrediction = predictedPitch; + } + else + { + predictedPitch = piece.CachedPitchPrediction; + } progress?.Report(0.28); if (cancellation.IsCancellationRequested) return null; - // 逐帧 f0(Hz) + 半音曲线(variance 用):帧中心采样双通道音高,NaN 自由区回退预测轮廓(无则 note 音高)。 + // 逐帧 f0(Hz) + 半音曲线 var pitchCurve = snapshot.Pitch.Evaluator.Evaluate(frameTimes); var deviation = snapshot.PitchDeviation.Evaluator.Evaluate(frameTimes); var f0 = new float[nFrames]; var semis = new float[nFrames]; var pitchReadback = new List(nFrames); + + // —— 初始生成自动音高时排除前后 SP 音高数据(不画到用户界面)—— + // 判断哪些帧属于 head SP(第 0 个 token)和 tail SP(最后一个 token) + int spHeadFrames = durations[0]; // head SP 的帧数 + int spTailFrames = durations[^1]; // tail SP 的帧数 + for (int f = 0; f < nFrames; f++) { - double fallback = predictedPitch != null - ? (f < predictedPitch.Length ? predictedPitch[f] : predictedPitch[^1]) - : framePitch[f]; + double fallback; + if (predictedPitch != null) + fallback = f < predictedPitch.Length ? 
predictedPitch[f] : predictedPitch[^1]; + else + fallback = framePitch[f]; + double semitone = (double.IsNaN(pitchCurve[f]) ? fallback : pitchCurve[f]) + deviation[f]; semis[f] = (float)semitone; f0[f] = DiffSingerFrames.ToneToFreq(semitone); - pitchReadback.Add(new Point(frameTimes[f], semitone)); + + // 排除前后 SP 帧的音高回显(不在 pitchReadback 中添加) + bool isHeadOrTailSp = f < spHeadFrames || f >= nFrames - spTailFrames; + if (!isHeadOrTailSp) + { + pitchReadback.Add(new Point(frameTimes[f], semitone)); + } } progress?.Report(0.3); if (cancellation.IsCancellationRequested) return null; - // —— variance 预测(基线;下方与用户 delta 合成喂声学、纯预测产回显)—— - var varCurves = DiffSingerVariance.Predict( - models.GetPredictor("dsvariance"), phones.Select(p => p.Symbol).ToList(), - durations, semis, speakerMix, mConfig, mSamplingSteps); + // —— variance 预测(needFullPredict 时重新预测;否则复用缓存)—— + VarianceCurves varCurves; + if (needFullPredict) + { + varCurves = DiffSingerVariance.Predict(varPred, phones.Select(p => p.Symbol).ToList(), + durations, semis, speakerMix, mConfig, mSamplingSteps); + piece.CachedVarianceCurves = varCurves; + } + else + { + varCurves = piece.CachedVarianceCurves; + } progress?.Report(0.45); if (cancellation.IsCancellationRequested) return null; - // —— 声学输入(按 InputMetadata 条件构造)—— + // —— 声学输入 —— var ac = models.Acoustic; var inputs = new List(); void AddL(string name, long[] data, int[] dims) @@ -257,9 +458,7 @@ void AddF(string name, float[] data, int[] dims) AddL("durations", durations.Select(x => (long)x).ToArray(), new[] { 1, nTokens }); AddF("f0", f0, new[] { 1, nFrames }); - // —— variance:预测 + 用户 delta 合成喂声学,同时产纯预测回显 —— - // 用户曲线按帧求值(连续轨:未编辑处=中性基线 → Delta 恒得纯预测;编辑处 → 叠加),clamp 到声学值域。 - // 回显(Use && Predict)= 纯预测值,不含用户编辑。 + // —— variance:使用(缓存的)预测 + 用户 delta 合成喂声学 —— var varReadback = new Dictionary>(); foreach (var spec in Variances) { @@ -275,8 +474,7 @@ void AddF(string name, float[] data, int[] dims) varReadback[spec.Key] = BuildReadbackSegment(spec, predicted, 
frameTimes, nFrames); } - // —— gender / velocity:纯用户曲线(无方差器基线),按帧 convert 喂声学(忠实移植 OpenUtau GENC/VELC)—— - // 无轨 / NaN 自由区 → 中性 → convert 得中性 embed(gender 0、velocity 1);OpenUtau 不 clamp(UI 量程已界定)。 + // —— gender / velocity —— AddF("gender", BuildCurveInput(snapshot, KeyGender, GenderBaseline, GenderConvert(), frameTimes, nFrames), new[] { 1, nFrames }); AddF("velocity", BuildCurveInput(snapshot, KeySpeed, SpeedBaseline, SpeedConvert, frameTimes, nFrames), new[] { 1, nFrames }); @@ -299,22 +497,47 @@ void AddF(string name, float[] data, int[] dims) inputs.Add(NamedOnnxValue.CreateFromTensor("speedup", new DenseTensor(new[] { speedup }, new[] { 1 }))); } + // —— 声学模型:产 mel —— using var melOut = ac.Run(inputs); - var mel = melOut.First(v => v.Name == "mel").AsTensor(); + var melTensor = melOut.First(v => v.Name == "mel").AsTensor(); + var melDims = melTensor.Dimensions.ToArray(); + var newMel = melTensor.ToArray(); + + // —— 区段式 mel 拼贴:仅替换受影响的 time range,其余保持旧 mel —— + float[] finalMel; + if (piece.CachedMel != null && piece.CachedMelDims != null + && melDims.SequenceEqual(piece.CachedMelDims) && piece.CachedMel.Length == newMel.Length) + { + finalMel = StitchMel(newMel, piece.CachedMel, frameTimes, nFrames, numMelBins, mAffectedStartTime, mAffectedEndTime); + } + else + { + finalMel = newMel; + } + + // 更新 mel 缓存 + piece.CachedMel = finalMel; + piece.CachedMelDims = melDims; + progress?.Report(0.75); if (cancellation.IsCancellationRequested) return null; - // —— 声码器:mel (+ f0) → 波形 —— + // —— 声码器(使用原始 mel 形状创建张量)—— var voc = models.Vocoder; - var vInputs = new List { NamedOnnxValue.CreateFromTensor("mel", mel) }; + var melShape = new int[melDims.Length]; + Array.Copy(melDims, melShape, melDims.Length); + var vInputs = new List + { + NamedOnnxValue.CreateFromTensor("mel", new DenseTensor(finalMel, melShape)) + }; if (voc.InputMetadata.ContainsKey("f0")) vInputs.Add(NamedOnnxValue.CreateFromTensor("f0", new DenseTensor(f0, new[] { 1, nFrames }))); using var wavOut = 
voc.Run(vInputs); var audio = wavOut.First(v => v.Name == "waveform").AsTensor().ToArray(); progress?.Report(1.0); - // —— 音素产物(绝对秒、韵核吸收伸缩)—— + // —— 音素产物 —— var phonemes = phones.Select(p => new SynthesizedPhoneme { Symbol = p.Symbol, @@ -324,7 +547,148 @@ void AddF(string name, float[] data, int[] dims) StretchWeight = p.IsVowel ? 1 : 0, }).ToList(); - return new RenderResult(audio, renderStart, sr, phonemes, pitchReadback, varReadback); + // 标记缓存有效(首次渲染成功后保持) + mHasValidCache = true; + + return new RenderResultEx(audio, renderStart, sr, phonemes, pitchReadback, varReadback, + melDims, finalMel); + } + + // —— 区段式 mel 拼贴 —— + // 将新 mel 的「受影响的 time range」替换到旧 mel 中,边界做 3 帧交叉过渡。 + // 旧 mel 为 null 或区间无效时直接返回新 mel。 + static float[] StitchMel(float[] newMel, float[]? oldMel, double[] frameTimes, + int nFrames, int numMelBins, double affectedStart, double affectedEnd) + { + if (oldMel == null || oldMel.Length != newMel.Length) + return newMel; + if (double.IsNaN(affectedStart) || double.IsNaN(affectedEnd)) + return newMel; + if (affectedEnd <= frameTimes[0] || affectedStart >= frameTimes[^1]) + return newMel; // affected 区间完全在渲染范围外 + + const int fadeFrames = 3; + int totalFrames = nFrames; + + // 找 affected 区间对应的帧范围(前后各扩展 fadeFrames 帧) + int startFrame = totalFrames - 1; + int endFrame = 0; + for (int f = 0; f < totalFrames; f++) + { + if (frameTimes[f] >= affectedStart && frameTimes[f] <= affectedEnd) + { + if (f < startFrame) startFrame = f; + if (f > endFrame) endFrame = f; + } + } + startFrame = Math.Max(0, startFrame - fadeFrames); + endFrame = Math.Min(totalFrames - 1, endFrame + fadeFrames); + + if (startFrame >= endFrame) + return newMel; + + var result = new float[newMel.Length]; + Array.Copy(oldMel, result, newMel.Length); + + // 区间内直接替换为新 mel + for (int f = startFrame + fadeFrames; f <= endFrame - fadeFrames; f++) + for (int b = 0; b < numMelBins; b++) + result[f * numMelBins + b] = newMel[f * numMelBins + b]; + + // 前过渡区(fadeFrames 帧线性渐入) + for (int i = 0; 
i < fadeFrames && startFrame + i < totalFrames; i++) + { + float t = (float)(i + 1) / (fadeFrames + 1); + int f = startFrame + i; + for (int b = 0; b < numMelBins; b++) + { + int idx = f * numMelBins + b; + result[idx] = oldMel[idx] * (1 - t) + newMel[idx] * t; + } + } + + // 后过渡区(fadeFrames 帧线性渐出) + for (int i = 0; i < fadeFrames && endFrame - i >= 0; i++) + { + float t = (float)(i + 1) / (fadeFrames + 1); + int f = endFrame - i; + for (int b = 0; b < numMelBins; b++) + { + int idx = f * numMelBins + b; + result[idx] = oldMel[idx] * (1 - t) + newMel[idx] * t; + } + } + + return result; + } + + // —— Point 列表拼贴 —— + // 将受 affected 区间内的 Point 替换为新列表中的值,其余保持旧列表不变。 + // 旧列表为 null 或帧结构改变(点数不同)时直接返回新列表。 + static IReadOnlyList StitchPoints(IReadOnlyList newPoints, IReadOnlyList? oldPoints, + double affectedStart, double affectedEnd) + { + if (oldPoints == null || oldPoints.Count == 0) + return newPoints; + if (double.IsNaN(affectedStart) || double.IsNaN(affectedEnd)) + return newPoints; + if (newPoints.Count == 0) + return oldPoints; + if (newPoints.Count != oldPoints.Count) + return newPoints; + if (Math.Abs(newPoints[0].X - oldPoints[0].X) > 0.001) + return newPoints; + + int oldStart = 0, oldEnd = oldPoints.Count; + while (oldStart < oldPoints.Count && oldPoints[oldStart].X < affectedStart) oldStart++; + while (oldEnd > 0 && oldPoints[oldEnd - 1].X > affectedEnd) oldEnd--; + + if (oldStart >= oldEnd) + return newPoints; + + int newStart = 0, newEnd = newPoints.Count; + while (newStart < newPoints.Count && newPoints[newStart].X < affectedStart) newStart++; + while (newEnd > 0 && newPoints[newEnd - 1].X > affectedEnd) newEnd--; + + if (newStart >= newEnd) + return oldPoints; + + // 拼接并在边界做 3 点线性过渡,避免 pitch 断层 + const int fadeCount = 3; + var result = new List(oldStart + (newEnd - newStart) + (oldPoints.Count - oldEnd)); + + // 旧区间(前段) + for (int i = 0; i < oldStart; i++) result.Add(oldPoints[i]); + + // 前过渡:fadeCount 个点的旧→新渐变 + for (int i = 0; i < fadeCount && newStart 
+ i < newEnd; i++) + { + float t = (float)(i + 1) / (fadeCount + 1); + double x = newPoints[newStart + i].X; + double y = oldPoints[oldStart + i].Y * (1 - t) + newPoints[newStart + i].Y * t; + result.Add(new Point(x, y)); + } + + // 中间段:全量新值(仅当区间足够长时) + for (int i = newStart + fadeCount; i < newEnd - fadeCount; i++) + result.Add(newPoints[i]); + + // 后过渡:fadeCount 个点的新→旧渐变(按时间递增顺序) + int backStart = Math.Max(newStart, newEnd - fadeCount); + for (int i = backStart; i < newEnd; i++) + { + float t = (float)(newEnd - i) / (fadeCount + 1); + int oi = oldEnd - (newEnd - i); + if (oi < 0 || oi >= oldPoints.Count) continue; + double x = newPoints[i].X; + double y = oldPoints[oi].Y * (1 - t) + newPoints[i].Y * t; + result.Add(new Point(x, y)); + } + + // 旧区间(后段) + for (int i = oldEnd; i < oldPoints.Count; i++) result.Add(oldPoints[i]); + + return result; } // 无 dur 预测器兜底:每 note 一元音、占满 note 时长(无对齐/无 head/tail 之外的处理)。 @@ -532,36 +896,78 @@ void Resegment() StatusChanged?.Invoke(); } + sealed record NoteHandlers(Action OnDur, Action OnPitch, Action OnLyric, Action OnProps); + void SubscribeNote(ILiveNote note) { - void Handler() + Action onDur = () => + { + SetAffectedRange(note.StartTime.Value, note.EndTime.Value); + MarkPieceDirty(note, clearPhones: true, clearPitch: false, clearVariance: true); + mNeedResegment = true; + }; + Action onPitch = () => + { + SetAffectedRange(note.StartTime.Value, note.EndTime.Value); + MarkPieceDirty(note, clearPhones: false, clearPitch: false, clearVariance: false); + }; + Action onLyric = () => + { + SetAffectedRange(note.StartTime.Value, note.EndTime.Value); + MarkPieceDirty(note, clearPhones: true, clearPitch: true, clearVariance: true); + mNeedResegment = true; + }; + Action onProps = () => { - foreach (var piece in mPieces) - if (piece.Notes.Contains(note)) { piece.Dirty = true; piece.Failed = false; } + SetAffectedRange(note.StartTime.Value, note.EndTime.Value); + MarkPieceDirty(note, clearPhones: true, clearPitch: false, clearVariance: 
true); mNeedResegment = true; + }; + + note.StartTime.Modified += onDur; + note.EndTime.Modified += onDur; + note.Phonemes.Modified += onDur; + note.Pitch.Modified += onPitch; + note.Lyric.Modified += onLyric; + note.Properties.Modified += onProps; + + mNoteHandlers[note] = new NoteHandlers(onDur, onPitch, onLyric, onProps); + } + + void MarkPieceDirty(ILiveNote note, bool clearPhones, bool clearPitch, bool clearVariance) + { + foreach (var piece in mPieces) + { + if (!piece.Notes.Contains(note)) continue; + piece.Dirty = true; piece.Failed = false; + if (clearPhones) piece.CachedPhones = null; + if (clearPitch) piece.CachedPitchPrediction = null; + if (clearVariance) piece.CachedVarianceCurves = default; + return; } - mNoteHandlers[note] = Handler; - note.StartTime.Modified += Handler; - note.EndTime.Modified += Handler; - note.Pitch.Modified += Handler; - note.Lyric.Modified += Handler; - note.Phonemes.Modified += Handler; - note.Properties.Modified += Handler; + } + + void SetAffectedRange(double start, double end) + { + double pad = 0.1; + mAffectedStartTime = start - pad; + mAffectedEndTime = end + pad; } void UnsubscribeNote(ILiveNote note) { - if (!mNoteHandlers.Remove(note, out var handler)) - return; - note.StartTime.Modified -= handler; - note.EndTime.Modified -= handler; - note.Pitch.Modified -= handler; - note.Lyric.Modified -= handler; - note.Phonemes.Modified -= handler; - note.Properties.Modified -= handler; + if (mNoteHandlers.Remove(note, out var h)) + { + note.StartTime.Modified -= h.OnDur; + note.EndTime.Modified -= h.OnDur; + note.Phonemes.Modified -= h.OnDur; + note.Pitch.Modified -= h.OnPitch; + note.Lyric.Modified -= h.OnLyric; + note.Properties.Modified -= h.OnProps; + } } - void OnNotesStructureChanged(ILiveNote note) => mNeedResegment = true; + void OnNotesStructureChanged(ILiveNote note) { mNeedResegment = true; } void MarkAllDirtyAndResegment() { @@ -577,6 +983,9 @@ void OnCommitted() void OnRangeModified(double startTime, double 
endTime) { + // 记录受影响的 time range(自动化曲线修改) + SetAffectedRange(startTime, endTime); + foreach (var piece in mPieces) { if (piece.EndTime < startTime || piece.StartTime > endTime) @@ -587,10 +996,6 @@ void OnRangeModified(double startTime, double endTime) StatusChanged?.Invoke(); } - sealed record RenderResult(float[] Audio, double StartTime, int SampleRate, - List Phonemes, List PitchReadback, - Dictionary> VarianceReadback); - sealed class Piece { public required IReadOnlyList Notes; @@ -605,5 +1010,20 @@ sealed class Piece public IReadOnlyList Phonemes = []; public IReadOnlyList PitchReadback = []; public IReadOnlyDictionary> VarianceReadback = new Dictionary>(); + // 回显曲线缓存(区间拼贴用) + public IReadOnlyList? CachedPitchReadback; + public IReadOnlyDictionary> CachedVarianceReadback = new Dictionary>(); + + // 缓存:note 代数(判断缓存有效性) + // 模型预测缓存(增量渲染时复用) + public float[]? CachedPitchPrediction; + public VarianceCurves CachedVarianceCurves; + public List? CachedPhones; + // mel 缓存(用于交叉过渡) + public float[]? CachedMel; + public int[]? CachedMelDims; + public float[]? 
CachedAudio; + // piece 级 RedrawPitch 请求标记 + public bool RedrawPitchRequested; } } diff --git a/DiffSingerVariance.cs b/DiffSingerVariance.cs index 34036cb..3c07e23 100644 --- a/DiffSingerVariance.cs +++ b/DiffSingerVariance.cs @@ -42,14 +42,21 @@ public static VarianceCurves Predict( var langs = symbols.Select(s => v.LangId(PhonemeLanguage(s))).Prepend(0L).Append(0L).ToArray(); var isVowel = symbols.Select(v.IsVowel).ToArray(); - // —— linguistic(词模式)—— - var (wordDiv, wordDur) = DiffSingerFrames.PaddedWordDivAndDur(isVowel, phDur); + // —— linguistic(据编码器实际输入选择词模式或音素模式)—— var lingInputs = new List { NvL("tokens", tokens, nTokens), - NvL("word_div", wordDiv, wordDiv.Length), - NvL("word_dur", wordDur, wordDur.Length), }; + if (v.LinguisticUsesWordBoundary) + { + var (wordDiv, wordDur) = DiffSingerFrames.PaddedWordDivAndDur(isVowel, phDur); + lingInputs.Add(NvL("word_div", wordDiv, wordDiv.Length)); + lingInputs.Add(NvL("word_dur", wordDur, wordDur.Length)); + } + else + { + lingInputs.Add(NvL("ph_dur", phDur.Select(x => (long)x).ToArray(), nTokens)); + } if (v.Linguistic.InputMetadata.ContainsKey("languages")) lingInputs.Add(NvL("languages", langs, nTokens)); using var lingOut = v.Linguistic.Run(lingInputs); diff --git a/DiffSingerVoiceEngine.cs b/DiffSingerVoiceEngine.cs index c52dba7..cfa1a3b 100644 --- a/DiffSingerVoiceEngine.cs +++ b/DiffSingerVoiceEngine.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.IO; +using System.Linq; using TuneLab.Foundation; using TuneLab.SDK; @@ -32,6 +33,44 @@ public void Destroy() mModelCache = null; } + // —— 会话注册表(引擎级,跨会话共享)—— + readonly List> mSessions = new(); + + internal void RegisterSession(DiffSingerSynthesisSession session) + { + lock (mSessions) + { + // 清理已回收的弱引用 + mSessions.RemoveAll(wr => !wr.TryGetTarget(out _)); + mSessions.Add(new WeakReference(session)); + } + } + + internal void UnregisterSession(DiffSingerSynthesisSession session) + { + lock (mSessions) + { + 
mSessions.RemoveAll(wr => !wr.TryGetTarget(out var s) || s == session); + } + } + + internal DiffSingerSynthesisSession? FindSessionByVoiceId(string voiceId) + { + lock (mSessions) + { + // 清理已回收 + mSessions.RemoveAll(wr => !wr.TryGetTarget(out _)); + // 取最后一个匹配 voiceId 的会话(通常只有一个) + DiffSingerSynthesisSession? found = null; + foreach (var wr in mSessions) + { + if (wr.TryGetTarget(out var s) && s.VoiceId == voiceId) + found = s; + } + return found; + } + } + public ISynthesisSession CreateSession(string voiceId, ISynthesisContext context) { if (!mState.Banks.ContainsKey(voiceId)) @@ -40,7 +79,9 @@ public ISynthesisSession CreateSession(string voiceId, ISynthesisContext context // 推理走引擎级模型缓存(懒加载、按 voiceId 共享);声明面(轨/面板)已上移到引擎方法、建会话前即填好。 var config = ConfigFor(voiceId)!; var samplingSteps = mSettings.GetInt(KeySamplingSteps, 20); - return new DiffSingerSynthesisSession(config, context, voiceId, EnsureModelCache(), samplingSteps); + var session = new DiffSingerSynthesisSession(config, context, voiceId, EnsureModelCache(), samplingSteps); + RegisterSession(session); + return session; } // —— 声明(引擎层、纯函数 of (voiceId, part 值);宿主在每次 part 参数 commit 时按当前值重算 diff 到 UI)—— @@ -52,10 +93,82 @@ public IReadOnlyOrderedMap GetSynthesizedParameterConf => ConfigFor(context.VoiceId) is { } c ? DiffSingerDeclarations.BuildReadbackConfigs(c) : EmptyAutomations; public ObjectConfig GetPartPropertyConfig(IPartPropertyContext context) - => ConfigFor(context.VoiceId) is { } c ? 
DiffSingerDeclarations.BuildPartConfig(c) : EmptyConfig; + { + if (ConfigFor(context.VoiceId) is not { } c) + return EmptyConfig; + + var baseConfig = DiffSingerDeclarations.BuildPartConfig(c); + var voiceId = context.VoiceId; + + // Properties 是 IReadOnlyOrderedMap,需要新建可写集合 + var props = new OrderedMap(); + foreach (var kvp in baseConfig.Properties) + props.Add(kvp.Key, kvp.Value); + + // 追加 Retake 和 Redraw Pitch 按钮 + props.Add("_retake", new ButtonConfig + { + DisplayText = L.Tr("Retake"), + Action = () => FindSessionByVoiceId(voiceId)?.RequestRetake(), + }); + props.Add("_redraw_pitch", new ButtonConfig + { + DisplayText = L.Tr("Redraw Pitch"), + Action = () => FindSessionByVoiceId(voiceId)?.RequestRedrawPitch(), + }); + + return new ObjectConfig { Properties = props }; + } public ObjectConfig GetNotePropertyConfig(INotePropertyContext context) - => ConfigFor(context.VoiceId) is { } c ? DiffSingerDeclarations.BuildNoteConfig(c, context) : EmptyConfig; + { + if (ConfigFor(context.VoiceId) is not { } c) + return EmptyConfig; + + var baseConfig = DiffSingerDeclarations.BuildNoteConfig(c, context); + var voiceId = context.VoiceId; + double selStart = context.SelectionStartTime; + double selEnd = context.SelectionEndTime; + bool hasSelection = !double.IsNaN(selStart) && !double.IsNaN(selEnd); + + var props = new OrderedMap(); + foreach (var kvp in baseConfig.Properties) + props.Add(kvp.Key, kvp.Value); + + // 追加 Retake 和 Redraw Pitch 按钮(使用选中音符的区间而非全曲) + props.Add("_retake", new ButtonConfig + { + DisplayText = L.Tr("Retake"), + Action = () => + { + var session = FindSessionByVoiceId(voiceId); + if (session != null) + { + if (hasSelection) + session.RequestRetakeScoped(selStart, selEnd); + else + session.RequestRetake(); + } + }, + }); + props.Add("_redraw_pitch", new ButtonConfig + { + DisplayText = L.Tr("Redraw Pitch"), + Action = () => + { + var session = FindSessionByVoiceId(voiceId); + if (session != null) + { + if (hasSelection) + 
session.RequestRedrawPitchScoped(selStart, selEnd); + else + session.RequestRedrawPitch(); + } + }, + }); + + return new ObjectConfig { Properties = props }; + } // 声库能力集按 voiceId 缓存(声明每次 commit 都调,避免重复解析 dsconfig);config 随声库不可变,扫描重建时清空。 VoicebankConfig? ConfigFor(string voiceId) From 9d8bd78f3ae71d521fca5da1e486230b23452c8e Mon Sep 17 00:00:00 2001 From: tachengP <2638591622@qq.com> Date: Sat, 20 Jun 2026 17:35:27 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E9=87=8D=E6=B8=B2?= =?UTF-8?q?=E6=9F=93=E6=8B=BC=E8=B4=B4=E7=BC=BA=E9=99=B7=EF=BC=8C=E6=81=A2?= =?UTF-8?q?=E5=A4=8D=E5=BB=B6=E9=9F=B3=E7=AC=A6+=E3=80=81-=E5=8A=9F?= =?UTF-8?q?=E8=83=BD=EF=BC=8C=E5=AE=9E=E7=8E=B0=E9=9F=B3=E7=B4=A0=E7=BC=96?= =?UTF-8?q?=E8=BE=91=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DiffSingerForTuneLab.code-workspace | 4 + DiffSingerPhonemizer.cs | 58 ++++++- DiffSingerPitch.cs | 4 +- DiffSingerPredictor.cs | 102 +++++++++++- DiffSingerSynthesisSession.cs | 236 +++++++++++++++++++++++++--- DiffSingerVoiceEngine.cs | 72 +++++++++ 6 files changed, 449 insertions(+), 27 deletions(-) diff --git a/DiffSingerForTuneLab.code-workspace b/DiffSingerForTuneLab.code-workspace index 1131316..1c88dca 100644 --- a/DiffSingerForTuneLab.code-workspace +++ b/DiffSingerForTuneLab.code-workspace @@ -7,6 +7,10 @@ { "name": "TuneLab (参考:SDK/docs/范例)", "path": "../TuneLab" + }, + { + "name": "OpenUtau-lunai (参考:OpenUtau.Core)", + "path": "../OpenUtau-lunai" } ], "settings": { diff --git a/DiffSingerPhonemizer.cs b/DiffSingerPhonemizer.cs index 187d41e..03ac60b 100644 --- a/DiffSingerPhonemizer.cs +++ b/DiffSingerPhonemizer.cs @@ -48,9 +48,24 @@ public static List Phonemize( for (int i = 0; i < notes.Count; i++) { var note = notes[i]; + string lyric = note.Lyric ?? 
string.Empty; string[] symbols = GetSymbols(dur, note, noteLang[i], out pinned[i]); noteSymbolCount[i] = symbols.Length; + // 连音符:不产生新音素,只延展前音的时长(通过 group 自然吸收) + // (+) 额外在起始点插入一个空 vowel 边界组,强制对齐 + if (symbols.Length == 0 && (lyric == "-" || lyric == "+")) + { + if (lyric == "+") + { + // 插入一个空韵核组作为强制对齐边界 + groups.Add(new Group(note.StartTime, note.Pitch)); + } + // - 则完全不做任何事,前组自然吸收时长 + notePhIndex.Add(notePhIndex[^1]); + continue; + } + var wordGroups = ProcessWord(dur, note, symbols); groups[^1].Phonemes.AddRange(wordGroups[0].Phonemes); // 前置辅音并入前一组(侵入前一 note 尾) groups.AddRange(wordGroups.Skip(1)); // 韵核组(起点=note 起点) @@ -152,17 +167,56 @@ static List ProcessWord(DiffSingerPredictor dur, SynthesisNoteSnapshot no return wordGroups; } - // 取音素符号串:钉死=用 note.Phonemes 符号;否则 G2P。过滤到「类型已定义 且 dur 表可 tokenize」;空则 [SP]。 + // 取音素符号串:钉死/编辑器→用已有 phonemes 或 _phonemes 属性;连音符→空(slur 延展前音素);否则 G2P。 + // 空结果且非连音符→ [SP] 兜底。 static string[] GetSymbols(DiffSingerPredictor dur, SynthesisNoteSnapshot note, string lang, out bool pinned) { + string lyric = note.Lyric ?? string.Empty; + if (lyric == "-" || lyric == "+") + { + pinned = false; + return Array.Empty(); + } + + // 优先使用 _phonemes 属性(音素编辑器写入) + var phonemesProp = note.Properties.GetString("_phonemes", ""); + if (!string.IsNullOrEmpty(phonemesProp) && phonemesProp != "[]") + { + pinned = true; + return ParsePhonemesProperty(phonemesProp).Where(s => !string.IsNullOrEmpty(s) && dur.TryPhoneme(s, out _)).ToArray(); + } + + // 其次使用钉死音素 pinned = note.Phonemes.Count > 0; IEnumerable raw = pinned ? note.Phonemes.Select(p => p.Symbol) - : dur.G2P(note.Lyric ?? string.Empty, lang); + : dur.G2P(lyric, lang); var symbols = raw.Where(s => dur.IsKnownSymbol(s) && dur.TryPhoneme(s, out _)).ToArray(); return symbols.Length > 0 ? symbols : new[] { Pause }; } + // 从 JSON 字符串解析音素符号列表:[{"s":"ja/b","v":false},...] 
+ static string[] ParsePhonemesProperty(string json) + { + var result = new List(); + if (string.IsNullOrEmpty(json) || json.Length < 2) return result.ToArray(); + try + { + int i = 0; + while (true) + { + int sIdx = json.IndexOf("\"s\":\"", i); + if (sIdx < 0) break; + sIdx += 5; + int eIdx = json.IndexOf('"', sIdx); + if (eIdx > sIdx) result.Add(json.Substring(sIdx, eIdx - sIdx)); + i = eIdx + 1; + } + } + catch { } + return result.ToArray(); + } + // OpenUtau stretch:source[from..from+count) 的帧时长按 ratio 缩放、终点对齐 endPos,返回各音素起点秒。 static IEnumerable Stretch(IReadOnlyList source, int from, int count, double ratio, double endPos) { diff --git a/DiffSingerPitch.cs b/DiffSingerPitch.cs index 02f321d..9e2dee8 100644 --- a/DiffSingerPitch.cs +++ b/DiffSingerPitch.cs @@ -129,8 +129,10 @@ public static class DiffSingerPitch } durSec.Add(Math.Max(0, note.EndTime - note.StartTime)); midiList.Add(note.Pitch); - if ((note.Lyric ?? string.Empty).StartsWith("+")) + string lyric = note.Lyric ?? string.Empty; + if (lyric.StartsWith("+") || lyric == "-") { + // slur 延音符继承前音 rest 状态,确保 pitch 模型为其生成正确过渡音高 restList.Add(restList[^1]); } else diff --git a/DiffSingerPredictor.cs b/DiffSingerPredictor.cs index 706fec0..270a657 100644 --- a/DiffSingerPredictor.cs +++ b/DiffSingerPredictor.cs @@ -75,13 +75,21 @@ public int PhonemeToken(string symbol) : throw new InvalidOperationException($"音素 \"{symbol}\" 不在 {Path.GetFileName(mDir)} 的音素表中"); public long LangId(string lang) => mLanguages.TryGetValue(lang, out var id) ? id : 0; - // —— G2P:按语言查 dsdict-{lang}.yaml 词条(grapheme→带前缀音素),exact 后小写回退 —— + // —— G2P:优先查语言特定词典(dsdict-{lang}.yaml),避免默认底库(dsdict.yaml 以 zh 为主)污染;再试 replacements;最后才兜底查合并词典。 —— public string[] G2P(string lyric, string lang) { - var entries = GetEntries(lang); var key = lyric.Trim(); - if (entries.TryGetValue(key, out var phs)) return phs; - if (entries.TryGetValue(key.ToLowerInvariant(), out phs)) return phs; + // 1. 
语言特定词典(不含默认底库) + var langEntries = GetLanguageSpecificEntries(lang); + if (langEntries.TryGetValue(key, out var phs)) return phs; + if (langEntries.TryGetValue(key.ToLowerInvariant(), out phs)) return phs; + // 2. 替换规则(en/ko 等无 entries 的语种) + var replaced = ApplyReplacements(lyric, lang); + if (replaced.Length > 0) return replaced; + // 3. 最后才查合并词典(含默认底库 dsdict.yaml,作为未知字素的最终兜底) + var allEntries = GetEntries(lang); + if (allEntries.TryGetValue(key, out phs)) return phs; + if (allEntries.TryGetValue(key.ToLowerInvariant(), out phs)) return phs; return Array.Empty(); } @@ -108,8 +116,78 @@ public float[] GetEmbedding(string acousticSpeaker) } } + // —— 替换规则(用于 EN/KO 等无 entries 仅 replacements 的语种)—— + readonly Dictionary> mReplacements = new(StringComparer.Ordinal); + + void LoadReplacements(string lang) + { + if (mReplacements.ContainsKey(lang)) return; + var list = new List<(string from, string to)>(); + foreach (var file in new[] { $"dsdict-{lang}.yaml", $"dsdict-zh-{lang}.yaml", "dsdict.yaml" }) + { + var path = Path.Combine(mDir, file); + if (!File.Exists(path)) continue; + try + { + var yaml = new DeserializerBuilder().Build(); + var doc = yaml.Deserialize>(File.ReadAllText(path)); + if (doc != null && doc.TryGetValue("replacements", out var reps) && reps is List repList) + { + foreach (var r in repList) + { + if (r is Dictionary repDict) + { + string? from = repDict.TryGetValue("from", out var fv) ? fv?.ToString() : null; + string? to = repDict.TryGetValue("to", out var tv) ? 
tv?.ToString() : null; + if (!string.IsNullOrEmpty(from) && !string.IsNullOrEmpty(to)) + list.Add((from, to)); + } + } + } + } + catch { } + } + mReplacements[lang] = list; + } + + // 用替换规则将歌词转为音素(按最长匹配优先) + public string[] ApplyReplacements(string lyric, string lang) + { + LoadReplacements(lang); + if (!mReplacements.TryGetValue(lang, out var reps) || reps.Count == 0) + return Array.Empty(); + + var repsSorted = reps.OrderByDescending(r => r.from.Length).ToList(); + var result = new List(); + string text = lyric.ToLowerInvariant(); + int pos = 0; + while (pos < text.Length) + { + bool matched = false; + foreach (var (from, to) in repsSorted) + { + if (pos + from.Length <= text.Length && text.Substring(pos, from.Length) == from) + { + result.Add(to); + pos += from.Length; + matched = true; + break; + } + } + if (!matched) + { + // 单个字符作为独立音素 + string ch = text[pos].ToString(); + result.Add(ch); + pos++; + } + } + return result.ToArray(); + } + // —— 词典加载 —— // 策略:先加载 dsdict.yaml 作为默认底库,再叠加载入语种特定文件(后面覆盖前面)。 + // 若 entries 为空且 replacements 存在,留空返回(上层调用 ApplyReplacements)。 Dictionary GetEntries(string lang) { lock (mLock) @@ -145,6 +223,22 @@ Dictionary GetEntries(string lang) } } + // 仅加载语言特定词典(不含默认底库 dsdict.yaml),用于 G2P 的优先查表——避免 zh 底库污染其他语言的译音。 + Dictionary GetLanguageSpecificEntries(string lang) + { + var map = new Dictionary(StringComparer.Ordinal); + foreach (var file in new[] { $"dsdict-{lang}.yaml", $"dsdict-zh-{lang}.yaml" }) + { + var path = Path.Combine(mDir, file); + if (!File.Exists(path)) continue; + var root = DeserializeDsDict(path); + foreach (var e in root.entries) + if (!string.IsNullOrEmpty(e.grapheme)) + map[e.grapheme] = e.phonemes.ToArray(); + } + return map; + } + void LoadSymbolTypes(string dsdictPath) { if (!File.Exists(dsdictPath)) return; diff --git a/DiffSingerSynthesisSession.cs b/DiffSingerSynthesisSession.cs index 4cfe592..2547440 100644 --- a/DiffSingerSynthesisSession.cs +++ b/DiffSingerSynthesisSession.cs @@ -80,6 +80,53 @@ public 
DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext cont public string DefaultLyric => "a"; + // 获取默认 G2P 音素(供音素编辑器填充) + internal List GetDefaultPhonemesForNote(double time, string partLang = "") + { + ILiveNote? targetNote = null; + foreach (var note in mContext.Notes) + { + if (note.StartTime.Value <= time && note.EndTime.Value >= time) + { targetNote = note; break; } + } + if (targetNote == null) return new List(); + + try + { + var models = mModelCache.GetOrLoad(mVoiceId, mConfig); + var durPred = models.GetPredictor("dsdur"); + if (durPred == null) return new List(); + + string lyric = targetNote.Lyric.Value ?? string.Empty; + // partLang 由调用方从快照传入(不依赖 live mContext.PartProperties),确保段落语言变更后能取到新值 + if (string.IsNullOrEmpty(partLang)) + { + var partLangVal = mContext.PartProperties.GetValue(KeyLanguage, PropertyValue.Create(string.Empty)); + partLang = partLangVal.ToString(out var pl) ? pl : string.Empty; + } + var noteLangVal = targetNote.Properties.GetValue(KeyLanguage, PropertyValue.Create(string.Empty)); + // 注意:ToString 对空字符串也返回 true,故需排除空串,才能正确回退到 partLang + string noteLang = noteLangVal.ToString(out var nl) && !string.IsNullOrEmpty(nl) ? 
nl : partLang; + + var result = new List(); + string[] symbols = durPred.G2P(lyric, noteLang); + if (symbols.Length == 0) return result; + + foreach (var sym in symbols) + { + if (string.IsNullOrEmpty(sym)) continue; + result.Add(new PhonemeEntry + { + Symbol = sym, + IsVowel = durPred.IsVowel(sym), + IsGlide = durPred.IsGlide(sym), + }); + } + return result; + } + catch { return new List(); } + } + // 根据区间找出应处理的 piece 集合(NaN 表示全曲) IEnumerable PiecesInRange(double start, double end) { @@ -178,9 +225,15 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can { int rate = rendered.SampleRate; + // 用渲染后的音素边界精化过渡区间:使过渡在追溯音素内部进行 + double stitchStart = mAffectedStartTime; + double stitchEnd = mAffectedEndTime; + RefineStitchRange(rendered.Phonemes, + ref stitchStart, ref stitchEnd, piece.FrameSec); + // 区段式音频拼贴:仅替换受影响的 time range,其余保持旧音频不变 var stitchedAudio = StitchAudio(rendered.Audio, piece.CachedAudio, - rendered.StartTime, rate, mAffectedStartTime, mAffectedEndTime); + rendered.StartTime, rate, stitchStart, stitchEnd); // 缓存旧音频供下次拼贴 piece.CachedAudio = stitchedAudio; @@ -295,6 +348,7 @@ sealed record RenderResultEx(float[] Audio, double StartTime, int SampleRate, double frameSec = (double)hop / sr; int head = DiffSingerFrames.HeadFrames; int numMelBins = models.NumMelBins; + piece.FrameSec = frameSec; string partLang = snapshot.PartProperties.GetString(KeyLanguage, string.Empty); string speaker = snapshot.PartProperties.GetString(KeySpeaker, mConfig.Speakers.Count > 0 ? mConfig.Speakers[0] : string.Empty); @@ -366,16 +420,18 @@ sealed record RenderResultEx(float[] Audio, double StartTime, int SampleRate, langs[i + 1] = models.TryGetLanguage(PhonemeLang(phones[i].Symbol), out var lid) ? lid : 0; } - // 逐帧 note 音高回退 + // 逐帧 note 音高回退(直接按 note 时间区间查找,不受 -/+ 延音符无音素的影响) var framePitch = new double[nFrames]; - int fi = 0; - for (int seg = 0; seg < nTokens; seg++) { - int ni = seg == 0 ? phones[0].NoteIndex - : seg == nTokens - 1 ? 
phones[^1].NoteIndex - : phones[seg - 1].NoteIndex; - int pitch = notes[ni].Pitch; - for (int k = 0; k < durations[seg]; k++) framePitch[fi++] = pitch; + int ni = 0; + for (int f = 0; f < nFrames; f++) + { + double t = frameTimes[f]; + // 找出包含当前帧的 note(延音符未产生音素,但 pitch 仍需跟随它的音高) + while (ni < notes.Count - 1 && t >= notes[ni + 1].StartTime) + ni++; + framePitch[f] = notes[ni].Pitch; + } } // —— 自动音高预测(仅在 needPitchPredict 时跑;否则复用缓存)—— @@ -459,6 +515,7 @@ void AddF(string name, float[] data, int[] dims) AddF("f0", f0, new[] { 1, nFrames }); // —— variance:使用(缓存的)预测 + 用户 delta 合成喂声学 —— + // 回显显示最终喂声学的值(预测+包络加权),包络修改时实参同步跟随 var varReadback = new Dictionary>(); foreach (var spec in Variances) { @@ -467,11 +524,12 @@ void AddF(string name, float[] data, int[] dims) ? auto.Evaluator.Evaluate(frameTimes) : null; + float[] combined = CombineVariance(spec, predicted, user, nFrames); if (ac.InputMetadata.ContainsKey(spec.Key)) - AddF(spec.Key, CombineVariance(spec, predicted, user, nFrames), new[] { 1, nFrames }); + AddF(spec.Key, combined, new[] { 1, nFrames }); if (spec.Use(mConfig) && spec.Predict(mConfig) && predicted != null) - varReadback[spec.Key] = BuildReadbackSegment(spec, predicted, frameTimes, nFrames); + varReadback[spec.Key] = BuildReadbackSegment(spec, combined, frameTimes, nFrames, spHeadFrames, spTailFrames); } // —— gender / velocity —— @@ -554,6 +612,61 @@ void AddF(string name, float[] data, int[] dims) melDims, finalMel); } + // —— 用音素边界精化过渡区间 —— + // 修改【A B C D】中的【C】→ 重渲染【B C D】,过渡在 B 和 D 的内部进行。 + // 前过渡:边界音素起始 + 3 帧;后过渡:边界音素结束 - 3 帧。 + static void RefineStitchRange(IReadOnlyList newPhonemes, + ref double stitchStart, ref double stitchEnd, double frameSec) + { + if (double.IsNaN(stitchStart) || double.IsNaN(stitchEnd)) + return; + double margin = 3 * frameSec; + + // 前边界:找 stitchStart 所在音素的「前一个」音素(即追溯音素),取其起始 + 3 帧 + int startIdx = -1; + for (int i = 0; i < newPhonemes.Count; i++) + { + if (newPhonemes[i].StartTime <= stitchStart && 
newPhonemes[i].EndTime >= stitchStart) + { startIdx = i; break; } + } + // 如果找到的音素就是第一个(无前一个),则用它本身 + if (startIdx >= 0) + { + int boundaryIdx = startIdx > 0 ? startIdx - 1 : startIdx; + // 但如果起始音素已经是被修改区域内的(stitchStart > 它的起始),则前一个才是边界 + // 如果起始音素的起始 > stitchStart,说明我们定位到了边界音素之后的第一个,那它的前一个是边界 + if (newPhonemes[startIdx].StartTime > stitchStart && startIdx > 0) + boundaryIdx = startIdx - 1; + stitchStart = newPhonemes[boundaryIdx].StartTime + margin; + // 确保不超出该音素范围 + if (stitchStart > newPhonemes[boundaryIdx].EndTime - margin) + stitchStart = newPhonemes[boundaryIdx].StartTime + margin * 0.5; + } + + // 后边界:找 stitchEnd 所在音素的「后一个」音素(追溯音素),取其结束 - 3 帧 + int endIdx = -1; + for (int i = newPhonemes.Count - 1; i >= 0; i--) + { + if (newPhonemes[i].StartTime <= stitchEnd && newPhonemes[i].EndTime >= stitchEnd) + { endIdx = i; break; } + } + if (endIdx >= 0) + { + int boundaryIdx = endIdx < newPhonemes.Count - 1 ? endIdx + 1 : endIdx; + if (newPhonemes[endIdx].EndTime < stitchEnd && endIdx < newPhonemes.Count - 1) + boundaryIdx = endIdx + 1; + stitchEnd = newPhonemes[boundaryIdx].EndTime - margin; + if (stitchEnd < newPhonemes[boundaryIdx].StartTime + margin) + stitchEnd = newPhonemes[boundaryIdx].EndTime - margin * 0.5; + } + + if (stitchStart >= stitchEnd) + { + stitchStart = double.NaN; + stitchEnd = double.NaN; + } + } + // —— 区段式 mel 拼贴 —— // 将新 mel 的「受影响的 time range」替换到旧 mel 中,边界做 3 帧交叉过渡。 // 旧 mel 为 null 或区间无效时直接返回新 mel。 @@ -755,13 +868,29 @@ Func GenderConvert() // VELC convert(OpenUtau DiffSingerRenderer):对数标度,100 = 原速,每 +100 速度 ×2。 static double SpeedConvert(double x) => Math.Pow(2, (x - 100) / 100); - // 回显段:纯预测值(不含用户编辑),clamp 到声学值域,逐帧 (全局秒, 值)。 - static List BuildReadbackSegment(VarianceSpec spec, float[] predicted, double[] frameTimes, int n) + // 回显段:最终值(含用户包络),clamp 到声学值域,逐帧 (全局秒, 值)。 + // 整个 SP 段做透明度过渡:外边界 → 声学最小值(背景=不可见),内边界 → 100%(实际值)。 + // 注意:pitch 回显直接排除 SP 帧(不画),此处用渐变使曲线从背景平滑浮现/消失。 + static List BuildReadbackSegment(VarianceSpec spec, float[] 
finalValues, double[] frameTimes, int n, + int headSpFrames = 0, int tailSpFrames = 0) { + float fadeTarget = (float)spec.AcousticMin; // 声学最小值 = 曲线不可见的背景值 var points = new List(n); for (int f = 0; f < n; f++) { - float x = f < predicted.Length ? predicted[f] : predicted[^1]; + float x = f < finalValues.Length ? finalValues[f] : finalValues[^1]; + // 前 SP:整个 SP 段从外边界(f=0)到内边界线性渐入(0% → 100%) + if (f < headSpFrames) + { + float t = (float)(f + 1) / headSpFrames; + x = x * t + fadeTarget * (1 - t); + } + // 后 SP:整个 SP 段从内边界到外边界线性渐出(100% → 0%) + if (f >= n - tailSpFrames) + { + float t = (float)(n - f) / tailSpFrames; + x = x * t + fadeTarget * (1 - t); + } points.Add(new Point(frameTimes[f], Math.Clamp(x, spec.AcousticMin, spec.AcousticMax))); } return points; @@ -780,7 +909,9 @@ static string PickVowelSymbol(VoiceModels models, string lang) public IReadOnlyList> SynthesizedPitch => mPieces.Where(p => p.PitchReadback.Count > 0).Select(p => p.PitchReadback).ToList(); - // 回显产物(数据线程发布、可跨线程读):按声明的回显轨 key 聚合各 piece 的纯预测段(每 piece 一段、段间断开)。 + // 回显产物(数据线程发布、可跨线程读):按声明的回显轨 key 聚合各 piece 的预测段。 + // 每 piece 整段作为一个 segment,用多 GradientStop 的 LinearGradientBrush 实现像素级平滑透明度。 + // 透明度轮廓:head SP 0%→25%, body 25%, tail SP 25%→0%,无段间边界。 public IReadOnlyMap SynthesizedParameters { get @@ -789,11 +920,68 @@ public IReadOnlyMap SynthesizedParameters foreach (var kvp in mReadbackConfigs) { var segments = new List>(); + var stopSets = new List>(); foreach (var piece in mPieces) - if (piece.VarianceReadback.TryGetValue(kvp.Key, out var segment) && segment.Count > 0) - segments.Add(segment); + { + if (!piece.VarianceReadback.TryGetValue(kvp.Key, out var allPoints) || allPoints.Count == 0) + continue; + + var phones = piece.CachedPhones; + if (phones == null || phones.Count == 0 || allPoints.Count < 2) + { + segments.Add(allPoints); + stopSets.Add([new(0, 0.25), new(1, 0.25)]); + continue; + } + + double bodyStart = phones[0].StartTime; + double bodyEnd = phones[^1].EndTime; + int headEnd = 
0, bodyEndIdx = allPoints.Count; + while (headEnd < allPoints.Count && allPoints[headEnd].X < bodyStart) headEnd++; + int bodyStartIdx = headEnd; + while (bodyEndIdx > 0 && allPoints[bodyEndIdx - 1].X > bodyEnd) bodyEndIdx--; + int total = allPoints.Count; + bool hasHead = headEnd > 0; + bool hasTail = bodyEndIdx < total; + + // 整段作为一个 segment + segments.Add(allPoints); + + // 构造 GradientStop,按有无 head/tail 调整 + var stops = new List(4); + if (hasHead) + { + stops.Add(new(0.0, 0.0)); + double headRatio = (double)headEnd / total; + stops.Add(new(Math.Clamp(headRatio, 0, 1), 0.25)); + } + else + { + stops.Add(new(0.0, 0.25)); + } + + double bodyEndRatio = (double)bodyEndIdx / total; + if (hasTail) + { + stops.Add(new(Math.Clamp(bodyEndRatio, 0, 1), 0.25)); + stops.Add(new(1.0, 0.0)); + } + else + { + stops.Add(new(1.0, 0.25)); + } + + // 移除相邻等偏移的退化 stop + var dedup = new List(stops.Count); + for (int i = 0; i < stops.Count; i++) + if (i == stops.Count - 1 || Math.Abs(stops[i].X - stops[i + 1].X) > 0.0001) + dedup.Add(stops[i]); + while (dedup.Count < 2) dedup.Add(new(1, 0.25)); + + stopSets.Add(dedup); + } if (segments.Count > 0) - map.Add(kvp.Key, new SynthesizedParameter { Segments = segments }); + map.Add(kvp.Key, new SynthesizedParameter { Segments = segments, SegmentOpacityStops = stopSets }); } return map; } @@ -971,7 +1159,14 @@ void UnsubscribeNote(ILiveNote note) void MarkAllDirtyAndResegment() { - foreach (var piece in mPieces) { piece.Dirty = true; piece.Failed = false; } + foreach (var piece in mPieces) + { + piece.Dirty = true; piece.Failed = false; + // 段落属性(语言等)变更 → 清除音素/variance 缓存,强制使用新的 G2P + piece.CachedPhones = null; + piece.CachedVarianceCurves = default; + // 保留 pitch 缓存 + } mNeedResegment = true; } @@ -1023,7 +1218,8 @@ sealed class Piece public float[]? CachedMel; public int[]? CachedMelDims; public float[]? 
CachedAudio; - // piece 级 RedrawPitch 请求标记 public bool RedrawPitchRequested; + // 帧时长(秒),用于过渡区间计算 + public double FrameSec; } } diff --git a/DiffSingerVoiceEngine.cs b/DiffSingerVoiceEngine.cs index cfa1a3b..888c8c5 100644 --- a/DiffSingerVoiceEngine.cs +++ b/DiffSingerVoiceEngine.cs @@ -135,6 +135,46 @@ public ObjectConfig GetNotePropertyConfig(INotePropertyContext context) foreach (var kvp in baseConfig.Properties) props.Add(kvp.Key, kvp.Value); + // —— 音素编辑器 —— + var phonemesJson = context.NoteProperties.GetString("_phonemes", "[]"); + var phonemeEntries = ParsePhonemeJson(phonemesJson); + if (phonemeEntries.Count == 0 && c.Languages.Count > 0 && !double.IsNaN(selStart)) + { + // 未自定义音素时,尝试从会话获取默认 G2P 音素 + double midTime = (selStart + selEnd) / 2; + var session = FindSessionByVoiceId(voiceId); + if (session != null) + { + // 从上下文快照取段落语言(保证段落语言变更后能传新值给 G2P) + string snapPartLang = context.PartProperties.GetString(DiffSingerDeclarations.KeyLanguage, string.Empty); + phonemeEntries = session.GetDefaultPhonemesForNote(midTime, snapPartLang); + } + } + if (c.Languages.Count > 0) + { + props.Add("_phoneme_editor", new PhonemeEditorConfig + { + DisplayText = L.Tr("Phonemes"), + Phonemes = phonemeEntries, + AvailableLanguages = c.Languages, + LanguageDataKey = DiffSingerDeclarations.KeyLanguage, + CanDeleteConsonant = phonemeEntries.Count(e => !e.IsVowel) > 1, + CanDeleteVowel = phonemeEntries.Count(e => e.IsVowel) > 1, + OnChanged = _ => + { + // 音素被编辑 → 触发区间重渲染 + var s = FindSessionByVoiceId(voiceId); + if (s != null) + { + if (hasSelection) + s.RequestRetakeScoped(selStart, selEnd); + else + s.RequestRetake(); + } + }, + }); + } + // 追加 Retake 和 Redraw Pitch 按钮(使用选中音符的区间而非全曲) props.Add("_retake", new ButtonConfig { @@ -276,6 +316,38 @@ static void EnsureDefaultDirectory() catch { } } + // 解析音素 JSON:[{"s":"ja/b","v":false},...] 
+ static List ParsePhonemeJson(string json) + { + var result = new List(); + if (string.IsNullOrEmpty(json) || json.Length < 2) return result; + try + { + // 简单的手动 JSON 解析(避免加依赖) + int i = json.IndexOf('['); + if (i < 0) return result; + while (true) + { + i = json.IndexOf('{', i); + if (i < 0) break; + string entry = json.Substring(i, Math.Min(json.Length - i, json.IndexOf('}', i) - i + 1)); + var phoneme = new PhonemeEntry(); + int sIdx = entry.IndexOf("\"s\":\""); + if (sIdx >= 0) + { + sIdx += 5; + int eIdx = entry.IndexOf('"', sIdx); + if (eIdx > sIdx) phoneme.Symbol = entry.Substring(sIdx, eIdx - sIdx); + } + phoneme.IsVowel = entry.Contains("\"v\":true") || entry.Contains("\"v\": true"); + result.Add(phoneme); + i = json.IndexOf('}', i) + 1; + } + } + catch { } + return result; + } + // 不可变扫描结果,整体替换发布:get 侧读引用、扫描侧建好新实例后一次性换上,无需锁。 sealed record State( IReadOnlyOrderedMap Infos, From ef0b20c2a99e363c25eaa075353c0918c392b1dd Mon Sep 17 00:00:00 2001 From: tachengP <2638591622@qq.com> Date: Sun, 21 Jun 2026 19:32:27 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E4=BF=AE=E5=A4=8DDirectML=E5=B9=B6?= =?UTF-8?q?=E8=A1=8C=EF=BC=8C=E4=BF=AE=E5=A4=8D=E5=BB=B6=E9=9F=B3=E7=AC=A6?= =?UTF-8?q?=E3=80=90+=E3=80=91=E3=80=90-=E3=80=91=E5=8F=B7=E5=B7=A5?= =?UTF-8?q?=E4=BD=9Cbug=EF=BC=8C=E8=AE=BE=E7=BD=AE=E6=96=B0=E5=A2=9E?= =?UTF-8?q?=E6=9C=80=E5=A4=A7=E6=B8=B2=E6=9F=93=E4=BB=BB=E5=8A=A1=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + DiffSingerModels.cs | 14 ++++++++++++++ DiffSingerPhonemizer.cs | 31 ++++++++++++++++++++++--------- DiffSingerPitch.cs | 4 ++-- DiffSingerPredictor.cs | 13 +++++++++++++ DiffSingerSynthesisSession.cs | 12 +++++++++--- DiffSingerVariance.cs | 4 ++-- DiffSingerVoiceEngine.cs | 26 +++++++++++++++++++++++++- 8 files changed, 88 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 04d0096..e1b0701 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ obj/ # 
打包产物 *.tlx /build/ +/tools/CheckModel diff --git a/DiffSingerModels.cs b/DiffSingerModels.cs index f863c3b..d38453b 100644 --- a/DiffSingerModels.cs +++ b/DiffSingerModels.cs @@ -120,9 +120,23 @@ public sealed class VoiceModels : IDisposable readonly Dictionary mPredictors = new(StringComparer.Ordinal); readonly object mPredictorLock = new(); + readonly object mAcousticLock = new(); + readonly object mVocoderLock = new(); + public InferenceSession Acoustic { get; } public InferenceSession Vocoder { get; } + // 线程安全的声学推理包装(DirectML EP 需要串行化 Run 调用) + public IDisposableReadOnlyCollection RunAcoustic(List inputs) + { + lock (mAcousticLock) return Acoustic.Run(inputs); + } + + public IDisposableReadOnlyCollection RunVocoder(List inputs) + { + lock (mVocoderLock) return Vocoder.Run(inputs); + } + public int HiddenSize => mConfig.HiddenSize; public int HopSize => mConfig.HopSize; public int SampleRate => mConfig.SampleRate; diff --git a/DiffSingerPhonemizer.cs b/DiffSingerPhonemizer.cs index 03ac60b..95fdb8b 100644 --- a/DiffSingerPhonemizer.cs +++ b/DiffSingerPhonemizer.cs @@ -53,15 +53,30 @@ public static List Phonemize( noteSymbolCount[i] = symbols.Length; // 连音符:不产生新音素,只延展前音的时长(通过 group 自然吸收) - // (+) 额外在起始点插入一个空 vowel 边界组,强制对齐 + // 连音符:- 不产生边界(前组自然吸收),+ 拆前组末音素到独立组强制 dur 边界 if (symbols.Length == 0 && (lyric == "-" || lyric == "+")) { if (lyric == "+") { - // 插入一个空韵核组作为强制对齐边界 - groups.Add(new Group(note.StartTime, note.Pitch)); + // 把前一个非空组的最后一个音素拆分到 + 组(仅当 >1 音素,否则退化为空组边界) + string moved = "AP"; + bool splitted = false; + for (int gi = groups.Count - 1; gi >= 0; gi--) + { + if (groups[gi].Phonemes.Count > 1) + { + int lastIdx = groups[gi].Phonemes.Count - 1; + moved = groups[gi].Phonemes[lastIdx]; + groups[gi].Phonemes.RemoveAt(lastIdx); + splitted = true; + break; + } + } + var g = new Group(note.StartTime, note.Pitch); + if (splitted) g.Phonemes.Add(moved); + groups.Add(g); } - // - 则完全不做任何事,前组自然吸收时长 + // - 则完全不做任何事,前组自然吸收时长,不影响 dur 
notePhIndex.Add(notePhIndex[^1]); continue; } @@ -262,7 +277,7 @@ static double[] RunDur(DiffSingerPredictor dur, long[] tokens, long[] langs, if (dur.Linguistic.InputMetadata.ContainsKey("languages")) lingInputs.Add(Nv("languages", langs, nTokens)); - using var lingOut = dur.Linguistic.Run(lingInputs); + using var lingOut = dur.RunLinguistic(lingInputs); var enc = lingOut.First(v => v.Name == "encoder_out").AsTensor(); var mask = lingOut.First(v => v.Name == "x_masks").AsTensor(); var encDense = new DenseTensor(enc.ToArray(), enc.Dimensions.ToArray()); @@ -272,15 +287,13 @@ static double[] RunDur(DiffSingerPredictor dur, long[] tokens, long[] langs, var spk = new float[nTokens * hidden]; for (int i = 0; i < nTokens; i++) Array.Copy(emb, 0, spk, i * hidden, hidden); - var durModel = dur.Model("dur"); - var durInputs = new List + using var durOut = dur.RunModel("dur", new List { NamedOnnxValue.CreateFromTensor("encoder_out", encDense), NamedOnnxValue.CreateFromTensor("x_masks", maskDense), Nv("ph_midi", phMidi.Select(x => (long)x).ToArray(), nTokens), NamedOnnxValue.CreateFromTensor("spk_embed", new DenseTensor(spk, new[] { 1, nTokens, hidden })), - }; - using var durOut = durModel.Run(durInputs); + }); return durOut.First(v => v.Name == "ph_dur_pred").AsTensor().Select(x => (double)x).ToArray(); } diff --git a/DiffSingerPitch.cs b/DiffSingerPitch.cs index 9e2dee8..ceccb79 100644 --- a/DiffSingerPitch.cs +++ b/DiffSingerPitch.cs @@ -52,7 +52,7 @@ public static class DiffSingerPitch var langs = phones.Select(p => v.LangId(PhonemeLanguage(p.Symbol))).Prepend(0L).Append(0L).ToArray(); lingInputs.Add(NvL("languages", langs, nTokens)); } - using var lingOut = v.Linguistic.Run(lingInputs); + using var lingOut = v.RunLinguistic(lingInputs); var enc = lingOut.First(o => o.Name == "encoder_out").AsTensor(); var encDense = new DenseTensor(enc.ToArray(), enc.Dimensions.ToArray()); @@ -95,7 +95,7 @@ public static class DiffSingerPitch 
inputs.Add(NamedOnnxValue.CreateFromTensor("note_rest", new DenseTensor(noteRest, new[] { 1, noteRest.Length }))); - using var outputs = model.Run(inputs); + using var outputs = v.RunModel("pitch", inputs); return outputs.First().AsTensor().ToArray(); } diff --git a/DiffSingerPredictor.cs b/DiffSingerPredictor.cs index 270a657..0457298 100644 --- a/DiffSingerPredictor.cs +++ b/DiffSingerPredictor.cs @@ -27,12 +27,25 @@ public sealed class DiffSingerPredictor : IDisposable readonly Dictionary> mEntryCache = new(StringComparer.Ordinal); readonly Dictionary mSymbolTypes = new(StringComparer.Ordinal); // symbol → type(合并 dsdict) readonly object mLock = new(); + // 推理锁:DirectML EP 的 InferenceSession.Run() 非线程安全,串行化所有 Run 调用。 + readonly object mRunLock = new(); public InferenceSession Linguistic { get; } public int HiddenSize => mHidden; // linguistic 是否吃 word_div/word_dur(dsdur/dsvariance 词边界;dspitch 用已知 ph_dur)。 public bool LinguisticUsesWordBoundary { get; } + // 线程安全的推理包装(DirectML EP 需要串行化 Run 调用) + public IDisposableReadOnlyCollection RunLinguistic(List inputs) + { + lock (mRunLock) return Linguistic.Run(inputs); + } + + public IDisposableReadOnlyCollection RunModel(string role, List inputs) + { + lock (mRunLock) return mModels[role].Run(inputs); + } + public DiffSingerPredictor(string dir, Func load) { mDir = dir; diff --git a/DiffSingerSynthesisSession.cs b/DiffSingerSynthesisSession.cs index 2547440..25c9745 100644 --- a/DiffSingerSynthesisSession.cs +++ b/DiffSingerSynthesisSession.cs @@ -48,14 +48,17 @@ enum RenderMode { Normal, Retake } double mAffectedStartTime = double.NaN; double mAffectedEndTime = double.NaN; + readonly SemaphoreSlim mRenderSemaphore; + public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext context, - string voiceId, DiffSingerModelCache modelCache, int samplingSteps) + string voiceId, DiffSingerModelCache modelCache, int samplingSteps, SemaphoreSlim renderSemaphore) { mConfig = config; mContext = context; mVoiceId = 
voiceId; mModelCache = modelCache; mSamplingSteps = samplingSteps; + mRenderSemaphore = renderSemaphore; mAutomationConfigs = BuildAutomationConfigs(config); mReadbackConfigs = BuildReadbackConfigs(config); @@ -218,6 +221,8 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can StatusChanged?.Invoke(); var report = new Progress(p => { piece.Progress = p; StatusChanged?.Invoke(); }); + + await mRenderSemaphore.WaitAsync(cancellation); try { var rendered = await Task.Run(() => Render(snapshot, piece.Notes, piece, report, cancellation), CancellationToken.None); @@ -271,6 +276,7 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can } finally { + mRenderSemaphore.Release(); piece.Synthesizing = false; StatusChanged?.Invoke(); } @@ -556,7 +562,7 @@ void AddF(string name, float[] data, int[] dims) } // —— 声学模型:产 mel —— - using var melOut = ac.Run(inputs); + using var melOut = models.RunAcoustic(inputs); var melTensor = melOut.First(v => v.Name == "mel").AsTensor(); var melDims = melTensor.Dimensions.ToArray(); var newMel = melTensor.ToArray(); @@ -591,7 +597,7 @@ void AddF(string name, float[] data, int[] dims) }; if (voc.InputMetadata.ContainsKey("f0")) vInputs.Add(NamedOnnxValue.CreateFromTensor("f0", new DenseTensor(f0, new[] { 1, nFrames }))); - using var wavOut = voc.Run(vInputs); + using var wavOut = models.RunVocoder(vInputs); var audio = wavOut.First(v => v.Name == "waveform").AsTensor().ToArray(); progress?.Report(1.0); diff --git a/DiffSingerVariance.cs b/DiffSingerVariance.cs index 3c07e23..c74bb8f 100644 --- a/DiffSingerVariance.cs +++ b/DiffSingerVariance.cs @@ -59,7 +59,7 @@ public static VarianceCurves Predict( } if (v.Linguistic.InputMetadata.ContainsKey("languages")) lingInputs.Add(NvL("languages", langs, nTokens)); - using var lingOut = v.Linguistic.Run(lingInputs); + using var lingOut = v.RunLinguistic(lingInputs); var enc = lingOut.First(o => o.Name == "encoder_out").AsTensor(); var encDense = 
new DenseTensor(enc.ToArray(), enc.Dimensions.ToArray()); @@ -101,7 +101,7 @@ void Channel(bool predict, string name) new DenseTensor(spk, new[] { 1, totalFrames, hidden }))); } - using var outputs = model.Run(inputs); + using var outputs = v.RunModel("variance", inputs); float[]? Out(bool predict, string name) => predict ? outputs.First(o => o.Name == name).AsTensor().ToArray() : null; return new VarianceCurves( diff --git a/DiffSingerVoiceEngine.cs b/DiffSingerVoiceEngine.cs index 888c8c5..c60055c 100644 --- a/DiffSingerVoiceEngine.cs +++ b/DiffSingerVoiceEngine.cs @@ -18,6 +18,7 @@ public sealed class DiffSingerVoiceEngine : IVoiceEngine, IExtensionSettings const string KeyVoicebankDirs = "voicebank_dirs"; const string KeyExecutionProvider = "execution_provider"; const string KeySamplingSteps = "sampling_steps"; + const string KeyMaxConcurrentRenderings = "max_concurrent_renderings"; public IReadOnlyOrderedMap VoiceSourceInfos => mState.Infos; @@ -33,6 +34,18 @@ public void Destroy() mModelCache = null; }
+    // —— Render throttle: caps how many pieces render at once (DirectML multi-track rendering needs a limit) ——
+    SemaphoreSlim mRenderSemaphore = new(1, 1);
+
+    // Publishes a semaphore with the new limit (clamped to >= 1). The previous one is deliberately NOT disposed: sessions created earlier still hold it and would throw ObjectDisposedException on their next WaitAsync/Release.
+    internal void UpdateRenderSemaphore(int maxConcurrent)
+    {
+        var limit = Math.Max(1, maxConcurrent);
+        Interlocked.Exchange(ref mRenderSemaphore, new SemaphoreSlim(limit, limit));
+    }
+
+    internal SemaphoreSlim RenderSemaphore => mRenderSemaphore;
+
 // —— 会话注册表(引擎级,跨会话共享)—— readonly List> mSessions = new(); @@ -79,7 +92,7 @@ public ISynthesisSession CreateSession(string voiceId, ISynthesisContext context // 推理走引擎级模型缓存(懒加载、按 voiceId 共享);声明面(轨/面板)已上移到引擎方法、建会话前即填好。 var config = ConfigFor(voiceId)!; var samplingSteps = mSettings.GetInt(KeySamplingSteps, 20); - var session = new DiffSingerSynthesisSession(config, context, voiceId, EnsureModelCache(), samplingSteps); + var session = new DiffSingerSynthesisSession(config, context, voiceId, EnsureModelCache(), samplingSteps, mRenderSemaphore); RegisterSession(session); return
session; } @@ -266,6 +279,15 @@ public ObjectConfig GetSettingsConfig(IExtensionSettingsContext context) DefaultValue = 20, MinValue = 1, MaxValue = 1000, IsInteger = true, } }, + { + // DirectML 最大同时渲染轨数:默认 1(串行安全),提高可让 CPU 端并行加速。 + KeyMaxConcurrentRenderings, + new SliderConfig + { + DisplayText = L.Tr("Max concurrent renderings"), + DefaultValue = 1, MinValue = 1, MaxValue = 8, IsInteger = true, + } + }, }; return new ObjectConfig { Properties = properties }; } @@ -273,6 +295,8 @@ public ObjectConfig GetSettingsConfig(IExtensionSettingsContext context) public void ApplySettings(PropertyObject settings) { mSettings = settings; + var maxConcurrent = mSettings.GetInt(KeyMaxConcurrentRenderings, 1); + UpdateRenderSemaphore(maxConcurrent); Rescan(); }