diff --git a/.gitignore b/.gitignore index 04d0096..e1b0701 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ obj/ # 打包产物 *.tlx /build/ +/tools/CheckModel diff --git a/DiffSingerDeclarations.cs b/DiffSingerDeclarations.cs index 5417bf9..53347ed 100644 --- a/DiffSingerDeclarations.cs +++ b/DiffSingerDeclarations.cs @@ -176,11 +176,11 @@ public static ObjectConfig BuildNoteConfig(VoicebankConfig config, INoteProperty DefaultOption = PropertyValue.Create(defaultValue), }; - static List ToOptions(IReadOnlyList values) + static List LanguageOptions(IReadOnlyList languages) { - var options = new List(values.Count); - foreach (var value in values) - options.Add(value); // 隐式转换:string → ComboBoxOption(值即显示文本) + var options = new List { new(PropertyValue.Create(string.Empty), "default") }; + foreach (var lang in languages) + options.Add(lang); return options; } diff --git a/DiffSingerForTuneLab.code-workspace b/DiffSingerForTuneLab.code-workspace index 1131316..1c88dca 100644 --- a/DiffSingerForTuneLab.code-workspace +++ b/DiffSingerForTuneLab.code-workspace @@ -7,6 +7,10 @@ { "name": "TuneLab (参考:SDK/docs/范例)", "path": "../TuneLab" + }, + { + "name": "OpenUtau-lunai (参考:OpenUtau.Core)", + "path": "../OpenUtau-lunai" } ], "settings": { diff --git a/DiffSingerModels.cs b/DiffSingerModels.cs index b6e6954..f29603f 100644 --- a/DiffSingerModels.cs +++ b/DiffSingerModels.cs @@ -121,6 +121,9 @@ public sealed class VoiceModels : IDisposable readonly Dictionary mPredictors = new(StringComparer.Ordinal); readonly object mPredictorLock = new(); + readonly object mAcousticLock = new(); + readonly object mVocoderLock = new(); + public InferenceSession Acoustic { get; } public InferenceSession Vocoder { get; } diff --git a/DiffSingerPhonemizer.cs b/DiffSingerPhonemizer.cs index 8fc5c9e..2e2cb80 100644 --- a/DiffSingerPhonemizer.cs +++ b/DiffSingerPhonemizer.cs @@ -68,6 +68,35 @@ public static List Phonemize( string[] symbols = GetSymbols(dur, note, noteLang[i], out pinned[i]); noteSymbolCount[i] = symbols.Length; + // 连音符:不产生新音素,只延展前音的时长(通过 group 自然吸收) + // 连音符:- 不产生边界(前组自然吸收),+ 拆前组末音素到独立组强制 dur 边界 + if (symbols.Length == 0 && (lyric == "-" || lyric == "+")) + { + if (lyric == "+") + { + // 把前一个非空组的最后一个音素拆分到 + 组(仅当 >1 音素,否则退化为空组边界) + string moved = "AP"; + bool splitted = false; + for (int gi = groups.Count - 1; gi >= 0; gi--) + { + if (groups[gi].Phonemes.Count > 1) + { + int lastIdx = groups[gi].Phonemes.Count - 1; + moved = groups[gi].Phonemes[lastIdx]; + groups[gi].Phonemes.RemoveAt(lastIdx); + splitted = true; + break; + } + } + var g = new Group(note.StartTime, note.Pitch); + if (splitted) g.Phonemes.Add(moved); + groups.Add(g); + } + // - 则完全不做任何事,前组自然吸收时长,不影响 dur + notePhIndex.Add(notePhIndex[^1]); + continue; + } + var wordGroups = ProcessWord(dur, note, symbols); groups[^1].Phonemes.AddRange(wordGroups[0].Phonemes); // 前置辅音并入前一组(侵入前一 note 尾) groups.AddRange(wordGroups.Skip(1)); // 韵核组(起点=note 起点) @@ -169,17 +198,56 @@ static List ProcessWord(DiffSingerPredictor dur, SynthesisNoteSnapshot no return wordGroups; } - // 取音素符号串:钉死=用 note.Phonemes 符号;否则 G2P。过滤到「类型已定义 且 dur 表可 tokenize」;空则 [SP]。 + // 取音素符号串:钉死/编辑器→用已有 phonemes 或 _phonemes 属性;连音符→空(slur 延展前音素);否则 G2P。 + // 空结果且非连音符→ [SP] 兜底。 static string[] GetSymbols(DiffSingerPredictor dur, SynthesisNoteSnapshot note, string lang, out bool pinned) { + string lyric = note.Lyric ?? string.Empty; + if (lyric == "-" || lyric == "+") + { + pinned = false; + return Array.Empty(); + } + + // 优先使用 _phonemes 属性(音素编辑器写入) + var phonemesProp = note.Properties.GetString("_phonemes", ""); + if (!string.IsNullOrEmpty(phonemesProp) && phonemesProp != "[]") + { + pinned = true; + return ParsePhonemesProperty(phonemesProp).Where(s => !string.IsNullOrEmpty(s) && dur.TryPhoneme(s, out _)).ToArray(); + } + + // 其次使用钉死音素 pinned = note.Phonemes.Count > 0; IEnumerable raw = pinned ? note.Phonemes.Select(p => p.Symbol) - : dur.G2P(note.Lyric ?? string.Empty, lang); + : dur.G2P(lyric, lang); var symbols = raw.Where(s => dur.IsKnownSymbol(s) && dur.TryPhoneme(s, out _)).ToArray(); return symbols.Length > 0 ? symbols : new[] { Pause }; } + // 从 JSON 字符串解析音素符号列表:[{"s":"ja/b","v":false},...] + static string[] ParsePhonemesProperty(string json) + { + var result = new List(); + if (string.IsNullOrEmpty(json) || json.Length < 2) return result.ToArray(); + try + { + int i = 0; + while (true) + { + int sIdx = json.IndexOf("\"s\":\"", i); + if (sIdx < 0) break; + sIdx += 5; + int eIdx = json.IndexOf('"', sIdx); + if (eIdx > sIdx) result.Add(json.Substring(sIdx, eIdx - sIdx)); + i = eIdx + 1; + } + } + catch { } + return result.ToArray(); + } + // OpenUtau stretch:source[from..from+count) 的帧时长按 ratio 缩放、终点对齐 endPos,返回各音素起点秒。 static IEnumerable Stretch(IReadOnlyList source, int from, int count, double ratio, double endPos) { @@ -235,8 +303,7 @@ static double[] RunDur(DiffSingerPredictor dur, long[] tokens, long[] langs, var spk = new float[nTokens * hidden]; for (int i = 0; i < nTokens; i++) Array.Copy(emb, 0, spk, i * hidden, hidden); - var durModel = dur.Model("dur"); - var durInputs = new List + using var durOut = dur.RunModel("dur", new List { NamedOnnxValue.CreateFromTensor("encoder_out", encDense), NamedOnnxValue.CreateFromTensor("x_masks", maskDense), diff --git a/DiffSingerPitch.cs b/DiffSingerPitch.cs index 8e59585..9a84a06 100644 --- a/DiffSingerPitch.cs +++ b/DiffSingerPitch.cs @@ -139,6 +139,7 @@ public static class DiffSingerPitch // 延音符("-"/"+"):无自身音素,沿用前一个 note 的 rest 状态(前为发声 ⇒ 本帧也发声、携自身 MIDI 滑过去)。 if (DiffSingerPhonemizer.IsSlur(note.Lyric)) { + // slur 延音符继承前音 rest 状态,确保 pitch 模型为其生成正确过渡音高 restList.Add(restList[^1]); } else diff --git a/DiffSingerPredictor.cs b/DiffSingerPredictor.cs index de883fc..466438b 100644 --- a/DiffSingerPredictor.cs +++ b/DiffSingerPredictor.cs @@ -27,6 +27,8 @@ public sealed class DiffSingerPredictor : IDisposable readonly Dictionary> mEntryCache = new(StringComparer.Ordinal); readonly Dictionary mSymbolTypes = new(StringComparer.Ordinal); // symbol → type(合并 dsdict) readonly object mLock = new(); + // 推理锁:DirectML EP 的 InferenceSession.Run() 非线程安全,串行化所有 Run 调用。 + readonly object mRunLock = new(); public InferenceSession Linguistic { get; } public int HiddenSize => mHidden; @@ -86,13 +88,21 @@ public int PhonemeToken(string symbol) : throw new InvalidOperationException($"音素 \"{symbol}\" 不在 {Path.GetFileName(mDir)} 的音素表中"); public long LangId(string lang) => mLanguages.TryGetValue(lang, out var id) ? id : 0; - // —— G2P:按语言查 dsdict-{lang}.yaml 词条(grapheme→带前缀音素),exact 后小写回退 —— + // —— G2P:优先查语言特定词典(dsdict-{lang}.yaml),避免默认底库(dsdict.yaml 以 zh 为主)污染;再试 replacements;最后才兜底查合并词典。 —— public string[] G2P(string lyric, string lang) { - var entries = GetEntries(lang); var key = lyric.Trim(); - if (entries.TryGetValue(key, out var phs)) return phs; - if (entries.TryGetValue(key.ToLowerInvariant(), out phs)) return phs; + // 1. 语言特定词典(不含默认底库) + var langEntries = GetLanguageSpecificEntries(lang); + if (langEntries.TryGetValue(key, out var phs)) return phs; + if (langEntries.TryGetValue(key.ToLowerInvariant(), out phs)) return phs; + // 2. 替换规则(en/ko 等无 entries 的语种) + var replaced = ApplyReplacements(lyric, lang); + if (replaced.Length > 0) return replaced; + // 3. 最后才查合并词典(含默认底库 dsdict.yaml,作为未知字素的最终兜底) + var allEntries = GetEntries(lang); + if (allEntries.TryGetValue(key, out phs)) return phs; + if (allEntries.TryGetValue(key.ToLowerInvariant(), out phs)) return phs; return Array.Empty(); } @@ -119,7 +129,78 @@ public float[] GetEmbedding(string acousticSpeaker) } } + // —— 替换规则(用于 EN/KO 等无 entries 仅 replacements 的语种)—— + readonly Dictionary> mReplacements = new(StringComparer.Ordinal); + + void LoadReplacements(string lang) + { + if (mReplacements.ContainsKey(lang)) return; + var list = new List<(string from, string to)>(); + foreach (var file in new[] { $"dsdict-{lang}.yaml", $"dsdict-zh-{lang}.yaml", "dsdict.yaml" }) + { + var path = Path.Combine(mDir, file); + if (!File.Exists(path)) continue; + try + { + var yaml = new DeserializerBuilder().Build(); + var doc = yaml.Deserialize>(File.ReadAllText(path)); + if (doc != null && doc.TryGetValue("replacements", out var reps) && reps is List repList) + { + foreach (var r in repList) + { + if (r is Dictionary repDict) + { + string? from = repDict.TryGetValue("from", out var fv) ? fv?.ToString() : null; + string? to = repDict.TryGetValue("to", out var tv) ? tv?.ToString() : null; + if (!string.IsNullOrEmpty(from) && !string.IsNullOrEmpty(to)) + list.Add((from, to)); + } + } + } + } + catch { } + } + mReplacements[lang] = list; + } + + // 用替换规则将歌词转为音素(按最长匹配优先) + public string[] ApplyReplacements(string lyric, string lang) + { + LoadReplacements(lang); + if (!mReplacements.TryGetValue(lang, out var reps) || reps.Count == 0) + return Array.Empty(); + + var repsSorted = reps.OrderByDescending(r => r.from.Length).ToList(); + var result = new List(); + string text = lyric.ToLowerInvariant(); + int pos = 0; + while (pos < text.Length) + { + bool matched = false; + foreach (var (from, to) in repsSorted) + { + if (pos + from.Length <= text.Length && text.Substring(pos, from.Length) == from) + { + result.Add(to); + pos += from.Length; + matched = true; + break; + } + } + if (!matched) + { + // 单个字符作为独立音素 + string ch = text[pos].ToString(); + result.Add(ch); + pos++; + } + } + return result.ToArray(); + } + // —— 词典加载 —— + // 策略:先加载 dsdict.yaml 作为默认底库,再叠加载入语种特定文件(后面覆盖前面)。 + // 若 entries 为空且 replacements 存在,留空返回(上层调用 ApplyReplacements)。 Dictionary GetEntries(string lang) { lock (mLock) @@ -128,7 +209,19 @@ Dictionary GetEntries(string lang) return cached; var map = new Dictionary(StringComparer.Ordinal); - foreach (var file in new[] { $"dsdict-{lang}.yaml", $"dsdict-zh-{lang}.yaml", "dsdict.yaml" }) + + // 1. 加载默认底库 dsdict.yaml(总是存在) + var defaultPath = Path.Combine(mDir, "dsdict.yaml"); + if (File.Exists(defaultPath)) + { + var root = DeserializeDsDict(defaultPath); + foreach (var e in root.entries) + if (!string.IsNullOrEmpty(e.grapheme)) + map[e.grapheme] = e.phonemes.ToArray(); + } + + // 2. 叠加载入语种特定文件(若存在则覆盖/补充) + foreach (var file in new[] { $"dsdict-{lang}.yaml", $"dsdict-zh-{lang}.yaml" }) { var path = Path.Combine(mDir, file); if (!File.Exists(path)) continue; @@ -136,13 +229,29 @@ Dictionary GetEntries(string lang) foreach (var e in root.entries) if (!string.IsNullOrEmpty(e.grapheme)) map[e.grapheme] = e.phonemes.ToArray(); - break; } + mEntryCache[lang] = map; return map; } } + // 仅加载语言特定词典(不含默认底库 dsdict.yaml),用于 G2P 的优先查表——避免 zh 底库污染其他语言的译音。 + Dictionary GetLanguageSpecificEntries(string lang) + { + var map = new Dictionary(StringComparer.Ordinal); + foreach (var file in new[] { $"dsdict-{lang}.yaml", $"dsdict-zh-{lang}.yaml" }) + { + var path = Path.Combine(mDir, file); + if (!File.Exists(path)) continue; + var root = DeserializeDsDict(path); + foreach (var e in root.entries) + if (!string.IsNullOrEmpty(e.grapheme)) + map[e.grapheme] = e.phonemes.ToArray(); + } + return map; + } + void LoadSymbolTypes(string dsdictPath) { if (!File.Exists(dsdictPath)) return; diff --git a/DiffSingerSynthesisSession.cs b/DiffSingerSynthesisSession.cs index 4f2f07e..c2a71b3 100644 --- a/DiffSingerSynthesisSession.cs +++ b/DiffSingerSynthesisSession.cs @@ -11,12 +11,13 @@ namespace DiffSingerForTuneLab; -// 一条 part 的合成会话。本阶段实现「声明面」:四个声明方法是选中声库能力集(VoicebankConfig)的纯函数—— -// 据 use_*_embed 暴露可编辑曲线、据 predict_* 暴露只读回显轨、据 speakers/languages 暴露 part/note 属性。 -// 调度与 6 级合成管线、产物发布为后续阶段:GetNextSegment 暂报「无待合成」,故宿主不驱动 SynthesizeNext, -// 会话呈现属性面板与轨但不产音——诚实的中间态。 -// 声明面(轨集合/属性面板)已上移到 DiffSingerVoiceEngine(经 DiffSingerDeclarations);本会话仅承载运行时: -// 调度、6 级推理管线、产物发布。轨 key 与 variance/gender/speed 规格复用 DiffSingerDeclarations(using static 引入)。 +// 一条 part 的合成会话。 +// 关键设计: +// · 区段式重渲染:修改某个音素时,仅以该音素为中心前后各扩展一个音素作为「重渲染区段」, +// 渲染后将新 mel 通过频谱过渡拼贴到原序列的 mel 谱上,避免整个序列被改变。 +// · pitch 锁定:用户修改 pitch 曲线后,自动音高预测(dspitch)被锁定不再重新生成, +// 仅缓存的预测作为 NaN 自由区的回退。除非用户选择 Retake 或 RedrawPitch。 +// · dur 忠于 UI:音素时间线由用户界面(note 时长 / 钉死音素)决定,首次渲染后只有显式请求才重新 phonemize。 public sealed class DiffSingerSynthesisSession : ISynthesisSession { readonly VoicebankConfig mConfig; @@ -31,7 +32,6 @@ public sealed class DiffSingerSynthesisSession : ISynthesisSession // 可编辑轨集合(构造期订阅其区间编辑)+ 回显轨集合(产物 SynthesizedParameters 按其 key 聚合)。 readonly OrderedMap mReadbackConfigs; - // —— 调度状态(数据线程;按 note 间隙分块,账本式托管失效与产物)—— readonly IDisposable mNotesSubscription; readonly List mSubscribedAutomations = new(); // 已订阅 RangeModified 的固定轨(variance/gender/speed,恒定,Dispose 退订) readonly Dictionary mMixSubscriptions = new(); // 已订阅的说话人混合轨(动态,key=mix:suffix,随 part 属性增减) @@ -39,6 +39,15 @@ public sealed class DiffSingerSynthesisSession : ISynthesisSession readonly List mPieces = new(); bool mNeedResegment; + // 缓存有效标志 + bool mHasValidCache; + + // 受影响的 time range(来自 OnRangeModified 或 note 修改),供 mel 拼贴使用 + double mAffectedStartTime = double.NaN; + double mAffectedEndTime = double.NaN; + + readonly SemaphoreSlim mRenderSemaphore; + public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext context, string voiceId, DiffSingerModelCache modelCache, int samplingSteps, bool tensorCache, int cacheMaxSizeMb) { @@ -53,7 +62,6 @@ public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext cont // 声明派生物据声库能力集算一次(与引擎声明同一套 DiffSingerDeclarations,单一真相源)。 mReadbackConfigs = BuildReadbackConfigs(config); - // 变更接线(handler 只做廉价标脏;重活延迟到 Committed 重分块)——见 §5.9。 mNotesSubscription = NotifiableExtensions.WhenAny(context.Notes, SubscribeNote, UnsubscribeNote); context.Notes.ItemAdded += OnNotesStructureChanged; context.Notes.ItemRemoved += OnNotesStructureChanged; @@ -79,25 +87,19 @@ public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext cont mNeedResegment = true; } - // 新建 note 的默认歌词:中性占位,待词典 G2P 阶段按声库词典择一有效词细化。 public string DefaultLyric => "a"; // —— 调度:窗内第一个脏块的纯值边界(peek 廉价、确定性)—— public SynthesisRange? GetNextSegment(double startTime, double endTime) => FindNextDirtyPiece(startTime, endTime) is { } p ? new SynthesisRange(p.StartTime, p.EndTime) : null; - // peek 与 commit 共用同一查找(确定性 + 同调度 tick 无编辑 ⇒ commit 重算得到 peek 报出的同一块)。 Piece? FindNextDirtyPiece(double startTime, double endTime) { - if (mNeedResegment) - Resegment(); - + if (mNeedResegment) Resegment(); foreach (var piece in mPieces) { - if (!piece.Dirty || piece.Failed || piece.Synthesizing) - continue; - if (piece.EndTime < startTime || piece.StartTime > endTime) - continue; + if (!piece.Dirty || piece.Failed || piece.Synthesizing) continue; + if (piece.EndTime < startTime || piece.StartTime > endTime) continue; return piece; } return null; @@ -108,7 +110,6 @@ public async Task SynthesizeNext(double startTime, double endTime, CancellationT if (FindNextDirtyPiece(startTime, endTime) is not { } piece) return; - // 同步前缀(数据线程):物化不可变快照(本块 note 全集 + 按 note 范围开窗)。 var snapshot = mContext.GetSnapshot(piece.Notes, piece.Notes[0].StartTime.Value, piece.Notes.Max(n => n.EndTime.Value)); piece.Dirty = false; piece.Synthesizing = true; @@ -116,6 +117,8 @@ public async Task SynthesizeNext(double startTime, double endTime, CancellationT StatusChanged?.Invoke(); var report = new Progress(p => { piece.Progress = p; StatusChanged?.Invoke(); }); + + await mRenderSemaphore.WaitAsync(cancellation); try { // offload:worker 只读冻结快照跑 ONNX(绝不碰活视图);模型懒加载经引擎级缓存(首载触发原生加载)。 @@ -130,13 +133,43 @@ public async Task SynthesizeNext(double startTime, double endTime, CancellationT if (rendered != null && mPieces.Contains(piece)) { int rate = rendered.SampleRate; + + // 用渲染后的音素边界精化过渡区间:使过渡在追溯音素内部进行 + double stitchStart = mAffectedStartTime; + double stitchEnd = mAffectedEndTime; + RefineStitchRange(rendered.Phonemes, + ref stitchStart, ref stitchEnd, piece.FrameSec); + + // 区段式音频拼贴:仅替换受影响的 time range,其余保持旧音频不变 + var stitchedAudio = StitchAudio(rendered.Audio, piece.CachedAudio, + rendered.StartTime, rate, stitchStart, stitchEnd); + + // 缓存旧音频供下次拼贴 + piece.CachedAudio = stitchedAudio; + piece.Segment?.Dispose(); - piece.Segment = mContext.CreateAudioSegment((long)(rendered.StartTime * rate), rendered.Audio.Length, rate); - piece.Segment.Write(0, rendered.Audio); + piece.Segment = mContext.CreateAudioSegment((long)(rendered.StartTime * rate), stitchedAudio.Length, rate); + piece.Segment.Write(0, stitchedAudio); piece.Segment.Commit(); piece.Phonemes = rendered.Phonemes; - piece.PitchReadback = rendered.PitchReadback; - piece.VarianceReadback = rendered.VarianceReadback; + + // 回显曲线也做区间拼贴:未修改区间的 pitch/tension 与缓存的旧曲线一致 + piece.PitchReadback = StitchPoints(rendered.PitchReadback, piece.CachedPitchReadback, + mAffectedStartTime, mAffectedEndTime); + piece.CachedPitchReadback = piece.PitchReadback; + + if (rendered.VarianceReadback.Count > 0) + { + var stitchedVar = new Dictionary>(); + foreach (var kvp in rendered.VarianceReadback) + { + piece.CachedVarianceReadback.TryGetValue(kvp.Key, out var oldVar); + stitchedVar[kvp.Key] = StitchPoints(kvp.Value, oldVar, + mAffectedStartTime, mAffectedEndTime); + } + piece.VarianceReadback = stitchedVar; + piece.CachedVarianceReadback = stitchedVar; + } } } catch (Exception ex) @@ -147,16 +180,73 @@ public async Task SynthesizeNext(double startTime, double endTime, CancellationT } finally { + mRenderSemaphore.Release(); piece.Synthesizing = false; StatusChanged?.Invoke(); } } - // 推理链(worker,只读冻结快照):忠实移植 OpenUtau phonemizer + renderer(见记忆 openutau-is-authority)。 - // phonemizer(dsdur) → 音素时间线;renderer 加 head/tail SP padding、tokens[SP..SP]、durations[8..8]、 - // f0(Hz over totalFrames)、variance 预测+用户 delta 合成喂声学(纯预测产回显轨)、spk by frame、depth/steps。 - // gender/velocity 走用户曲线 + OpenUtau GENC/VELC convert;pitch 自由区走 dspitch 预测轮廓、已画处用户值覆盖。 - RenderResult? Render(SynthesisSnapshot snapshot, IReadOnlyList origins, + // —— 音频拼贴 —— + // 将新音频的 affected 区间(前后各扩展 3 帧过渡)替换到旧音频中。 + // 若旧音频不存在(首次渲染)则直接返回新音频。 + static float[] StitchAudio(float[] newAudio, float[]? oldAudio, + double renderStartSec, int sampleRate, + double affectedStart, double affectedEnd) + { + if (oldAudio == null || oldAudio.Length == 0) + return newAudio; + if (double.IsNaN(affectedStart) || double.IsNaN(affectedEnd)) + return newAudio; // 无明确 affected 区间时全量替换 + if (oldAudio.Length != newAudio.Length) + return newAudio; // 长度不同无法拼贴 + + // 计算 affected 区间的采样点范围(前后扩展 3 帧 = 3 * hop_size 采样点) + int hop = 512; // DiffSinger 标准 hop_size + int fadeSamples = 3 * hop; + int startSample = Math.Max(0, (int)((affectedStart - renderStartSec) * sampleRate) - fadeSamples); + int endSample = Math.Min(newAudio.Length, (int)((affectedEnd - renderStartSec) * sampleRate) + fadeSamples); + + if (startSample >= endSample) + return newAudio; + + var result = new float[oldAudio.Length]; + Array.Copy(oldAudio, result, oldAudio.Length); + + // 拷贝 affected 区间的新音频 + int copyLen = endSample - startSample; + Array.Copy(newAudio, startSample, result, startSample, copyLen); + + // 前过渡区:第一帧(线性渐入) + int fadeLen = Math.Min(fadeSamples, copyLen / 2); + for (int i = 0; i < fadeLen; i++) + { + float t = (float)(i + 1) / (fadeLen + 1); + int idx = startSample + i; + if (idx >= 0 && idx < result.Length) + result[idx] = oldAudio[idx] * (1 - t) + newAudio[idx] * t; + } + + // 后过渡区:最后一帧(线性渐出) + for (int i = 0; i < fadeLen; i++) + { + float t = (float)(i + 1) / (fadeLen + 1); + int idx = endSample - 1 - i; + if (idx >= 0 && idx < result.Length) + result[idx] = oldAudio[idx] * (1 - t) + newAudio[idx] * t; + } + + return result; + } + + // —— 推理链(worker,只读冻结快照)—— + + // 推理结果 + sealed record RenderResultEx(float[] Audio, double StartTime, int SampleRate, + List Phonemes, List PitchReadback, + Dictionary> VarianceReadback, + int[] MelDims, float[] Mel); + + RenderResultEx? Render(SynthesisSnapshot snapshot, IReadOnlyList origins, Piece piece, IProgress? progress, CancellationToken cancellation) { var notes = snapshot.Notes; @@ -167,12 +257,29 @@ public async Task SynthesizeNext(double startTime, double endTime, CancellationT int hop = models.HopSize, sr = models.SampleRate, hidden = models.HiddenSize; double frameSec = (double)hop / sr; int head = DiffSingerFrames.HeadFrames; + int numMelBins = models.NumMelBins; + piece.FrameSec = frameSec; - string partLang = snapshot.PartProperties.GetString(KeyLanguage, mConfig.Languages.Count > 0 ? mConfig.Languages[0] : string.Empty); + string partLang = snapshot.PartProperties.GetString(KeyLanguage, string.Empty); string speaker = snapshot.PartProperties.GetString(KeySpeaker, mConfig.Speakers.Count > 0 ? mConfig.Speakers[0] : string.Empty); var noteLang = notes.Select(nt => nt.Properties.GetString(KeyLanguage, partLang)).ToArray(); - // —— Phonemizer:歌词 → 音素时间线(绝对秒、含前置辅音越界)—— + // 渲染模式(仅 Retake 用会话级,RedrawPitch 由 piece 级标记驱动) + bool isRetake = false; + if (!mRenderModeConsumed && mRenderMode == RenderMode.Retake) + { + isRetake = true; + mRenderModeConsumed = true; + mRenderMode = RenderMode.Normal; + } + bool isRedrawPitch = piece.RedrawPitchRequested; + piece.RedrawPitchRequested = false; + + bool pieceHasNoCache = piece.CachedPhones == null; + bool needFullPredict = isRetake || !mHasValidCache || pieceHasNoCache; + bool needPitchPredict = needFullPredict || isRedrawPitch; + + // —— 模型优先级:dsdur/dspitch 提级模型 —— var durPred = models.GetPredictor("dsdur"); var phones = durPred != null ? DiffSingerPhonemizer.Phonemize(durPred, notes, noteLang, speaker, hop, sr, mTensorCache) @@ -183,10 +290,10 @@ public async Task SynthesizeNext(double startTime, double endTime, CancellationT if (cancellation.IsCancellationRequested) return null; - // —— 帧布局:[head SP][...phones...][tail SP],累积取整 → durations(len=phones+2)—— + // —— 帧布局 —— var phoneDurSec = phones.Select(p => Math.Max(0, p.EndTime - p.StartTime)).ToArray(); var durations = DiffSingerFrames.PaddedPhoneFrames(phoneDurSec, frameSec); - int nTokens = durations.Length; // phones + 2 + int nTokens = durations.Length; int nFrames = durations.Sum(); double renderStart = phones[0].StartTime - head * frameSec; @@ -201,7 +308,7 @@ public async Task SynthesizeNext(double startTime, double endTime, CancellationT mixTracks.Add((suffix, mixAuto.Evaluator.Evaluate(frameTimes))); var speakerMix = DiffSingerSpeakerMix.Create(Suffix(speaker), mixTracks, nFrames); - // tokens/languages:声学表,前后加 SP。 + // tokens/languages var tokens = new long[nTokens]; var langs = new long[nTokens]; tokens[0] = AcousticToken(models, "SP"); @@ -212,16 +319,18 @@ public async Task SynthesizeNext(double startTime, double endTime, CancellationT langs[i + 1] = models.TryGetLanguage(PhonemeLang(phones[i].Symbol), out var lid) ? lid : 0; } - // 逐帧 note 音高回退(head→首 note,phone i→其 note,tail→末 note)。 + // 逐帧 note 音高回退(直接按 note 时间区间查找,不受 -/+ 延音符无音素的影响) var framePitch = new double[nFrames]; - int fi = 0; - for (int seg = 0; seg < nTokens; seg++) { - int ni = seg == 0 ? phones[0].NoteIndex - : seg == nTokens - 1 ? phones[^1].NoteIndex - : phones[seg - 1].NoteIndex; - int pitch = notes[ni].Pitch; - for (int k = 0; k < durations[seg]; k++) framePitch[fi++] = pitch; + int ni = 0; + for (int f = 0; f < nFrames; f++) + { + double t = frameTimes[f]; + // 找出包含当前帧的 note(延音符未产生音素,但 pitch 仍需跟随它的音高) + while (ni < notes.Count - 1 && t >= notes[ni + 1].StartTime) + ni++; + framePitch[f] = notes[ni].Pitch; + } } // —— dspitch 自然音高预测(纯从音符、retake 全 true、不吃用户音高):替代自由区的矩形 note-step 兜底 —— @@ -233,21 +342,36 @@ public async Task SynthesizeNext(double startTime, double endTime, CancellationT if (cancellation.IsCancellationRequested) return null; - // 逐帧 f0(Hz) + 半音曲线(variance 用):帧中心采样双通道音高,NaN 自由区回退预测轮廓(无则 note 音高)。 + // 逐帧 f0(Hz) + 半音曲线 var pitchCurve = snapshot.Pitch.Evaluator.Evaluate(frameTimes); var deviation = snapshot.PitchDeviation.Evaluator.Evaluate(frameTimes); var f0 = new float[nFrames]; var semis = new float[nFrames]; var pitchReadback = new List(nFrames); + + // —— 初始生成自动音高时排除前后 SP 音高数据(不画到用户界面)—— + // 判断哪些帧属于 head SP(第 0 个 token)和 tail SP(最后一个 token) + int spHeadFrames = durations[0]; // head SP 的帧数 + int spTailFrames = durations[^1]; // tail SP 的帧数 + for (int f = 0; f < nFrames; f++) { - double fallback = predictedPitch != null - ? (f < predictedPitch.Length ? predictedPitch[f] : predictedPitch[^1]) - : framePitch[f]; + double fallback; + if (predictedPitch != null) + fallback = f < predictedPitch.Length ? predictedPitch[f] : predictedPitch[^1]; + else + fallback = framePitch[f]; + double semitone = (double.IsNaN(pitchCurve[f]) ? fallback : pitchCurve[f]) + deviation[f]; semis[f] = (float)semitone; f0[f] = DiffSingerFrames.ToneToFreq(semitone); - pitchReadback.Add(new Point(frameTimes[f], semitone)); + + // 排除前后 SP 帧的音高回显(不在 pitchReadback 中添加) + bool isHeadOrTailSp = f < spHeadFrames || f >= nFrames - spTailFrames; + if (!isHeadOrTailSp) + { + pitchReadback.Add(new Point(frameTimes[f], semitone)); + } } progress?.Report(0.3); if (cancellation.IsCancellationRequested) @@ -261,7 +385,7 @@ public async Task SynthesizeNext(double startTime, double endTime, CancellationT if (cancellation.IsCancellationRequested) return null; - // —— 声学输入(按 InputMetadata 条件构造)—— + // —— 声学输入 —— var ac = models.Acoustic; var inputs = new List(); void AddL(string name, long[] data, int[] dims) @@ -274,9 +398,8 @@ void AddF(string name, float[] data, int[] dims) AddL("durations", durations.Select(x => (long)x).ToArray(), new[] { 1, nTokens }); AddF("f0", f0, new[] { 1, nFrames }); - // —— variance:预测 + 用户 delta 合成喂声学,同时产纯预测回显 —— - // 用户曲线按帧求值(连续轨:未编辑处=中性基线 → Delta 恒得纯预测;编辑处 → 叠加),clamp 到声学值域。 - // 回显(Use && Predict)= 纯预测值,不含用户编辑。 + // —— variance:使用(缓存的)预测 + 用户 delta 合成喂声学 —— + // 回显显示最终喂声学的值(预测+包络加权),包络修改时实参同步跟随 var varReadback = new Dictionary>(); foreach (var spec in Variances) { @@ -285,15 +408,15 @@ void AddF(string name, float[] data, int[] dims) ? auto.Evaluator.Evaluate(frameTimes) : null; + float[] combined = CombineVariance(spec, predicted, user, nFrames); if (ac.InputMetadata.ContainsKey(spec.Key)) - AddF(spec.Key, CombineVariance(spec, predicted, user, nFrames), new[] { 1, nFrames }); + AddF(spec.Key, combined, new[] { 1, nFrames }); if (spec.Use(mConfig) && spec.Predict(mConfig) && predicted != null) - varReadback[spec.Key] = BuildReadbackSegment(spec, predicted, frameTimes, nFrames); + varReadback[spec.Key] = BuildReadbackSegment(spec, combined, frameTimes, nFrames, spHeadFrames, spTailFrames); } - // —— gender / velocity:纯用户曲线(无方差器基线),按帧 convert 喂声学(忠实移植 OpenUtau GENC/VELC)—— - // 无轨 / NaN 自由区 → 中性 → convert 得中性 embed(gender 0、velocity 1);OpenUtau 不 clamp(UI 量程已界定)。 + // —— gender / velocity —— AddF("gender", BuildCurveInput(snapshot, KeyGender, GenderBaseline, GenderConvert(), frameTimes, nFrames), new[] { 1, nFrames }); AddF("velocity", BuildCurveInput(snapshot, KeySpeed, SpeedBaseline, SpeedConvert, frameTimes, nFrames), new[] { 1, nFrames }); @@ -322,16 +445,21 @@ void AddF(string name, float[] data, int[] dims) if (cancellation.IsCancellationRequested) return null; - // —— 声码器:mel (+ f0) → 波形 —— + // —— 声码器(使用原始 mel 形状创建张量)—— var voc = models.Vocoder; - var vInputs = new List { NamedOnnxValue.CreateFromTensor("mel", mel) }; + var melShape = new int[melDims.Length]; + Array.Copy(melDims, melShape, melDims.Length); + var vInputs = new List + { + NamedOnnxValue.CreateFromTensor("mel", new DenseTensor(finalMel, melShape)) + }; if (voc.InputMetadata.ContainsKey("f0")) vInputs.Add(NamedOnnxValue.CreateFromTensor("f0", new DenseTensor(f0, new[] { 1, nFrames }))); var wavOut = DiffSingerTensorCache.Run(voc, models.VocoderHash, vInputs, mTensorCache); var audio = wavOut.First(v => v.Name == "waveform").AsTensor().ToArray(); progress?.Report(1.0); - // —— 音素产物(绝对秒、韵核吸收伸缩)—— + // —— 音素产物 —— var phonemes = phones.Select(p => new SynthesizedPhoneme { Symbol = p.Symbol, @@ -341,7 +469,203 @@ void AddF(string name, float[] data, int[] dims) StretchWeight = p.IsVowel ? 1 : 0, }).ToList(); - return new RenderResult(audio, renderStart, sr, phonemes, pitchReadback, varReadback); + // 标记缓存有效(首次渲染成功后保持) + mHasValidCache = true; + + return new RenderResultEx(audio, renderStart, sr, phonemes, pitchReadback, varReadback, + melDims, finalMel); + } + + // —— 用音素边界精化过渡区间 —— + // 修改【A B C D】中的【C】→ 重渲染【B C D】,过渡在 B 和 D 的内部进行。 + // 前过渡:边界音素起始 + 3 帧;后过渡:边界音素结束 - 3 帧。 + static void RefineStitchRange(IReadOnlyList newPhonemes, + ref double stitchStart, ref double stitchEnd, double frameSec) + { + if (double.IsNaN(stitchStart) || double.IsNaN(stitchEnd)) + return; + double margin = 3 * frameSec; + + // 前边界:找 stitchStart 所在音素的「前一个」音素(即追溯音素),取其起始 + 3 帧 + int startIdx = -1; + for (int i = 0; i < newPhonemes.Count; i++) + { + if (newPhonemes[i].StartTime <= stitchStart && newPhonemes[i].EndTime >= stitchStart) + { startIdx = i; break; } + } + // 如果找到的音素就是第一个(无前一个),则用它本身 + if (startIdx >= 0) + { + int boundaryIdx = startIdx > 0 ? startIdx - 1 : startIdx; + // 但如果起始音素已经是被修改区域内的(stitchStart > 它的起始),则前一个才是边界 + // 如果起始音素的起始 > stitchStart,说明我们定位到了边界音素之后的第一个,那它的前一个是边界 + if (newPhonemes[startIdx].StartTime > stitchStart && startIdx > 0) + boundaryIdx = startIdx - 1; + stitchStart = newPhonemes[boundaryIdx].StartTime + margin; + // 确保不超出该音素范围 + if (stitchStart > newPhonemes[boundaryIdx].EndTime - margin) + stitchStart = newPhonemes[boundaryIdx].StartTime + margin * 0.5; + } + + // 后边界:找 stitchEnd 所在音素的「后一个」音素(追溯音素),取其结束 - 3 帧 + int endIdx = -1; + for (int i = newPhonemes.Count - 1; i >= 0; i--) + { + if (newPhonemes[i].StartTime <= stitchEnd && newPhonemes[i].EndTime >= stitchEnd) + { endIdx = i; break; } + } + if (endIdx >= 0) + { + int boundaryIdx = endIdx < newPhonemes.Count - 1 ? endIdx + 1 : endIdx; + if (newPhonemes[endIdx].EndTime < stitchEnd && endIdx < newPhonemes.Count - 1) + boundaryIdx = endIdx + 1; + stitchEnd = newPhonemes[boundaryIdx].EndTime - margin; + if (stitchEnd < newPhonemes[boundaryIdx].StartTime + margin) + stitchEnd = newPhonemes[boundaryIdx].EndTime - margin * 0.5; + } + + if (stitchStart >= stitchEnd) + { + stitchStart = double.NaN; + stitchEnd = double.NaN; + } + } + + // —— 区段式 mel 拼贴 —— + // 将新 mel 的「受影响的 time range」替换到旧 mel 中,边界做 3 帧交叉过渡。 + // 旧 mel 为 null 或区间无效时直接返回新 mel。 + static float[] StitchMel(float[] newMel, float[]? oldMel, double[] frameTimes, + int nFrames, int numMelBins, double affectedStart, double affectedEnd) + { + if (oldMel == null || oldMel.Length != newMel.Length) + return newMel; + if (double.IsNaN(affectedStart) || double.IsNaN(affectedEnd)) + return newMel; + if (affectedEnd <= frameTimes[0] || affectedStart >= frameTimes[^1]) + return newMel; // affected 区间完全在渲染范围外 + + const int fadeFrames = 3; + int totalFrames = nFrames; + + // 找 affected 区间对应的帧范围(前后各扩展 fadeFrames 帧) + int startFrame = totalFrames - 1; + int endFrame = 0; + for (int f = 0; f < totalFrames; f++) + { + if (frameTimes[f] >= affectedStart && frameTimes[f] <= affectedEnd) + { + if (f < startFrame) startFrame = f; + if (f > endFrame) endFrame = f; + } + } + startFrame = Math.Max(0, startFrame - fadeFrames); + endFrame = Math.Min(totalFrames - 1, endFrame + fadeFrames); + + if (startFrame >= endFrame) + return newMel; + + var result = new float[newMel.Length]; + Array.Copy(oldMel, result, newMel.Length); + + // 区间内直接替换为新 mel + for (int f = startFrame + fadeFrames; f <= endFrame - fadeFrames; f++) + for (int b = 0; b < numMelBins; b++) + result[f * numMelBins + b] = newMel[f * numMelBins + b]; + + // 前过渡区(fadeFrames 帧线性渐入) + for (int i = 0; i < fadeFrames && startFrame + i < totalFrames; i++) + { + float t = (float)(i + 1) / (fadeFrames + 1); + int f = startFrame + i; + for (int b = 0; b < numMelBins; b++) + { + int idx = f * numMelBins + b; + result[idx] = oldMel[idx] * (1 - t) + newMel[idx] * t; + } + } + + // 后过渡区(fadeFrames 帧线性渐出) + for (int i = 0; i < fadeFrames && endFrame - i >= 0; i++) + { + float t = (float)(i + 1) / (fadeFrames + 1); + int f = endFrame - i; + for (int b = 0; b < numMelBins; b++) + { + int idx = f * numMelBins + b; + result[idx] = oldMel[idx] * (1 - t) + newMel[idx] * t; + } + } + + return result; + } + + // —— Point 列表拼贴 —— + // 将受 affected 区间内的 Point 替换为新列表中的值,其余保持旧列表不变。 + // 旧列表为 null 或帧结构改变(点数不同)时直接返回新列表。 + static IReadOnlyList StitchPoints(IReadOnlyList newPoints, IReadOnlyList? oldPoints, + double affectedStart, double affectedEnd) + { + if (oldPoints == null || oldPoints.Count == 0) + return newPoints; + if (double.IsNaN(affectedStart) || double.IsNaN(affectedEnd)) + return newPoints; + if (newPoints.Count == 0) + return oldPoints; + if (newPoints.Count != oldPoints.Count) + return newPoints; + if (Math.Abs(newPoints[0].X - oldPoints[0].X) > 0.001) + return newPoints; + + int oldStart = 0, oldEnd = oldPoints.Count; + while (oldStart < oldPoints.Count && oldPoints[oldStart].X < affectedStart) oldStart++; + while (oldEnd > 0 && oldPoints[oldEnd - 1].X > affectedEnd) oldEnd--; + + if (oldStart >= oldEnd) + return newPoints; + + int newStart = 0, newEnd = newPoints.Count; + while (newStart < newPoints.Count && newPoints[newStart].X < affectedStart) newStart++; + while (newEnd > 0 && newPoints[newEnd - 1].X > affectedEnd) newEnd--; + + if (newStart >= newEnd) + return oldPoints; + + // 拼接并在边界做 3 点线性过渡,避免 pitch 断层 + const int fadeCount = 3; + var result = new List(oldStart + (newEnd - newStart) + (oldPoints.Count - oldEnd)); + + // 旧区间(前段) + for (int i = 0; i < oldStart; i++) result.Add(oldPoints[i]); + + // 前过渡:fadeCount 个点的旧→新渐变 + for (int i = 0; i < fadeCount && newStart + i < newEnd; i++) + { + float t = (float)(i + 1) / (fadeCount + 1); + double x = newPoints[newStart + i].X; + double y = oldPoints[oldStart + i].Y * (1 - t) + newPoints[newStart + i].Y * t; + result.Add(new Point(x, y)); + } + + // 中间段:全量新值(仅当区间足够长时) + for (int i = newStart + fadeCount; i < newEnd - fadeCount; i++) + result.Add(newPoints[i]); + + // 后过渡:fadeCount 个点的新→旧渐变(按时间递增顺序) + int backStart = Math.Max(newStart, newEnd - fadeCount); + for (int i = backStart; i < newEnd; i++) + { + float t = (float)(newEnd - i) / (fadeCount + 1); + int oi = oldEnd - (newEnd - i); + if (oi < 0 || oi >= oldPoints.Count) continue; + double x = newPoints[i].X; + double y = oldPoints[oi].Y * (1 - t) + newPoints[i].Y * t; + result.Add(new Point(x, y)); + } + + // 旧区间(后段) + for (int i = oldEnd; i < oldPoints.Count; i++) result.Add(oldPoints[i]); + + return result; } // 无 dur 预测器兜底:每 note 一元音、占满 note 时长(无对齐/无 head/tail 之外的处理)。 @@ -408,13 +732,29 @@ Func GenderConvert() // VELC convert(OpenUtau DiffSingerRenderer):对数标度,100 = 原速,每 +100 速度 ×2。 static double SpeedConvert(double x) => Math.Pow(2, (x - 100) / 100); - // 回显段:纯预测值(不含用户编辑),clamp 到声学值域,逐帧 (全局秒, 值)。 - static List BuildReadbackSegment(VarianceSpec spec, float[] predicted, double[] frameTimes, int n) + // 回显段:最终值(含用户包络),clamp 到声学值域,逐帧 (全局秒, 值)。 + // 整个 SP 段做透明度过渡:外边界 → 声学最小值(背景=不可见),内边界 → 100%(实际值)。 + // 注意:pitch 回显直接排除 SP 帧(不画),此处用渐变使曲线从背景平滑浮现/消失。 + static List BuildReadbackSegment(VarianceSpec spec, float[] finalValues, double[] frameTimes, int n, + int headSpFrames = 0, int tailSpFrames = 0) { + float fadeTarget = (float)spec.AcousticMin; // 声学最小值 = 曲线不可见的背景值 var points = new List(n); for (int f = 0; f < n; f++) { - float x = f < predicted.Length ? predicted[f] : predicted[^1]; + float x = f < finalValues.Length ? finalValues[f] : finalValues[^1]; + // 前 SP:整个 SP 段从外边界(f=0)到内边界线性渐入(0% → 100%) + if (f < headSpFrames) + { + float t = (float)(f + 1) / headSpFrames; + x = x * t + fadeTarget * (1 - t); + } + // 后 SP:整个 SP 段从内边界到外边界线性渐出(100% → 0%) + if (f >= n - tailSpFrames) + { + float t = (float)(n - f) / tailSpFrames; + x = x * t + fadeTarget * (1 - t); + } points.Add(new Point(frameTimes[f], Math.Clamp(x, spec.AcousticMin, spec.AcousticMax))); } return points; @@ -433,7 +773,9 @@ static string PickVowelSymbol(VoiceModels models, string lang) public IReadOnlyList> SynthesizedPitch => mPieces.Where(p => p.PitchReadback.Count > 0).Select(p => p.PitchReadback).ToList(); - // 回显产物(数据线程发布、可跨线程读):按声明的回显轨 key 聚合各 piece 的纯预测段(每 piece 一段、段间断开)。 + // 回显产物(数据线程发布、可跨线程读):按声明的回显轨 key 聚合各 piece 的预测段。 + // 每 piece 整段作为一个 segment,用多 GradientStop 的 LinearGradientBrush 实现像素级平滑透明度。 + // 透明度轮廓:head SP 0%→25%, body 25%, tail SP 25%→0%,无段间边界。 public IReadOnlyMap SynthesizedParameters { get @@ -442,6 +784,7 @@ public IReadOnlyMap SynthesizedParameters foreach (var kvp in mReadbackConfigs) { var segments = new List>(); + var stopSets = new List>(); foreach (var piece in mPieces) if (piece.VarianceReadback.TryGetValue(kvp.Key.Id, out var segment) && segment.Count > 0) segments.Add(segment); @@ -551,40 +894,89 @@ void Resegment() StatusChanged?.Invoke(); } + sealed record NoteHandlers(Action OnDur, Action OnPitch, Action OnLyric, Action OnProps); + void SubscribeNote(ILiveNote note) { - void Handler() + Action onDur = () => + { + SetAffectedRange(note.StartTime.Value, note.EndTime.Value); + MarkPieceDirty(note, clearPhones: true, clearPitch: false, clearVariance: true); + mNeedResegment = true; + }; + Action onPitch = () => { - foreach (var piece in mPieces) - if (piece.Notes.Contains(note)) { piece.Dirty = true; piece.Failed = false; } + SetAffectedRange(note.StartTime.Value, note.EndTime.Value); + MarkPieceDirty(note, clearPhones: false, clearPitch: false, clearVariance: false); + }; + Action onLyric = () => + { + SetAffectedRange(note.StartTime.Value, note.EndTime.Value); + MarkPieceDirty(note, clearPhones: true, clearPitch: true, clearVariance: true); + mNeedResegment = true; + }; + Action onProps = () => + { + SetAffectedRange(note.StartTime.Value, note.EndTime.Value); + MarkPieceDirty(note, clearPhones: true, clearPitch: false, clearVariance: true); mNeedResegment = true; + }; + + note.StartTime.Modified += onDur; + note.EndTime.Modified += onDur; + note.Phonemes.Modified += onDur; + note.Pitch.Modified += onPitch; + note.Lyric.Modified += onLyric; + note.Properties.Modified += onProps; + + mNoteHandlers[note] = new NoteHandlers(onDur, onPitch, onLyric, onProps); + } + + void MarkPieceDirty(ILiveNote note, bool clearPhones, bool clearPitch, bool clearVariance) + { + foreach (var piece in mPieces) + { + if (!piece.Notes.Contains(note)) continue; + piece.Dirty = true; piece.Failed = false; + if (clearPhones) piece.CachedPhones = null; + if (clearPitch) piece.CachedPitchPrediction = null; + if (clearVariance) piece.CachedVarianceCurves = default; + return; } - mNoteHandlers[note] = Handler; - note.StartTime.Modified += Handler; - note.EndTime.Modified += Handler; - note.Pitch.Modified += Handler; - note.Lyric.Modified += Handler; - note.Phonemes.Modified += Handler; - note.Properties.Modified += Handler; + } + + void SetAffectedRange(double start, double end) + { + double pad = 0.1; + mAffectedStartTime = start - pad; + mAffectedEndTime = end + pad; } void UnsubscribeNote(ILiveNote note) { - if (!mNoteHandlers.Remove(note, out var handler)) - return; - note.StartTime.Modified -= handler; - note.EndTime.Modified -= handler; - note.Pitch.Modified -= handler; - note.Lyric.Modified -= handler; - note.Phonemes.Modified -= handler; - note.Properties.Modified -= handler; + if (mNoteHandlers.Remove(note, out var h)) + { + note.StartTime.Modified -= h.OnDur; + note.EndTime.Modified -= h.OnDur; + note.Phonemes.Modified -= h.OnDur; + note.Pitch.Modified -= h.OnPitch; + note.Lyric.Modified -= h.OnLyric; + note.Properties.Modified -= h.OnProps; + } } - void OnNotesStructureChanged(ILiveNote note) => mNeedResegment = true; + void OnNotesStructureChanged(ILiveNote note) { mNeedResegment = true; } void MarkAllDirtyAndResegment() { - foreach (var piece in mPieces) { piece.Dirty = true; piece.Failed = false; } + foreach (var piece in mPieces) + { + piece.Dirty = true; piece.Failed = false; + // 段落属性(语言等)变更 → 清除音素/variance 缓存,强制使用新的 G2P + piece.CachedPhones = null; + piece.CachedVarianceCurves = default; + // 保留 pitch 缓存 + } mNeedResegment = true; // part 属性变更可能增删了说话人混合轨:补订新出现的、退订已消失的,使后续画曲线(RangeModified)能标脏。 // 时序安全:宿主 OnPartPropertiesModified(part 构造期订阅)先于本会话 handler(会话构造期订阅)执行, @@ -621,6 +1013,9 @@ void OnCommitted() void OnRangeModified(double startTime, double endTime) { + // 记录受影响的 time range(自动化曲线修改) + SetAffectedRange(startTime, endTime); + foreach (var piece in mPieces) { if (piece.EndTime < startTime || piece.StartTime > endTime) @@ -631,10 +1026,6 @@ void OnRangeModified(double startTime, double endTime) StatusChanged?.Invoke(); } - sealed record RenderResult(float[] Audio, double StartTime, int SampleRate, - List Phonemes, List PitchReadback, - Dictionary> VarianceReadback); - sealed class Piece { public required IReadOnlyList Notes; @@ -649,5 +1040,21 @@ sealed class Piece public IReadOnlyList Phonemes = []; public IReadOnlyList PitchReadback = []; public IReadOnlyDictionary> VarianceReadback = new Dictionary>(); + // 回显曲线缓存(区间拼贴用) + public IReadOnlyList? CachedPitchReadback; + public IReadOnlyDictionary> CachedVarianceReadback = new Dictionary>(); + + // 缓存:note 代数(判断缓存有效性) + // 模型预测缓存(增量渲染时复用) + public float[]? CachedPitchPrediction; + public VarianceCurves CachedVarianceCurves; + public List? CachedPhones; + // mel 缓存(用于交叉过渡) + public float[]? CachedMel; + public int[]? CachedMelDims; + public float[]? CachedAudio; + public bool RedrawPitchRequested; + // 帧时长(秒),用于过渡区间计算 + public double FrameSec; } } diff --git a/DiffSingerVariance.cs b/DiffSingerVariance.cs index 5843255..0c74a32 100644 --- a/DiffSingerVariance.cs +++ b/DiffSingerVariance.cs @@ -42,14 +42,21 @@ public static VarianceCurves Predict( var langs = symbols.Select(s => v.LangId(PhonemeLanguage(s))).Prepend(0L).Append(0L).ToArray(); var isVowel = symbols.Select(v.IsVowel).ToArray(); - // —— linguistic(词模式)—— - var (wordDiv, wordDur) = DiffSingerFrames.PaddedWordDivAndDur(isVowel, phDur); + // —— linguistic(据编码器实际输入选择词模式或音素模式)—— var lingInputs = new List { NvL("tokens", tokens, nTokens), - NvL("word_div", wordDiv, wordDiv.Length), - NvL("word_dur", wordDur, wordDur.Length), }; + if (v.LinguisticUsesWordBoundary) + { + var (wordDiv, wordDur) = DiffSingerFrames.PaddedWordDivAndDur(isVowel, phDur); + lingInputs.Add(NvL("word_div", wordDiv, wordDiv.Length)); + lingInputs.Add(NvL("word_dur", wordDur, wordDur.Length)); + } + else + { + lingInputs.Add(NvL("ph_dur", phDur.Select(x => (long)x).ToArray(), nTokens)); + } if (v.Linguistic.InputMetadata.ContainsKey("languages")) lingInputs.Add(NvL("languages", langs, nTokens)); var lingOut = DiffSingerTensorCache.Run(v.Linguistic, v.LinguisticHash, lingInputs, tensorCache); diff --git a/DiffSingerVoiceEngine.cs b/DiffSingerVoiceEngine.cs index 530f77a..8e0271f 100644 --- a/DiffSingerVoiceEngine.cs +++ b/DiffSingerVoiceEngine.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.IO; +using System.Linq; using TuneLab.Foundation; using TuneLab.SDK; @@ -34,6 +35,56 @@ public void Destroy() mModelCache = null; } + // —— 并发渲染限流(DirectML 多轨并发需要限制同时渲染的轨数)—— + SemaphoreSlim mRenderSemaphore = new(1, 1); + + internal void UpdateRenderSemaphore(int maxConcurrent) + { + maxConcurrent = Math.Max(1, maxConcurrent); + var old = Interlocked.Exchange(ref mRenderSemaphore, new SemaphoreSlim(maxConcurrent, maxConcurrent)); + old.Dispose(); + } + + internal SemaphoreSlim RenderSemaphore => mRenderSemaphore; + + // —— 会话注册表(引擎级,跨会话共享)—— + readonly List> mSessions = new(); + + internal void RegisterSession(DiffSingerSynthesisSession session) + { + lock (mSessions) + { + // 清理已回收的弱引用 + mSessions.RemoveAll(wr => !wr.TryGetTarget(out _)); + mSessions.Add(new WeakReference(session)); + } + } + + internal void UnregisterSession(DiffSingerSynthesisSession session) + { + lock (mSessions) + { + mSessions.RemoveAll(wr => !wr.TryGetTarget(out var s) || s == session); + } + } + + internal DiffSingerSynthesisSession? FindSessionByVoiceId(string voiceId) + { + lock (mSessions) + { + // 清理已回收 + mSessions.RemoveAll(wr => !wr.TryGetTarget(out _)); + // 取最后一个匹配 voiceId 的会话(通常只有一个) + DiffSingerSynthesisSession? found = null; + foreach (var wr in mSessions) + { + if (wr.TryGetTarget(out var s) && s.VoiceId == voiceId) + found = s; + } + return found; + } + } + public ISynthesisSession CreateSession(string voiceId, ISynthesisContext context) { if (!mState.Banks.ContainsKey(voiceId)) @@ -60,7 +111,94 @@ public ObjectConfig GetPartPropertyConfig(IPartPropertyContext context) => ConfigFor(context.VoiceId) is { } c ? DiffSingerDeclarations.BuildPartConfig(c, context) : EmptyConfig; public ObjectConfig GetNotePropertyConfig(INotePropertyContext context) - => ConfigFor(context.VoiceId) is { } c ? DiffSingerDeclarations.BuildNoteConfig(c, context) : EmptyConfig; + { + if (ConfigFor(context.VoiceId) is not { } c) + return EmptyConfig; + + var baseConfig = DiffSingerDeclarations.BuildNoteConfig(c, context); + var voiceId = context.VoiceId; + double selStart = context.SelectionStartTime; + double selEnd = context.SelectionEndTime; + bool hasSelection = !double.IsNaN(selStart) && !double.IsNaN(selEnd); + + var props = new OrderedMap(); + foreach (var kvp in baseConfig.Properties) + props.Add(kvp.Key, kvp.Value); + + // —— 音素编辑器 —— + var phonemesJson = context.NoteProperties.GetString("_phonemes", "[]"); + var phonemeEntries = ParsePhonemeJson(phonemesJson); + if (phonemeEntries.Count == 0 && c.Languages.Count > 0 && !double.IsNaN(selStart)) + { + // 未自定义音素时,尝试从会话获取默认 G2P 音素 + double midTime = (selStart + selEnd) / 2; + var session = FindSessionByVoiceId(voiceId); + if (session != null) + { + // 从上下文快照取段落语言(保证段落语言变更后能传新值给 G2P) + string snapPartLang = context.PartProperties.GetString(DiffSingerDeclarations.KeyLanguage, string.Empty); + phonemeEntries = session.GetDefaultPhonemesForNote(midTime, snapPartLang); + } + } + if (c.Languages.Count > 0) + { + props.Add("_phoneme_editor", new PhonemeEditorConfig + { + DisplayText = L.Tr("Phonemes"), + Phonemes = phonemeEntries, + AvailableLanguages = c.Languages, + LanguageDataKey = DiffSingerDeclarations.KeyLanguage, + CanDeleteConsonant = phonemeEntries.Count(e => !e.IsVowel) > 1, + CanDeleteVowel = phonemeEntries.Count(e => e.IsVowel) > 1, + OnChanged = _ => + { + // 音素被编辑 → 触发区间重渲染 + var s = FindSessionByVoiceId(voiceId); + if (s != null) + { + if (hasSelection) + s.RequestRetakeScoped(selStart, selEnd); + else + s.RequestRetake(); + } + }, + }); + } + + // 追加 Retake 和 Redraw Pitch 按钮(使用选中音符的区间而非全曲) + props.Add("_retake", new ButtonConfig + { + DisplayText = L.Tr("Retake"), + Action = () => + { + var session = FindSessionByVoiceId(voiceId); + if (session != null) + { + if (hasSelection) + session.RequestRetakeScoped(selStart, selEnd); + else + session.RequestRetake(); + } + }, + }); + props.Add("_redraw_pitch", new ButtonConfig + { + DisplayText = L.Tr("Redraw Pitch"), + Action = () => + { + var session = FindSessionByVoiceId(voiceId); + if (session != null) + { + if (hasSelection) + session.RequestRedrawPitchScoped(selStart, selEnd); + else + session.RequestRedrawPitch(); + } + }, + }); + + return new ObjectConfig { Properties = props }; + } // 声库能力集按 voiceId 缓存(声明每次 commit 都调,避免重复解析 dsconfig);config 随声库不可变,扫描重建时清空。 VoicebankConfig? ConfigFor(string voiceId) @@ -137,6 +275,8 @@ public ObjectConfig GetSettingsConfig(IExtensionSettingsContext context) public void ApplySettings(PropertyObject settings) { mSettings = settings; + var maxConcurrent = mSettings.GetInt(KeyMaxConcurrentRenderings, 1); + UpdateRenderSemaphore(maxConcurrent); Rescan(); } @@ -178,6 +318,38 @@ static void EnsureDefaultDirectory() catch { } } + // 解析音素 JSON:[{"s":"ja/b","v":false},...] + static List ParsePhonemeJson(string json) + { + var result = new List(); + if (string.IsNullOrEmpty(json) || json.Length < 2) return result; + try + { + // 简单的手动 JSON 解析(避免加依赖) + int i = json.IndexOf('['); + if (i < 0) return result; + while (true) + { + i = json.IndexOf('{', i); + if (i < 0) break; + string entry = json.Substring(i, Math.Min(json.Length - i, json.IndexOf('}', i) - i + 1)); + var phoneme = new PhonemeEntry(); + int sIdx = entry.IndexOf("\"s\":\""); + if (sIdx >= 0) + { + sIdx += 5; + int eIdx = entry.IndexOf('"', sIdx); + if (eIdx > sIdx) phoneme.Symbol = entry.Substring(sIdx, eIdx - sIdx); + } + phoneme.IsVowel = entry.Contains("\"v\":true") || entry.Contains("\"v\": true"); + result.Add(phoneme); + i = json.IndexOf('}', i) + 1; + } + } + catch { } + return result; + } + // 不可变扫描结果,整体替换发布:get 侧读引用、扫描侧建好新实例后一次性换上,无需锁。 sealed record State( IReadOnlyOrderedMap Infos,