From 2831b91ddba7cbfa4401e922b7bf016b27e21ab2 Mon Sep 17 00:00:00 2001
From: tachengP <2638591622@qq.com>
Date: Fri, 19 Jun 2026 18:44:19 +0800
Subject: [PATCH 1/3] =?UTF-8?q?=E9=87=8D=E6=B8=B2=E6=9F=93=E6=9C=BA?=
 =?UTF-8?q?=E5=88=B6=E6=9B=B4=E6=96=B0=EF=BC=8Cdsdur=E3=80=81dspitch?=
 =?UTF-8?q?=E6=8F=90=E6=9D=83?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 DiffSingerDeclarations.cs     |  14 +-
 DiffSingerPredictor.cs        |  17 +-
 DiffSingerSynthesisSession.cs | 610 ++++++++++++++++++++++++++++------
 DiffSingerVariance.cs         |  15 +-
 DiffSingerVoiceEngine.cs      | 119 ++++++-
 5 files changed, 664 insertions(+), 111 deletions(-)
diff --git a/DiffSingerDeclarations.cs b/DiffSingerDeclarations.cs
index 51c427d..04dbbac 100644
--- a/DiffSingerDeclarations.cs
+++ b/DiffSingerDeclarations.cs
@@ -91,7 +91,7 @@ public static ObjectConfig BuildPartConfig(VoicebankConfig config)
             });
 
         if (HasLanguageChoice(config))
-            properties.Add(KeyLanguage, LanguageCombo(config, config.Languages[0]));
+            properties.Add(KeyLanguage, LanguageCombo(config, string.Empty));
 
         return new ObjectConfig { Properties = properties };
     }
@@ -102,7 +102,7 @@ public static ObjectConfig BuildNoteConfig(VoicebankConfig config, INoteProperty
         var properties = new OrderedMap<string, IControllerConfig>();
         if (HasLanguageChoice(config))
         {
-            var partDefault = context.PartProperties.GetString(KeyLanguage, config.Languages[0]);
+            var partDefault = context.PartProperties.GetString(KeyLanguage, string.Empty);
             properties.Add(KeyLanguage, LanguageCombo(config, partDefault));
         }
         return new ObjectConfig { Properties = properties };
@@ -113,15 +113,15 @@ public static ObjectConfig BuildNoteConfig(VoicebankConfig config, INoteProperty
     static ComboBoxConfig LanguageCombo(VoicebankConfig config, string defaultValue) => new()
     {
         DisplayText = L.Tr("Language"),
-        Options = ToOptions(config.Languages),
+        Options = LanguageOptions(config.Languages),
         DefaultOption = PropertyValue.Create(defaultValue),
     };
 
-    static List<ComboBoxOption> ToOptions(IReadOnlyList<string> values)
+    static List<ComboBoxOption> LanguageOptions(IReadOnlyList<string> languages)
     {
-        var options = new List<ComboBoxOption>(values.Count);
-        foreach (var value in values)
-            options.Add(value);   // 隐式转换：string → ComboBoxOption（值即显示文本）
+        var options = new List<ComboBoxOption> { new(PropertyValue.Create(string.Empty), "default") };
+        foreach (var lang in languages)
+            options.Add(lang);
         return options;
     }
 
diff --git a/DiffSingerPredictor.cs b/DiffSingerPredictor.cs
index ac469a9..706fec0 100644
--- a/DiffSingerPredictor.cs
+++ b/DiffSingerPredictor.cs
@@ -109,6 +109,7 @@ public float[] GetEmbedding(string acousticSpeaker)
     }
 
     // —— 词典加载 ——
+    // 策略：先加载 dsdict.yaml 作为默认底库，再叠加载入语种特定文件（后面覆盖前面）。
     Dictionary<string, string[]> GetEntries(string lang)
     {
         lock (mLock)
@@ -117,7 +118,19 @@ Dictionary<string, string[]> GetEntries(string lang)
                 return cached;
 
             var map = new Dictionary<string, string[]>(StringComparer.Ordinal);
-            foreach (var file in new[] { $"dsdict-{lang}.yaml", $"dsdict-zh-{lang}.yaml", "dsdict.yaml" })
+
+            // 1. Load the base dictionary dsdict.yaml when it is present (a missing file is simply skipped).
+            var defaultPath = Path.Combine(mDir, "dsdict.yaml");
+            if (File.Exists(defaultPath))
+            {
+                var root = DeserializeDsDict(defaultPath);
+                foreach (var e in root.entries)
+                    if (!string.IsNullOrEmpty(e.grapheme))
+                        map[e.grapheme] = e.phonemes.ToArray();
+            }
+
+            // 2. 叠加载入语种特定文件（若存在则覆盖/补充）
+            foreach (var file in new[] { $"dsdict-{lang}.yaml", $"dsdict-zh-{lang}.yaml" })
             {
                 var path = Path.Combine(mDir, file);
                 if (!File.Exists(path)) continue;
@@ -125,8 +138,8 @@ Dictionary<string, string[]> GetEntries(string lang)
                 foreach (var e in root.entries)
                     if (!string.IsNullOrEmpty(e.grapheme))
                         map[e.grapheme] = e.phonemes.ToArray();
-                break;
             }
+
             mEntryCache[lang] = map;
             return map;
         }
diff --git a/DiffSingerSynthesisSession.cs b/DiffSingerSynthesisSession.cs
index 82d7cb5..4cfe592 100644
--- a/DiffSingerSynthesisSession.cs
+++ b/DiffSingerSynthesisSession.cs
@@ -11,12 +11,13 @@
 
 namespace DiffSingerForTuneLab;
 
-// 一条 part 的合成会话。本阶段实现「声明面」：四个声明方法是选中声库能力集（VoicebankConfig）的纯函数——
-// 据 use_*_embed 暴露可编辑曲线、据 predict_* 暴露只读回显轨、据 speakers/languages 暴露 part/note 属性。
-// 调度与 6 级合成管线、产物发布为后续阶段：GetNextSegment 暂报「无待合成」，故宿主不驱动 SynthesizeNext，
-// 会话呈现属性面板与轨但不产音——诚实的中间态。
-// 声明面（轨集合/属性面板）已上移到 DiffSingerVoiceEngine（经 DiffSingerDeclarations）；本会话仅承载运行时：
-// 调度、6 级推理管线、产物发布。轨 key 与 variance/gender/speed 规格复用 DiffSingerDeclarations（using static 引入）。
+// 一条 part 的合成会话。
+// 关键设计：
+//   · 区段式重渲染：修改某个音素时，仅以该音素为中心前后各扩展一个音素作为「重渲染区段」，
+//     渲染后将新 mel 通过频谱过渡拼贴到原序列的 mel 谱上，避免整个序列被改变。
+//   · pitch 锁定：用户修改 pitch 曲线后，自动音高预测（dspitch）被锁定不再重新生成，
+//     仅缓存的预测作为 NaN 自由区的回退。除非用户选择 Retake 或 RedrawPitch。
+//   · dur 忠于 UI：音素时间线由用户界面（note 时长 / 钉死音素）决定，首次渲染后只有显式请求才重新 phonemize。
 public sealed class DiffSingerSynthesisSession : ISynthesisSession
 {
     readonly VoicebankConfig mConfig;
@@ -25,18 +26,28 @@ public sealed class DiffSingerSynthesisSession : ISynthesisSession
     readonly DiffSingerModelCache mModelCache;
     readonly int mSamplingSteps;
 
-    // 运行时复用的声明派生物（每会话固定，构造期据声库能力集算一次）：
-    //   可编辑轨集合（构造期订阅其区间编辑）+ 回显轨集合（产物 SynthesizedParameters 按其 key 聚合）。
+    internal string VoiceId => mVoiceId;
+
+    enum RenderMode { Normal, Retake }
+    volatile RenderMode mRenderMode;
+    volatile bool mRenderModeConsumed;
+
     readonly OrderedMap<string, AutomationConfig> mAutomationConfigs;
     readonly OrderedMap<string, AutomationConfig> mReadbackConfigs;
 
-    // —— 调度状态（数据线程；按 note 间隙分块，账本式托管失效与产物）——
     readonly IDisposable mNotesSubscription;
-    readonly List<ILiveAutomation> mSubscribedAutomations = new();   // 已订阅 RangeModified 的可编辑轨（Dispose 退订）
-    readonly Dictionary<ILiveNote, Action> mNoteHandlers = new();
+    readonly List<ILiveAutomation> mSubscribedAutomations = new();
+    readonly Dictionary<ILiveNote, NoteHandlers> mNoteHandlers = new();
     readonly List<Piece> mPieces = new();
     bool mNeedResegment;
 
+    // 缓存有效标志
+    bool mHasValidCache;
+
+    // 受影响的 time range（来自 OnRangeModified 或 note 修改），供 mel 拼贴使用
+    double mAffectedStartTime = double.NaN;
+    double mAffectedEndTime = double.NaN;
+
     public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext context,
         string voiceId, DiffSingerModelCache modelCache, int samplingSteps)
     {
@@ -46,11 +57,9 @@ public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext cont
         mModelCache = modelCache;
         mSamplingSteps = samplingSteps;
 
-        // 声明派生物据声库能力集算一次（与引擎声明同一套 DiffSingerDeclarations，单一真相源）。
         mAutomationConfigs = BuildAutomationConfigs(config);
         mReadbackConfigs = BuildReadbackConfigs(config);
 
-        // 变更接线（handler 只做廉价标脏；重活延迟到 Committed 重分块）——见 §5.9。
         mNotesSubscription = NotifiableExtensions.WhenAny(context.Notes, SubscribeNote, UnsubscribeNote);
         context.Notes.ItemAdded += OnNotesStructureChanged;
         context.Notes.ItemRemoved += OnNotesStructureChanged;
@@ -59,8 +68,6 @@ public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext cont
         context.PitchDeviation.RangeModified += OnRangeModified;
         context.Committed += OnCommitted;
 
-        // 可编辑轨（variance / gender / speed）区间编辑订阅：SDK 把声明上移到引擎后，宿主在「建会话之前」即
-        //   RefreshDeclarations 填好 Voice.AutomationConfigs（见 MidiPart 时序），故构造期 TryGetAutomation 即命中、直接订阅。
         foreach (var key in mAutomationConfigs.Keys)
             if (context.TryGetAutomation(key, out var automation))
             {
@@ -71,25 +78,82 @@ public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext cont
         mNeedResegment = true;
     }
 
-    // 新建 note 的默认歌词：中性占位，待词典 G2P 阶段按声库词典择一有效词细化。
     public string DefaultLyric => "a";
 
-    // —— 调度：窗内第一个脏块的纯值边界（peek 廉价、确定性）——
+    // Returns the pieces whose time span overlaps (start, end); a NaN bound selects every piece of the part.
+    IEnumerable<Piece> PiecesInRange(double start, double end)
+    {
+        bool unbounded = double.IsNaN(start) || double.IsNaN(end);
+        if (unbounded) return mPieces;
+        return mPieces.Where(candidate => candidate.StartTime < end && candidate.EndTime > start);
+    }
+
+    internal void RequestRetake()   // retake over the last recorded affected range (every piece while that range is NaN)
+    {
+        mHasValidCache = false;   // Render reads this: false forces the full-predict path. NOTE(review): the flag is session-wide, not per piece — confirm that other dirty pieces should re-predict too
+        ClearPieceCaches(PiecesInRange(mAffectedStartTime, mAffectedEndTime));
+        StatusChanged?.Invoke();
+    }
+
+    internal void RequestRetakeScoped(double scopeStart, double scopeEnd)   // retake limited to the pieces overlapping (scopeStart, scopeEnd)
+    {
+        mHasValidCache = false;   // false makes Render take the full-predict path (NOTE(review): flag is session-wide — confirm intended)
+        SetAffectedRange(scopeStart, scopeEnd);   // presumably records the range the stitching helpers read later — helper not visible here, confirm
+        ClearPieceCaches(PiecesInRange(scopeStart, scopeEnd));
+        StatusChanged?.Invoke();
+    }
+
+    internal void RequestRedrawPitch()   // asks for a fresh pitch prediction on rendered pieces in the current affected range
+    {
+        var rendered = PiecesInRange(mAffectedStartTime, mAffectedEndTime).Where(p => p.CachedPhones != null);   // never-rendered pieces are left alone
+        foreach (var target in rendered)
+        {
+            target.Dirty = true; target.Failed = false;
+            target.CachedPitchPrediction = null;
+            target.RedrawPitchRequested = true;
+        }
+        StatusChanged?.Invoke();
+    }
+
+    internal void RequestRedrawPitchScoped(double scopeStart, double scopeEnd)   // pitch redraw limited to (scopeStart, scopeEnd)
+    {
+        SetAffectedRange(scopeStart, scopeEnd);
+        var rendered = PiecesInRange(scopeStart, scopeEnd).Where(p => p.CachedPhones != null);   // never-rendered pieces are left alone
+        foreach (var target in rendered)
+        {
+            target.Dirty = true; target.Failed = false;
+            target.CachedPitchPrediction = null;
+            target.RedrawPitchRequested = true;
+        }
+        StatusChanged?.Invoke();
+    }
+
+    void ClearPieceCaches(IEnumerable<Piece> pieces)   // resets every cache on the given pieces and marks them dirty
+    {
+        foreach (var piece in pieces)
+        {
+            piece.Dirty = true; piece.Failed = false;   // FindNextDirtyPiece only returns pieces that are dirty and not failed
+            piece.CachedPitchPrediction = null;
+            piece.CachedVarianceCurves = default;
+            piece.CachedPhones = null;   // a null phone cache makes Render take the full-predict path
+            piece.CachedMel = null; piece.CachedMelDims = null;   // Render skips mel stitching when either is null
+            piece.CachedAudio = null;   // StitchAudio returns the new audio as-is when the old audio is null
+            piece.CachedPitchReadback = null;
+            piece.CachedVarianceReadback = new Dictionary<string, IReadOnlyList<Point>>();   // kept non-null: SynthesizeNext calls TryGetValue on it
+            piece.RedrawPitchRequested = false;
+        }
+    }
+
     public SynthesisSegment? GetNextSegment(double startTime, double endTime)
         => FindNextDirtyPiece(startTime, endTime) is { } p ? new SynthesisSegment(p.StartTime, p.EndTime) : null;
 
-    // peek 与 commit 共用同一查找（确定性 + 同调度 tick 无编辑 ⇒ commit 重算得到 peek 报出的同一块）。
     Piece? FindNextDirtyPiece(double startTime, double endTime)
     {
-        if (mNeedResegment)
-            Resegment();
-
+        if (mNeedResegment) Resegment();
         foreach (var piece in mPieces)
         {
-            if (!piece.Dirty || piece.Failed || piece.Synthesizing)
-                continue;
-            if (piece.EndTime < startTime || piece.StartTime > endTime)
-                continue;
+            if (!piece.Dirty || piece.Failed || piece.Synthesizing) continue;
+            if (piece.EndTime < startTime || piece.StartTime > endTime) continue;
             return piece;
         }
         return null;
@@ -100,7 +164,6 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can
         if (FindNextDirtyPiece(segment.StartTime, segment.EndTime) is not { } piece)
             return;
 
-        // 同步前缀（数据线程）：物化不可变快照（本块 note 全集 + 按 note 范围开窗）。
         var snapshot = mContext.GetSnapshot(piece.Notes, piece.Notes[0].StartTime.Value, piece.Notes.Max(n => n.EndTime.Value));
         piece.Dirty = false;
         piece.Synthesizing = true;
@@ -110,18 +173,41 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can
         var report = new Progress<double>(p => { piece.Progress = p; StatusChanged?.Invoke(); });
         try
         {
-            // offload：worker 只读冻结快照跑 ONNX（绝不碰活视图）；模型懒加载经引擎级缓存（首载触发原生加载）。
-            var rendered = await Task.Run(() => Render(snapshot, piece.Notes, report, cancellation), CancellationToken.None);
+            var rendered = await Task.Run(() => Render(snapshot, piece.Notes, piece, report, cancellation), CancellationToken.None);
             if (rendered != null && mPieces.Contains(piece))
             {
                 int rate = rendered.SampleRate;
+
+                // 区段式音频拼贴：仅替换受影响的 time range，其余保持旧音频不变
+                var stitchedAudio = StitchAudio(rendered.Audio, piece.CachedAudio,
+                    rendered.StartTime, rate, mAffectedStartTime, mAffectedEndTime);
+
+                // 缓存旧音频供下次拼贴
+                piece.CachedAudio = stitchedAudio;
+
                 piece.Segment?.Dispose();
-                piece.Segment = mContext.CreateAudioSegment((long)(rendered.StartTime * rate), rendered.Audio.Length, rate);
-                piece.Segment.Write(0, rendered.Audio);
+                piece.Segment = mContext.CreateAudioSegment((long)(rendered.StartTime * rate), stitchedAudio.Length, rate);
+                piece.Segment.Write(0, stitchedAudio);
                 piece.Segment.Commit();
                 piece.Phonemes = rendered.Phonemes;
-                piece.PitchReadback = rendered.PitchReadback;
-                piece.VarianceReadback = rendered.VarianceReadback;
+
+                // 回显曲线也做区间拼贴：未修改区间的 pitch/tension 与缓存的旧曲线一致
+                piece.PitchReadback = StitchPoints(rendered.PitchReadback, piece.CachedPitchReadback,
+                    mAffectedStartTime, mAffectedEndTime);
+                piece.CachedPitchReadback = piece.PitchReadback;
+
+                if (rendered.VarianceReadback.Count > 0)
+                {
+                    var stitchedVar = new Dictionary<string, IReadOnlyList<Point>>();
+                    foreach (var kvp in rendered.VarianceReadback)
+                    {
+                        piece.CachedVarianceReadback.TryGetValue(kvp.Key, out var oldVar);
+                        stitchedVar[kvp.Key] = StitchPoints(kvp.Value, oldVar,
+                            mAffectedStartTime, mAffectedEndTime);
+                    }
+                    piece.VarianceReadback = stitchedVar;
+                    piece.CachedVarianceReadback = stitchedVar;
+                }
             }
         }
         catch (Exception ex)
@@ -137,11 +223,67 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can
         }
     }
 
-    // 推理链（worker，只读冻结快照）：忠实移植 OpenUtau phonemizer + renderer（见记忆 openutau-is-authority）。
-    //   phonemizer(dsdur) → 音素时间线；renderer 加 head/tail SP padding、tokens[SP..SP]、durations[8..8]、
-    //   f0(Hz over totalFrames)、variance 预测+用户 delta 合成喂声学（纯预测产回显轨）、spk by frame、depth/steps。
-    //   gender/velocity 走用户曲线 + OpenUtau GENC/VELC convert；pitch 自由区走 dspitch 预测轮廓、已画处用户值覆盖。
-    RenderResult? Render(SynthesisSnapshot snapshot, IReadOnlyList<ILiveNote> origins,
+    // —— Audio stitching ——
+    // Splices the freshly rendered audio of the affected time range into the previously cached audio.
+    // The range is widened by 3 hops on each side and both edges of the replaced window are cross-faded linearly.
+    // newAudio is returned untouched when there is no old audio (first render), no affected range (NaN),
+    // a length mismatch between the two buffers, or an empty window.
+    static float[] StitchAudio(float[] newAudio, float[]? oldAudio,
+        double renderStartSec, int sampleRate,
+        double affectedStart, double affectedEnd, int hopSize = 512)
+    {
+        if (oldAudio == null || oldAudio.Length == 0)
+            return newAudio;
+        if (double.IsNaN(affectedStart) || double.IsNaN(affectedEnd))
+            return newAudio; // no explicit affected range: replace everything
+        if (oldAudio.Length != newAudio.Length)
+            return newAudio; // buffers do not line up sample for sample, nothing to splice into
+
+        // hopSize defaults to 512, the value this helper used to hard-code; callers that know the model's
+        // hop size can pass it so the 3-frame margin matches the real frame length.
+        int fadeSamples = 3 * Math.Max(0, hopSize);
+        int length = newAudio.Length;
+
+        // Clamp while still in double precision: converting an out-of-range double to int is unspecified,
+        // and the following +/- margin could wrap around for times far outside the rendered window.
+        int ToSample(double timeSec, int margin) => (int)Math.Clamp(
+            Math.Truncate((timeSec - renderStartSec) * sampleRate) + margin, 0, length);
+
+        int startSample = ToSample(affectedStart, -fadeSamples);
+        int endSample = ToSample(affectedEnd, fadeSamples);
+        if (startSample >= endSample)
+            return newAudio;
+
+        // Start from a copy of the old audio, then overwrite [startSample, endSample) with the new render.
+        var result = (float[])oldAudio.Clone();
+        int copyLen = endSample - startSample;
+        Array.Copy(newAudio, startSample, result, startSample, copyLen);
+
+        // Each ramp covers at most half of the window, so the two ramps never overlap and every index
+        // below stays inside [startSample, endSample) — no per-sample bounds checks are needed.
+        int fadeLen = Math.Min(fadeSamples, copyLen / 2);
+        for (int i = 0; i < fadeLen; i++)
+        {
+            // t grows from just above 0 at the window edge towards 1 deeper inside the window.
+            float t = (float)(i + 1) / (fadeLen + 1);
+            int lead = startSample + i;       // leading edge, walked forwards: old -> new
+            int trail = endSample - 1 - i;    // trailing edge, walked backwards: old -> new
+            result[lead] = oldAudio[lead] * (1 - t) + newAudio[lead] * t;
+            result[trail] = oldAudio[trail] * (1 - t) + newAudio[trail] * t;
+        }
+
+        return result;
+    }
+
+    // —— 推理链（worker，只读冻结快照）——
+
+    // Result of one Render call: waveform, render start time and sample rate, phonemes, pitch/variance readback curves, and the final mel buffer with its tensor dimensions.
+    sealed record RenderResultEx(float[] Audio, double StartTime, int SampleRate,
+        List<SynthesizedPhoneme> Phonemes, List<Point> PitchReadback,
+        Dictionary<string, IReadOnlyList<Point>> VarianceReadback,
+        int[] MelDims, float[] Mel);
+
+    RenderResultEx? Render(SynthesisSnapshot snapshot, IReadOnlyList<ILiveNote> origins, Piece piece,
         IProgress<double>? progress, CancellationToken cancellation)
     {
         var notes = snapshot.Notes;
@@ -152,30 +294,59 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can
         int hop = models.HopSize, sr = models.SampleRate, hidden = models.HiddenSize;
         double frameSec = (double)hop / sr;
         int head = DiffSingerFrames.HeadFrames;
+        int numMelBins = models.NumMelBins;
 
-        string partLang = snapshot.PartProperties.GetString(KeyLanguage, mConfig.Languages.Count > 0 ? mConfig.Languages[0] : string.Empty);
+        string partLang = snapshot.PartProperties.GetString(KeyLanguage, string.Empty);
         string speaker = snapshot.PartProperties.GetString(KeySpeaker, mConfig.Speakers.Count > 0 ? mConfig.Speakers[0] : string.Empty);
         var noteLang = notes.Select(nt => nt.Properties.GetString(KeyLanguage, partLang)).ToArray();
 
-        // —— Phonemizer：歌词 → 音素时间线（绝对秒、含前置辅音越界）——
+        // 渲染模式（仅 Retake 用会话级，RedrawPitch 由 piece 级标记驱动）
+        bool isRetake = false;
+        if (!mRenderModeConsumed && mRenderMode == RenderMode.Retake)
+        {
+            isRetake = true;
+            mRenderModeConsumed = true;
+            mRenderMode = RenderMode.Normal;
+        }
+        bool isRedrawPitch = piece.RedrawPitchRequested;
+        piece.RedrawPitchRequested = false;
+
+        bool pieceHasNoCache = piece.CachedPhones == null;
+        bool needFullPredict = isRetake || !mHasValidCache || pieceHasNoCache;
+        bool needPitchPredict = needFullPredict || isRedrawPitch;
+
+        // —— 模型优先级：dsdur/dspitch 提级模型 ——
         var durPred = models.GetPredictor("dsdur");
-        var phones = durPred != null
-            ? DiffSingerPhonemizer.Phonemize(durPred, notes, noteLang, speaker, hop, sr)
-            : FallbackPhonemes(models, notes, noteLang);   // 无 dur 预测器：每 note 一元音兜底
+        var pitchPred = models.GetPredictor("dspitch");
+        var varPred = models.GetPredictor("dsvariance");
+
+        // —— Phonemizer（needFullPredict 时才重新运行，否则复用缓存）——
+        List<PhonemeSpan> phones;
+        if (needFullPredict)
+        {
+            phones = durPred != null
+                ? DiffSingerPhonemizer.Phonemize(durPred, notes, noteLang, speaker, hop, sr)
+                : FallbackPhonemes(models, notes, noteLang);
+            piece.CachedPhones = phones;
+        }
+        else
+        {
+            phones = piece.CachedPhones ?? new List<PhonemeSpan>();
+        }
         if (phones.Count == 0)
             return null;
         progress?.Report(0.2);
         if (cancellation.IsCancellationRequested)
             return null;
 
-        // —— 帧布局：[head SP][...phones...][tail SP]，累积取整 → durations（len=phones+2）——
+        // —— 帧布局 ——
         var phoneDurSec = phones.Select(p => Math.Max(0, p.EndTime - p.StartTime)).ToArray();
         var durations = DiffSingerFrames.PaddedPhoneFrames(phoneDurSec, frameSec);
-        int nTokens = durations.Length;          // phones + 2
+        int nTokens = durations.Length;
         int nFrames = durations.Sum();
         double renderStart = phones[0].StartTime - head * frameSec;
 
-        // 逐帧时刻 + 说话人逐帧混合（mix:<suffix> 曲线，acoustic/pitch/variance 三域共享；不画时退化为默认 speaker 恒权重）。
+        // 逐帧时刻 + 说话人逐帧混合
         var frameTimes = new double[nFrames];
         for (int f = 0; f < nFrames; f++) frameTimes[f] = renderStart + (f + 0.5) * frameSec;
         var mixTracks = new List<(string Suffix, double[] Sampled)>();
@@ -184,7 +355,7 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can
                 mixTracks.Add((suffix, mixAuto.Evaluator.Evaluate(frameTimes)));
         var speakerMix = DiffSingerSpeakerMix.Create(Suffix(speaker), mixTracks, nFrames);
 
-        // tokens/languages：声学表，前后加 SP。
+        // tokens/languages
         var tokens = new long[nTokens];
         var langs = new long[nTokens];
         tokens[0] = AcousticToken(models, "SP");
@@ -195,7 +366,7 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can
             langs[i + 1] = models.TryGetLanguage(PhonemeLang(phones[i].Symbol), out var lid) ? lid : 0;
         }
 
-        // 逐帧 note 音高回退（head→首 note，phone i→其 note，tail→末 note）。
+        // 逐帧 note 音高回退
         var framePitch = new double[nFrames];
         int fi = 0;
         for (int seg = 0; seg < nTokens; seg++)
@@ -207,44 +378,74 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can
             for (int k = 0; k < durations[seg]; k++) framePitch[fi++] = pitch;
         }
 
-        // —— dspitch 自然音高预测（纯从音符、retake 全 true、不吃用户音高）：替代自由区的矩形 note-step 兜底 ——
-        //   用户已画处（Pitch 非 NaN）用户值覆盖；NaN 自由区用预测轮廓（无 dspitch ⇒ 仍用矩形 framePitch）；PITD/vibrato 叠加在上。
-        var predictedPitch = DiffSingerPitch.Predict(
-            models.GetPredictor("dspitch"), phones, notes, durations,
-            renderStart, frameSec, speakerMix, mConfig, mSamplingSteps);
+        // —— 自动音高预测（仅在 needPitchPredict 时跑；否则复用缓存）——
+        float[]? predictedPitch;
+        if (needPitchPredict)
+        {
+            predictedPitch = DiffSingerPitch.Predict(pitchPred, phones, notes, durations,
+                renderStart, frameSec, speakerMix, mConfig, mSamplingSteps);
+            piece.CachedPitchPrediction = predictedPitch;
+        }
+        else
+        {
+            predictedPitch = piece.CachedPitchPrediction;
+        }
         progress?.Report(0.28);
         if (cancellation.IsCancellationRequested)
             return null;
 
-        // 逐帧 f0(Hz) + 半音曲线（variance 用）：帧中心采样双通道音高，NaN 自由区回退预测轮廓（无则 note 音高）。
+        // 逐帧 f0(Hz) + 半音曲线
         var pitchCurve = snapshot.Pitch.Evaluator.Evaluate(frameTimes);
         var deviation = snapshot.PitchDeviation.Evaluator.Evaluate(frameTimes);
         var f0 = new float[nFrames];
         var semis = new float[nFrames];
         var pitchReadback = new List<Point>(nFrames);
+
+        // —— 初始生成自动音高时排除前后 SP 音高数据（不画到用户界面）——
+        //   判断哪些帧属于 head SP（第 0 个 token）和 tail SP（最后一个 token）
+        int spHeadFrames = durations[0];    // head SP 的帧数
+        int spTailFrames = durations[^1];   // tail SP 的帧数
+
         for (int f = 0; f < nFrames; f++)
         {
-            double fallback = predictedPitch != null
-                ? (f < predictedPitch.Length ? predictedPitch[f] : predictedPitch[^1])
-                : framePitch[f];
+            double fallback;
+            if (predictedPitch != null)
+                fallback = f < predictedPitch.Length ? predictedPitch[f] : predictedPitch[^1];
+            else
+                fallback = framePitch[f];
+
             double semitone = (double.IsNaN(pitchCurve[f]) ? fallback : pitchCurve[f]) + deviation[f];
             semis[f] = (float)semitone;
             f0[f] = DiffSingerFrames.ToneToFreq(semitone);
-            pitchReadback.Add(new Point(frameTimes[f], semitone));
+
+            // 排除前后 SP 帧的音高回显（不在 pitchReadback 中添加）
+            bool isHeadOrTailSp = f < spHeadFrames || f >= nFrames - spTailFrames;
+            if (!isHeadOrTailSp)
+            {
+                pitchReadback.Add(new Point(frameTimes[f], semitone));
+            }
         }
         progress?.Report(0.3);
         if (cancellation.IsCancellationRequested)
             return null;
 
-        // —— variance 预测（基线；下方与用户 delta 合成喂声学、纯预测产回显）——
-        var varCurves = DiffSingerVariance.Predict(
-            models.GetPredictor("dsvariance"), phones.Select(p => p.Symbol).ToList(),
-            durations, semis, speakerMix, mConfig, mSamplingSteps);
+        // —— variance 预测（needFullPredict 时重新预测；否则复用缓存）——
+        VarianceCurves varCurves;
+        if (needFullPredict)
+        {
+            varCurves = DiffSingerVariance.Predict(varPred, phones.Select(p => p.Symbol).ToList(),
+                durations, semis, speakerMix, mConfig, mSamplingSteps);
+            piece.CachedVarianceCurves = varCurves;
+        }
+        else
+        {
+            varCurves = piece.CachedVarianceCurves;
+        }
         progress?.Report(0.45);
         if (cancellation.IsCancellationRequested)
             return null;
 
-        // —— 声学输入（按 InputMetadata 条件构造）——
+        // —— 声学输入 ——
         var ac = models.Acoustic;
         var inputs = new List<NamedOnnxValue>();
         void AddL(string name, long[] data, int[] dims)
@@ -257,9 +458,7 @@ void AddF(string name, float[] data, int[] dims)
         AddL("durations", durations.Select(x => (long)x).ToArray(), new[] { 1, nTokens });
         AddF("f0", f0, new[] { 1, nFrames });
 
-        // —— variance：预测 + 用户 delta 合成喂声学，同时产纯预测回显 ——
-        //   用户曲线按帧求值（连续轨：未编辑处=中性基线 → Delta 恒得纯预测；编辑处 → 叠加），clamp 到声学值域。
-        //   回显（Use && Predict）= 纯预测值，不含用户编辑。
+        // —— variance：使用（缓存的）预测 + 用户 delta 合成喂声学 ——
         var varReadback = new Dictionary<string, IReadOnlyList<Point>>();
         foreach (var spec in Variances)
         {
@@ -275,8 +474,7 @@ void AddF(string name, float[] data, int[] dims)
                 varReadback[spec.Key] = BuildReadbackSegment(spec, predicted, frameTimes, nFrames);
         }
 
-        // —— gender / velocity：纯用户曲线（无方差器基线），按帧 convert 喂声学（忠实移植 OpenUtau GENC/VELC）——
-        //   无轨 / NaN 自由区 → 中性 → convert 得中性 embed（gender 0、velocity 1）；OpenUtau 不 clamp（UI 量程已界定）。
+        // —— gender / velocity ——
         AddF("gender", BuildCurveInput(snapshot, KeyGender, GenderBaseline, GenderConvert(), frameTimes, nFrames), new[] { 1, nFrames });
         AddF("velocity", BuildCurveInput(snapshot, KeySpeed, SpeedBaseline, SpeedConvert, frameTimes, nFrames), new[] { 1, nFrames });
 
@@ -299,22 +497,47 @@ void AddF(string name, float[] data, int[] dims)
             inputs.Add(NamedOnnxValue.CreateFromTensor("speedup", new DenseTensor<long>(new[] { speedup }, new[] { 1 })));
         }
 
+        // —— 声学模型：产 mel ——
         using var melOut = ac.Run(inputs);
-        var mel = melOut.First(v => v.Name == "mel").AsTensor<float>();
+        var melTensor = melOut.First(v => v.Name == "mel").AsTensor<float>();
+        var melDims = melTensor.Dimensions.ToArray();
+        var newMel = melTensor.ToArray();
+
+        // —— 区段式 mel 拼贴：仅替换受影响的 time range，其余保持旧 mel ——
+        float[] finalMel;
+        if (piece.CachedMel != null && piece.CachedMelDims != null
+            && melDims.SequenceEqual(piece.CachedMelDims) && piece.CachedMel.Length == newMel.Length)
+        {
+            finalMel = StitchMel(newMel, piece.CachedMel, frameTimes, nFrames, numMelBins, mAffectedStartTime, mAffectedEndTime);
+        }
+        else
+        {
+            finalMel = newMel;
+        }
+
+        // 更新 mel 缓存
+        piece.CachedMel = finalMel;
+        piece.CachedMelDims = melDims;
+
         progress?.Report(0.75);
         if (cancellation.IsCancellationRequested)
             return null;
 
-        // —— 声码器：mel (+ f0) → 波形 ——
+        // —— 声码器（使用原始 mel 形状创建张量）——
         var voc = models.Vocoder;
-        var vInputs = new List<NamedOnnxValue> { NamedOnnxValue.CreateFromTensor("mel", mel) };
+        var melShape = new int[melDims.Length];
+        Array.Copy(melDims, melShape, melDims.Length);
+        var vInputs = new List<NamedOnnxValue>
+        {
+            NamedOnnxValue.CreateFromTensor("mel", new DenseTensor<float>(finalMel, melShape))
+        };
         if (voc.InputMetadata.ContainsKey("f0"))
             vInputs.Add(NamedOnnxValue.CreateFromTensor("f0", new DenseTensor<float>(f0, new[] { 1, nFrames })));
         using var wavOut = voc.Run(vInputs);
         var audio = wavOut.First(v => v.Name == "waveform").AsTensor<float>().ToArray();
         progress?.Report(1.0);
 
-        // —— 音素产物（绝对秒、韵核吸收伸缩）——
+        // —— 音素产物 ——
         var phonemes = phones.Select(p => new SynthesizedPhoneme
         {
             Symbol = p.Symbol,
@@ -324,7 +547,148 @@ void AddF(string name, float[] data, int[] dims)
             StretchWeight = p.IsVowel ? 1 : 0,
         }).ToList();
 
-        return new RenderResult(audio, renderStart, sr, phonemes, pitchReadback, varReadback);
+        // 标记缓存有效（首次渲染成功后保持）
+        mHasValidCache = true;
+
+        return new RenderResultEx(audio, renderStart, sr, phonemes, pitchReadback, varReadback,
+            melDims, finalMel);
+    }
+
+    // —— Region-based mel stitching ——
+    // Copies the cached mel, overwrites the frames inside [affectedStart, affectedEnd] with the new mel and
+    // cross-fades 3 frames at each edge. Returns newMel unchanged whenever stitching is not possible.
+    static float[] StitchMel(float[] newMel, float[]? oldMel, double[] frameTimes,
+        int nFrames, int numMelBins, double affectedStart, double affectedEnd)
+    {
+        if (oldMel == null || oldMel.Length != newMel.Length)
+            return newMel;
+        if (double.IsNaN(affectedStart) || double.IsNaN(affectedEnd))
+            return newMel;
+        // The loops below index both buffers as [frame * numMelBins + bin]; refuse inputs too short for that.
+        if (nFrames <= 0 || frameTimes.Length < nFrames || newMel.Length < (long)nFrames * numMelBins)
+            return newMel;
+        if (affectedEnd <= frameTimes[0] || affectedStart >= frameTimes[^1])
+            return newMel; // affected range lies completely outside the rendered range
+
+        const int fadeFrames = 3;
+
+        // First and last frame whose time lies inside the affected range (-1: none found).
+        int firstHit = -1, lastHit = -1;
+        for (int f = 0; f < nFrames; f++)
+        {
+            if (frameTimes[f] >= affectedStart && frameTimes[f] <= affectedEnd)
+            {
+                if (firstHit < 0) firstHit = f;
+                lastHit = f;
+            }
+        }
+
+        // No frame inside the range: return the new mel rather than widening an empty selection
+        // into an arbitrary region (which is what happens for pieces shorter than 7 frames otherwise).
+        if (firstHit < 0)
+            return newMel;
+
+        // Widen by fadeFrames on both sides to make room for the transitions.
+        int startFrame = Math.Max(0, firstHit - fadeFrames);
+        int endFrame = Math.Min(nFrames - 1, lastHit + fadeFrames);
+        if (startFrame >= endFrame)
+            return newMel;
+
+        var result = new float[newMel.Length];
+        Array.Copy(oldMel, result, newMel.Length);
+
+        // Interior of the region: take the new mel verbatim.
+        for (int f = startFrame + fadeFrames; f <= endFrame - fadeFrames; f++)
+            for (int b = 0; b < numMelBins; b++)
+                result[f * numMelBins + b] = newMel[f * numMelBins + b];
+
+        // Transitions: the new-mel weight is 1/4, 2/4, 3/4 going inward from each edge. The trailing edge
+        // is written second, so it wins where both edges overlap in a very short region.
+        foreach (bool leading in new[] { true, false })
+        {
+            for (int i = 0; i < fadeFrames; i++)
+            {
+                int f = leading ? startFrame + i : endFrame - i;
+                if (f < 0 || f >= nFrames) break;
+                float t = (float)(i + 1) / (fadeFrames + 1);
+                for (int b = 0; b < numMelBins; b++)
+                {
+                    int idx = f * numMelBins + b;
+                    result[idx] = oldMel[idx] * (1 - t) + newMel[idx] * t;
+                }
+            }
+        }
+
+        return result;
+    }
+
+    // —— Point-list stitching ——
+    // Keeps the old points outside [affectedStart, affectedEnd] and the new points inside it, blending Y over
+    // up to 3 points at each edge. Returns newPoints when the old list is missing or cannot be aligned with it.
+    static IReadOnlyList<Point> StitchPoints(IReadOnlyList<Point> newPoints, IReadOnlyList<Point>? oldPoints,
+        double affectedStart, double affectedEnd)
+    {
+        if (oldPoints == null || oldPoints.Count == 0)
+            return newPoints;
+        if (double.IsNaN(affectedStart) || double.IsNaN(affectedEnd))
+            return newPoints;
+        if (newPoints.Count == 0)
+            return oldPoints;
+        if (newPoints.Count != oldPoints.Count)
+            return newPoints; // point count changed: the lists cannot be aligned index by index
+        if (Math.Abs(newPoints[0].X - oldPoints[0].X) > 0.001)
+            return newPoints;
+
+        int oldStart = 0, oldEnd = oldPoints.Count;
+        while (oldStart < oldPoints.Count && oldPoints[oldStart].X < affectedStart) oldStart++;
+        while (oldEnd > 0 && oldPoints[oldEnd - 1].X > affectedEnd) oldEnd--;
+
+        if (oldStart >= oldEnd)
+            return newPoints;
+
+        int newStart = 0, newEnd = newPoints.Count;
+        while (newStart < newPoints.Count && newPoints[newStart].X < affectedStart) newStart++;
+        while (newEnd > 0 && newPoints[newEnd - 1].X > affectedEnd) newEnd--;
+
+        if (newStart >= newEnd)
+            return oldPoints;
+
+        // Regions shorter than 2 * fadeCount get a shorter fade so that the two transitions never cover the
+        // same point (overlapping transitions would add that point to the result twice).
+        const int fadeCount = 3;
+        int fade = Math.Min(fadeCount, (newEnd - newStart) / 2);
+        var result = new List<Point>(oldStart + (newEnd - newStart) + (oldPoints.Count - oldEnd));
+
+        // Old points before the region.
+        for (int i = 0; i < oldStart; i++) result.Add(oldPoints[i]);
+
+        // Leading transition: old -> new. An old index past the end falls back to the plain new value.
+        for (int i = 0; i < fade; i++)
+        {
+            float t = (float)(i + 1) / (fade + 1);
+            int ni = newStart + i, oi = oldStart + i;
+            double y = oi < oldPoints.Count ? oldPoints[oi].Y * (1 - t) + newPoints[ni].Y * t : newPoints[ni].Y;
+            result.Add(new Point(newPoints[ni].X, y));
+        }
+
+        // Interior: new values as they are.
+        for (int i = newStart + fade; i < newEnd - fade; i++)
+            result.Add(newPoints[i]);
+
+        // Trailing transition: new -> old, in ascending time order; same fallback instead of dropping the point.
+        for (int i = newEnd - fade; i < newEnd; i++)
+        {
+            float t = (float)(newEnd - i) / (fade + 1);
+            int oi = oldEnd - (newEnd - i);
+            double y = oi >= 0 && oi < oldPoints.Count ? oldPoints[oi].Y * (1 - t) + newPoints[i].Y * t : newPoints[i].Y;
+            result.Add(new Point(newPoints[i].X, y));
+        }
+
+        // Old points after the region.
+        for (int i = oldEnd; i < oldPoints.Count; i++) result.Add(oldPoints[i]);
+
+        return result;
+    }
     }
 
     // 无 dur 预测器兜底：每 note 一元音、占满 note 时长（无对齐/无 head/tail 之外的处理）。
@@ -532,36 +896,78 @@ void Resegment()
         StatusChanged?.Invoke();
     }
 
+    sealed record NoteHandlers(Action OnDur, Action OnPitch, Action OnLyric, Action OnProps);
+
     void SubscribeNote(ILiveNote note)
     {
-        void Handler()
+        Action onDur = () =>
+        {
+            SetAffectedRange(note.StartTime.Value, note.EndTime.Value);
+            MarkPieceDirty(note, clearPhones: true, clearPitch: false, clearVariance: true);
+            mNeedResegment = true;
+        };
+        Action onPitch = () =>
+        {
+            SetAffectedRange(note.StartTime.Value, note.EndTime.Value);
+            MarkPieceDirty(note, clearPhones: false, clearPitch: false, clearVariance: false);
+        };
+        Action onLyric = () =>
+        {
+            SetAffectedRange(note.StartTime.Value, note.EndTime.Value);
+            MarkPieceDirty(note, clearPhones: true, clearPitch: true, clearVariance: true);
+            mNeedResegment = true;
+        };
+        Action onProps = () =>
         {
-            foreach (var piece in mPieces)
-                if (piece.Notes.Contains(note)) { piece.Dirty = true; piece.Failed = false; }
+            SetAffectedRange(note.StartTime.Value, note.EndTime.Value);
+            MarkPieceDirty(note, clearPhones: true, clearPitch: false, clearVariance: true);
             mNeedResegment = true;
+        };
+
+        note.StartTime.Modified += onDur;
+        note.EndTime.Modified += onDur;
+        note.Phonemes.Modified += onDur;
+        note.Pitch.Modified += onPitch;
+        note.Lyric.Modified += onLyric;
+        note.Properties.Modified += onProps;
+
+        mNoteHandlers[note] = new NoteHandlers(onDur, onPitch, onLyric, onProps);
+    }
+
+    void MarkPieceDirty(ILiveNote note, bool clearPhones, bool clearPitch, bool clearVariance)
+    {
+        foreach (var piece in mPieces)
+        {
+            if (!piece.Notes.Contains(note)) continue;
+            piece.Dirty = true; piece.Failed = false;
+            if (clearPhones) piece.CachedPhones = null;
+            if (clearPitch) piece.CachedPitchPrediction = null;
+            if (clearVariance) piece.CachedVarianceCurves = default;
+            return;
         }
-        mNoteHandlers[note] = Handler;
-        note.StartTime.Modified += Handler;
-        note.EndTime.Modified += Handler;
-        note.Pitch.Modified += Handler;
-        note.Lyric.Modified += Handler;
-        note.Phonemes.Modified += Handler;
-        note.Properties.Modified += Handler;
+    }
+
+    void SetAffectedRange(double start, double end) // stores the padded range in mAffectedStartTime / mAffectedEndTime
+    {
+        double pad = 0.1; // margin added on both sides, in the same unit as start/end
+        mAffectedStartTime = start - pad; // overwrites any previously recorded range (no merging)
+        mAffectedEndTime = end + pad;
     }
 
     void UnsubscribeNote(ILiveNote note)
     {
-        if (!mNoteHandlers.Remove(note, out var handler))
-            return;
-        note.StartTime.Modified -= handler;
-        note.EndTime.Modified -= handler;
-        note.Pitch.Modified -= handler;
-        note.Lyric.Modified -= handler;
-        note.Phonemes.Modified -= handler;
-        note.Properties.Modified -= handler;
+        if (mNoteHandlers.Remove(note, out var h))
+        {
+            note.StartTime.Modified -= h.OnDur;
+            note.EndTime.Modified -= h.OnDur;
+            note.Phonemes.Modified -= h.OnDur;
+            note.Pitch.Modified -= h.OnPitch;
+            note.Lyric.Modified -= h.OnLyric;
+            note.Properties.Modified -= h.OnProps;
+        }
     }
 
-    void OnNotesStructureChanged(ILiveNote note) => mNeedResegment = true;
+    void OnNotesStructureChanged(ILiveNote note) { mNeedResegment = true; }
 
     void MarkAllDirtyAndResegment()
     {
@@ -577,6 +983,9 @@ void OnCommitted()
 
     void OnRangeModified(double startTime, double endTime)
     {
+        // 记录受影响的 time range（自动化曲线修改）
+        SetAffectedRange(startTime, endTime);
+
         foreach (var piece in mPieces)
         {
             if (piece.EndTime < startTime || piece.StartTime > endTime)
@@ -587,10 +996,6 @@ void OnRangeModified(double startTime, double endTime)
         StatusChanged?.Invoke();
     }
 
-    sealed record RenderResult(float[] Audio, double StartTime, int SampleRate,
-        List<SynthesizedPhoneme> Phonemes, List<Point> PitchReadback,
-        Dictionary<string, IReadOnlyList<Point>> VarianceReadback);
-
     sealed class Piece
     {
         public required IReadOnlyList<ILiveNote> Notes;
@@ -605,5 +1010,20 @@ sealed class Piece
         public IReadOnlyList<SynthesizedPhoneme> Phonemes = [];
         public IReadOnlyList<Point> PitchReadback = [];
         public IReadOnlyDictionary<string, IReadOnlyList<Point>> VarianceReadback = new Dictionary<string, IReadOnlyList<Point>>();
+        // 回显曲线缓存（区间拼贴用）
+        public IReadOnlyList<Point>? CachedPitchReadback;
+        public IReadOnlyDictionary<string, IReadOnlyList<Point>> CachedVarianceReadback = new Dictionary<string, IReadOnlyList<Point>>();
+
+        // 缓存：note 代数（判断缓存有效性）
+        // 模型预测缓存（增量渲染时复用）
+        public float[]? CachedPitchPrediction;
+        public VarianceCurves CachedVarianceCurves;
+        public List<PhonemeSpan>? CachedPhones;
+        // mel 缓存（用于交叉过渡）
+        public float[]? CachedMel;
+        public int[]? CachedMelDims;
+        public float[]? CachedAudio;
+        // piece 级 RedrawPitch 请求标记
+        public bool RedrawPitchRequested;
     }
 }
diff --git a/DiffSingerVariance.cs b/DiffSingerVariance.cs
index 34036cb..3c07e23 100644
--- a/DiffSingerVariance.cs
+++ b/DiffSingerVariance.cs
@@ -42,14 +42,21 @@ public static VarianceCurves Predict(
         var langs = symbols.Select(s => v.LangId(PhonemeLanguage(s))).Prepend(0L).Append(0L).ToArray();
         var isVowel = symbols.Select(v.IsVowel).ToArray();
 
-        // —— linguistic（词模式）——
-        var (wordDiv, wordDur) = DiffSingerFrames.PaddedWordDivAndDur(isVowel, phDur);
+        // —— linguistic（据编码器实际输入选择词模式或音素模式）——
         var lingInputs = new List<NamedOnnxValue>
         {
             NvL("tokens", tokens, nTokens),
-            NvL("word_div", wordDiv, wordDiv.Length),
-            NvL("word_dur", wordDur, wordDur.Length),
         };
+        if (v.LinguisticUsesWordBoundary)
+        {
+            var (wordDiv, wordDur) = DiffSingerFrames.PaddedWordDivAndDur(isVowel, phDur);
+            lingInputs.Add(NvL("word_div", wordDiv, wordDiv.Length));
+            lingInputs.Add(NvL("word_dur", wordDur, wordDur.Length));
+        }
+        else
+        {
+            lingInputs.Add(NvL("ph_dur", phDur.Select(x => (long)x).ToArray(), nTokens));
+        }
         if (v.Linguistic.InputMetadata.ContainsKey("languages"))
             lingInputs.Add(NvL("languages", langs, nTokens));
         using var lingOut = v.Linguistic.Run(lingInputs);
diff --git a/DiffSingerVoiceEngine.cs b/DiffSingerVoiceEngine.cs
index c52dba7..cfa1a3b 100644
--- a/DiffSingerVoiceEngine.cs
+++ b/DiffSingerVoiceEngine.cs
@@ -1,6 +1,7 @@
 using System;
 using System.Collections.Generic;
 using System.IO;
+using System.Linq;
 using TuneLab.Foundation;
 using TuneLab.SDK;
 
@@ -32,6 +33,44 @@ public void Destroy()
         mModelCache = null;
     }
 
+    // —— 会话注册表（引擎级，跨会话共享）——
+    readonly List<WeakReference<DiffSingerSynthesisSession>> mSessions = new();
+
+    internal void RegisterSession(DiffSingerSynthesisSession session) // adds the session to mSessions as a weak reference
+    {
+        lock (mSessions)
+        {
+            // Drop weak references whose target has already been collected.
+            mSessions.RemoveAll(wr => !wr.TryGetTarget(out _));
+            mSessions.Add(new WeakReference<DiffSingerSynthesisSession>(session));
+        }
+    }
+
+    internal void UnregisterSession(DiffSingerSynthesisSession session) // removes the session from mSessions
+    {
+        lock (mSessions)
+        {
+            mSessions.RemoveAll(wr => !wr.TryGetTarget(out var s) || s == session); // also drops collected entries
+        }
+    }
+
+    internal DiffSingerSynthesisSession? FindSessionByVoiceId(string voiceId)
+    {
+        lock (mSessions)
+        {
+            // Drop weak references whose target has already been collected.
+            mSessions.RemoveAll(wr => !wr.TryGetTarget(out _));
+            // Last registered live session with this voiceId, or null. NOTE(review): with several live sessions on one voiceId only the newest is returned — confirm intended.
+            DiffSingerSynthesisSession? found = null;
+            foreach (var wr in mSessions)
+            {
+                if (wr.TryGetTarget(out var s) && s.VoiceId == voiceId)
+                    found = s;
+            }
+            return found;
+        }
+    }
+
     public ISynthesisSession CreateSession(string voiceId, ISynthesisContext context)
     {
         if (!mState.Banks.ContainsKey(voiceId))
@@ -40,7 +79,9 @@ public ISynthesisSession CreateSession(string voiceId, ISynthesisContext context
         // 推理走引擎级模型缓存（懒加载、按 voiceId 共享）；声明面（轨/面板）已上移到引擎方法、建会话前即填好。
         var config = ConfigFor(voiceId)!;
         var samplingSteps = mSettings.GetInt(KeySamplingSteps, 20);
-        return new DiffSingerSynthesisSession(config, context, voiceId, EnsureModelCache(), samplingSteps);
+        var session = new DiffSingerSynthesisSession(config, context, voiceId, EnsureModelCache(), samplingSteps);
+        RegisterSession(session);
+        return session;
     }
 
     // —— 声明（引擎层、纯函数 of (voiceId, part 值)；宿主在每次 part 参数 commit 时按当前值重算 diff 到 UI）——
@@ -52,10 +93,82 @@ public IReadOnlyOrderedMap<string, AutomationConfig> GetSynthesizedParameterConf
         => ConfigFor(context.VoiceId) is { } c ? DiffSingerDeclarations.BuildReadbackConfigs(c) : EmptyAutomations;
 
     public ObjectConfig GetPartPropertyConfig(IPartPropertyContext context)
-        => ConfigFor(context.VoiceId) is { } c ? DiffSingerDeclarations.BuildPartConfig(c) : EmptyConfig;
+    {
+        if (ConfigFor(context.VoiceId) is not { } c)
+            return EmptyConfig;
+
+        var baseConfig = DiffSingerDeclarations.BuildPartConfig(c);
+        var voiceId = context.VoiceId;
+
+        // Properties 是 IReadOnlyOrderedMap，需要新建可写集合
+        var props = new OrderedMap<string, IControllerConfig>();
+        foreach (var kvp in baseConfig.Properties)
+            props.Add(kvp.Key, kvp.Value);
+
+        // 追加 Retake 和 Redraw Pitch 按钮
+        props.Add("_retake", new ButtonConfig
+        {
+            DisplayText = L.Tr("Retake"),
+            Action = () => FindSessionByVoiceId(voiceId)?.RequestRetake(),
+        });
+        props.Add("_redraw_pitch", new ButtonConfig
+        {
+            DisplayText = L.Tr("Redraw Pitch"),
+            Action = () => FindSessionByVoiceId(voiceId)?.RequestRedrawPitch(),
+        });
+
+        return new ObjectConfig { Properties = props };
+    }
 
     public ObjectConfig GetNotePropertyConfig(INotePropertyContext context)
-        => ConfigFor(context.VoiceId) is { } c ? DiffSingerDeclarations.BuildNoteConfig(c, context) : EmptyConfig;
+    {
+        if (ConfigFor(context.VoiceId) is not { } c)
+            return EmptyConfig;
+
+        var baseConfig = DiffSingerDeclarations.BuildNoteConfig(c, context);
+        var voiceId = context.VoiceId;
+        double selStart = context.SelectionStartTime;
+        double selEnd = context.SelectionEndTime;
+        bool hasSelection = !double.IsNaN(selStart) && !double.IsNaN(selEnd);
+
+        var props = new OrderedMap<string, IControllerConfig>();
+        foreach (var kvp in baseConfig.Properties)
+            props.Add(kvp.Key, kvp.Value);
+
+        // 追加 Retake 和 Redraw Pitch 按钮（使用选中音符的区间而非全曲）
+        props.Add("_retake", new ButtonConfig
+        {
+            DisplayText = L.Tr("Retake"),
+            Action = () =>
+            {
+                var session = FindSessionByVoiceId(voiceId);
+                if (session != null)
+                {
+                    if (hasSelection)
+                        session.RequestRetakeScoped(selStart, selEnd);
+                    else
+                        session.RequestRetake();
+                }
+            },
+        });
+        props.Add("_redraw_pitch", new ButtonConfig
+        {
+            DisplayText = L.Tr("Redraw Pitch"),
+            Action = () =>
+            {
+                var session = FindSessionByVoiceId(voiceId);
+                if (session != null)
+                {
+                    if (hasSelection)
+                        session.RequestRedrawPitchScoped(selStart, selEnd);
+                    else
+                        session.RequestRedrawPitch();
+                }
+            },
+        });
+
+        return new ObjectConfig { Properties = props };
+    }
 
     // 声库能力集按 voiceId 缓存（声明每次 commit 都调，避免重复解析 dsconfig）；config 随声库不可变，扫描重建时清空。
     VoicebankConfig? ConfigFor(string voiceId)

From 9d8bd78f3ae71d521fca5da1e486230b23452c8e Mon Sep 17 00:00:00 2001
From: tachengP <2638591622@qq.com>
Date: Sat, 20 Jun 2026 17:35:27 +0800
Subject: [PATCH 2/3] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E9=87=8D=E6=B8=B2?=
 =?UTF-8?q?=E6=9F=93=E6=8B=BC=E8=B4=B4=E7=BC=BA=E9=99=B7=EF=BC=8C=E6=81=A2?=
 =?UTF-8?q?=E5=A4=8D=E5=BB=B6=E9=9F=B3=E7=AC=A6+=E3=80=81-=E5=8A=9F?=
 =?UTF-8?q?=E8=83=BD=EF=BC=8C=E5=AE=9E=E7=8E=B0=E9=9F=B3=E7=B4=A0=E7=BC=96?=
 =?UTF-8?q?=E8=BE=91=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 DiffSingerForTuneLab.code-workspace |   4 +
 DiffSingerPhonemizer.cs             |  58 ++++++-
 DiffSingerPitch.cs                  |   4 +-
 DiffSingerPredictor.cs              | 102 +++++++++++-
 DiffSingerSynthesisSession.cs       | 236 +++++++++++++++++++++++++---
 DiffSingerVoiceEngine.cs            |  72 +++++++++
 6 files changed, 449 insertions(+), 27 deletions(-)

diff --git a/DiffSingerForTuneLab.code-workspace b/DiffSingerForTuneLab.code-workspace
index 1131316..1c88dca 100644
--- a/DiffSingerForTuneLab.code-workspace
+++ b/DiffSingerForTuneLab.code-workspace
@@ -7,6 +7,10 @@
     {
       "name": "TuneLab (参考：SDK/docs/范例)",
       "path": "../TuneLab"
+    },
+    {
+      "name": "OpenUtau-lunai (参考：OpenUtau.Core)",
+      "path": "../OpenUtau-lunai"
     }
   ],
   "settings": {
diff --git a/DiffSingerPhonemizer.cs b/DiffSingerPhonemizer.cs
index 187d41e..03ac60b 100644
--- a/DiffSingerPhonemizer.cs
+++ b/DiffSingerPhonemizer.cs
@@ -48,9 +48,24 @@ public static List<PhonemeSpan> Phonemize(
         for (int i = 0; i < notes.Count; i++)
         {
             var note = notes[i];
+            string lyric = note.Lyric ?? string.Empty;
             string[] symbols = GetSymbols(dur, note, noteLang[i], out pinned[i]);
             noteSymbolCount[i] = symbols.Length;
 
+            // 连音符：不产生新音素，只延展前音的时长（通过 group 自然吸收）
+            // (+) 额外在起始点插入一个空 vowel 边界组，强制对齐
+            if (symbols.Length == 0 && (lyric == "-" || lyric == "+"))
+            {
+                if (lyric == "+")
+                {
+                    // 插入一个空韵核组作为强制对齐边界
+                    groups.Add(new Group(note.StartTime, note.Pitch));
+                }
+                // - 则完全不做任何事，前组自然吸收时长
+                notePhIndex.Add(notePhIndex[^1]);
+                continue;
+            }
+
             var wordGroups = ProcessWord(dur, note, symbols);
             groups[^1].Phonemes.AddRange(wordGroups[0].Phonemes);   // 前置辅音并入前一组（侵入前一 note 尾）
             groups.AddRange(wordGroups.Skip(1));                    // 韵核组（起点=note 起点）
@@ -152,17 +167,56 @@ static List<Group> ProcessWord(DiffSingerPredictor dur, SynthesisNoteSnapshot no
         return wordGroups;
     }
 
-    // 取音素符号串：钉死=用 note.Phonemes 符号；否则 G2P。过滤到「类型已定义 且 dur 表可 tokenize」；空则 [SP]。
+    // Resolves the phoneme symbols of a note: editor override (_phonemes property) first, then pinned phonemes, else G2P.
+    // Slur notes ("-" / "+") yield no symbols; every other empty result falls back to [SP].
     static string[] GetSymbols(DiffSingerPredictor dur, SynthesisNoteSnapshot note, string lang, out bool pinned)
     {
+        string lyric = note.Lyric ?? string.Empty;
+        if (lyric == "-" || lyric == "+")
+        {
+            pinned = false;
+            return Array.Empty<string>();
+        }
+
+        // Editor override comes first. If none of its symbols is usable, fall back to [SP] like the path below.
+        var phonemesProp = note.Properties.GetString("_phonemes", "");
+        if (!string.IsNullOrEmpty(phonemesProp) && phonemesProp != "[]")
+        {
+            pinned = true;
+            var edited = ParsePhonemesProperty(phonemesProp).Where(s => !string.IsNullOrEmpty(s) && dur.TryPhoneme(s, out _)).ToArray();
+            return edited.Length > 0 ? edited : new[] { Pause };
+        }
+        // Next: pinned phonemes if the note has any, otherwise G2P on the lyric.
         pinned = note.Phonemes.Count > 0;
         IEnumerable<string> raw = pinned
             ? note.Phonemes.Select(p => p.Symbol)
-            : dur.G2P(note.Lyric ?? string.Empty, lang);
+            : dur.G2P(lyric, lang);
         var symbols = raw.Where(s => dur.IsKnownSymbol(s) && dur.TryPhoneme(s, out _)).ToArray();
         return symbols.Length > 0 ? symbols : new[] { Pause };
     }
 
+    // Extracts the phoneme symbols from the editor's JSON string: [{"s":"ja/b","v":false},...]
+    // Lightweight scan for every "s":"<symbol>" pair rather than a full JSON parse; empty symbols are skipped.
+    static string[] ParsePhonemesProperty(string json)
+    {
+        const string marker = "\"s\":\"";
+        var result = new List<string>();
+        if (string.IsNullOrEmpty(json) || json.Length < 2) return result.ToArray();
+        int i = 0;
+        while (i < json.Length)
+        {
+            int sIdx = json.IndexOf(marker, i, StringComparison.Ordinal);
+            if (sIdx < 0) break;
+            sIdx += marker.Length;
+            int eIdx = json.IndexOf('"', sIdx);
+            // Unterminated value: stop. Continuing with eIdx == -1 would reset the cursor to 0 and rescan forever.
+            if (eIdx < 0) break;
+            if (eIdx > sIdx) result.Add(json.Substring(sIdx, eIdx - sIdx));
+            i = eIdx + 1;
+        }
+        return result.ToArray();
+    }
+
     // OpenUtau stretch：source[from..from+count) 的帧时长按 ratio 缩放、终点对齐 endPos，返回各音素起点秒。
     static IEnumerable<double> Stretch(IReadOnlyList<double> source, int from, int count, double ratio, double endPos)
     {
diff --git a/DiffSingerPitch.cs b/DiffSingerPitch.cs
index 02f321d..9e2dee8 100644
--- a/DiffSingerPitch.cs
+++ b/DiffSingerPitch.cs
@@ -129,8 +129,10 @@ public static class DiffSingerPitch
             }
             durSec.Add(Math.Max(0, note.EndTime - note.StartTime));
             midiList.Add(note.Pitch);
-            if ((note.Lyric ?? string.Empty).StartsWith("+"))
+            string lyric = note.Lyric ?? string.Empty;
+            if (lyric.StartsWith("+") || lyric == "-")
             {
+                // slur 延音符继承前音 rest 状态，确保 pitch 模型为其生成正确过渡音高
                 restList.Add(restList[^1]);
             }
             else
diff --git a/DiffSingerPredictor.cs b/DiffSingerPredictor.cs
index 706fec0..270a657 100644
--- a/DiffSingerPredictor.cs
+++ b/DiffSingerPredictor.cs
@@ -75,13 +75,21 @@ public int PhonemeToken(string symbol)
             : throw new InvalidOperationException($"音素 \"{symbol}\" 不在 {Path.GetFileName(mDir)} 的音素表中");
     public long LangId(string lang) => mLanguages.TryGetValue(lang, out var id) ? id : 0;
 
-    // —— G2P：按语言查 dsdict-{lang}.yaml 词条（grapheme→带前缀音素），exact 后小写回退 ——
+    // —— G2P：优先查语言特定词典（dsdict-{lang}.yaml），避免默认底库（dsdict.yaml 以 zh 为主）污染；再试 replacements；最后才兜底查合并词典。 ——
     public string[] G2P(string lyric, string lang)
     {
-        var entries = GetEntries(lang);
         var key = lyric.Trim();
-        if (entries.TryGetValue(key, out var phs)) return phs;
-        if (entries.TryGetValue(key.ToLowerInvariant(), out phs)) return phs;
+        // 1. 语言特定词典（不含默认底库）
+        var langEntries = GetLanguageSpecificEntries(lang);
+        if (langEntries.TryGetValue(key, out var phs)) return phs;
+        if (langEntries.TryGetValue(key.ToLowerInvariant(), out phs)) return phs;
+        // 2. 替换规则（en/ko 等无 entries 的语种）
+        var replaced = ApplyReplacements(lyric, lang);
+        if (replaced.Length > 0) return replaced;
+        // 3. 最后才查合并词典（含默认底库 dsdict.yaml，作为未知字素的最终兜底）
+        var allEntries = GetEntries(lang);
+        if (allEntries.TryGetValue(key, out phs)) return phs;
+        if (allEntries.TryGetValue(key.ToLowerInvariant(), out phs)) return phs;
         return Array.Empty<string>();
     }
 
@@ -108,8 +116,78 @@ public float[] GetEmbedding(string acousticSpeaker)
         }
     }
 
+    // —— Replacement rules (for languages such as EN/KO that ship replacements but no entries) ——
+    // Per-language rule lists, longest `from` first. Guarded by mLock, like the dictionary cache in GetEntries.
+    readonly Dictionary<string, List<(string from, string to)>> mReplacements = new(StringComparer.Ordinal);
+
+    void LoadReplacements(string lang)
+    {
+        lock (mLock)
+        {
+            if (mReplacements.ContainsKey(lang)) return;
+            var list = new List<(string from, string to)>();
+            foreach (var file in new[] { $"dsdict-{lang}.yaml", $"dsdict-zh-{lang}.yaml", "dsdict.yaml" })
+            {
+                var path = Path.Combine(mDir, file);
+                if (!File.Exists(path)) continue;
+                try
+                {
+                    var yaml = new DeserializerBuilder().Build();
+                    var doc = yaml.Deserialize<Dictionary<string, object?>>(File.ReadAllText(path));
+                    if (doc == null || !doc.TryGetValue("replacements", out var reps) || reps is not List<object?> repList)
+                        continue;
+                    foreach (var repDict in repList.OfType<Dictionary<object, object?>>())
+                    {
+                        string? from = repDict.TryGetValue("from", out var fv) ? fv?.ToString() : null;
+                        string? to = repDict.TryGetValue("to", out var tv) ? tv?.ToString() : null;
+                        if (!string.IsNullOrEmpty(from) && !string.IsNullOrEmpty(to))
+                            list.Add((from, to));
+                    }
+                }
+                catch { } // best effort: an unreadable or malformed dictionary just contributes no rules
+            }
+            // Stable sort, done once here: longest match first, load order kept among rules of equal length.
+            mReplacements[lang] = list.OrderByDescending(r => r.from.Length).ToList();
+        }
+    }
+
+    // Converts a lyric to phonemes with the replacement rules (longest match first).
+    // Characters no rule matches become single-character symbols; returns an empty array when `lang` has no rules.
+    public string[] ApplyReplacements(string lyric, string lang)
+    {
+        List<(string from, string to)>? reps;
+        lock (mLock)
+        {
+            LoadReplacements(lang);
+            mReplacements.TryGetValue(lang, out reps);
+        }
+        if (reps == null || reps.Count == 0)
+            return Array.Empty<string>();
+
+        var result = new List<string>();
+        string text = lyric.ToLowerInvariant();
+        int pos = 0;
+        while (pos < text.Length)
+        {
+            string? hit = null;
+            int step = 1;
+            foreach (var (from, to) in reps)
+            {
+                if (pos + from.Length > text.Length || string.CompareOrdinal(text, pos, from, 0, from.Length) != 0)
+                    continue;
+                (hit, step) = (to, from.Length);
+                break;
+            }
+            // No rule matched: the single character stands as its own symbol.
+            result.Add(hit ?? text[pos].ToString());
+            pos += step;
+        }
+        return result.ToArray();
+    }
+
     // —— 词典加载 ——
     // 策略：先加载 dsdict.yaml 作为默认底库，再叠加载入语种特定文件（后面覆盖前面）。
+    // 若 entries 为空且 replacements 存在，留空返回（上层调用 ApplyReplacements）。
     Dictionary<string, string[]> GetEntries(string lang)
     {
         lock (mLock)
@@ -145,6 +223,22 @@ Dictionary<string, string[]> GetEntries(string lang)
         }
     }
 
+    // Language-specific entries only (no dsdict.yaml base, so it cannot shadow other languages); cached per language because G2P asks on every lookup.
+    readonly Dictionary<string, Dictionary<string, string[]>> mLanguageSpecificEntries = new(StringComparer.Ordinal);
+    Dictionary<string, string[]> GetLanguageSpecificEntries(string lang)
+    {
+        lock (mLock)
+        {
+            if (mLanguageSpecificEntries.TryGetValue(lang, out var cached)) return cached;
+            var map = new Dictionary<string, string[]>(StringComparer.Ordinal);
+            foreach (var path in new[] { $"dsdict-{lang}.yaml", $"dsdict-zh-{lang}.yaml" }.Select(f => Path.Combine(mDir, f)).Where(File.Exists))
+                foreach (var e in DeserializeDsDict(path).entries)
+                    if (!string.IsNullOrEmpty(e.grapheme))
+                        map[e.grapheme] = e.phonemes.ToArray();
+            return mLanguageSpecificEntries[lang] = map;
+        }
+    }
+
     void LoadSymbolTypes(string dsdictPath)
     {
         if (!File.Exists(dsdictPath)) return;
diff --git a/DiffSingerSynthesisSession.cs b/DiffSingerSynthesisSession.cs
index 4cfe592..2547440 100644
--- a/DiffSingerSynthesisSession.cs
+++ b/DiffSingerSynthesisSession.cs
@@ -80,6 +80,53 @@ public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext cont
 
     public string DefaultLyric => "a";
 
+    // Default G2P phonemes of the note at `time` (used to pre-fill the phoneme editor); empty list on any failure.
+    internal List<PhonemeEntry> GetDefaultPhonemesForNote(double time, string partLang = "")
+    {
+        ILiveNote? targetNote = null;
+        // Touching notes share a boundary time: keep the latest-starting match so a note's own start time selects it.
+        foreach (var note in mContext.Notes)
+            if (note.StartTime.Value <= time && note.EndTime.Value >= time
+                && (targetNote == null || note.StartTime.Value >= targetNote.StartTime.Value))
+                targetNote = note;
+        if (targetNote == null) return new List<PhonemeEntry>();
+
+        try
+        {
+            var models = mModelCache.GetOrLoad(mVoiceId, mConfig);
+            var durPred = models.GetPredictor("dsdur");
+            if (durPred == null) return new List<PhonemeEntry>();
+
+            string lyric = targetNote.Lyric.Value ?? string.Empty;
+            // partLang is normally passed in by the caller from a snapshot; the live mContext.PartProperties is only the fallback when it is empty.
+            if (string.IsNullOrEmpty(partLang))
+            {
+                var partLangVal = mContext.PartProperties.GetValue(KeyLanguage, PropertyValue.Create(string.Empty));
+                partLang = partLangVal.ToString(out var pl) ? pl : string.Empty;
+            }
+            var noteLangVal = targetNote.Properties.GetValue(KeyLanguage, PropertyValue.Create(string.Empty));
+            // ToString also returns true for an empty string, so empty must be excluded explicitly to fall back to partLang.
+            string noteLang = noteLangVal.ToString(out var nl) && !string.IsNullOrEmpty(nl) ? nl : partLang;
+
+            var result = new List<PhonemeEntry>();
+            string[] symbols = durPred.G2P(lyric, noteLang);
+            if (symbols.Length == 0) return result;
+
+            foreach (var sym in symbols)
+            {
+                if (string.IsNullOrEmpty(sym)) continue;
+                result.Add(new PhonemeEntry
+                {
+                    Symbol = sym,
+                    IsVowel = durPred.IsVowel(sym),
+                    IsGlide = durPred.IsGlide(sym),
+                });
+            }
+            return result;
+        }
+        catch { return new List<PhonemeEntry>(); } // best effort: any failure yields an empty list
+    }
+
     // 根据区间找出应处理的 piece 集合（NaN 表示全曲）
     IEnumerable<Piece> PiecesInRange(double start, double end)
     {
@@ -178,9 +225,15 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can
             {
                 int rate = rendered.SampleRate;
 
+                // 用渲染后的音素边界精化过渡区间：使过渡在追溯音素内部进行
+                double stitchStart = mAffectedStartTime;
+                double stitchEnd = mAffectedEndTime;
+                RefineStitchRange(rendered.Phonemes,
+                    ref stitchStart, ref stitchEnd, piece.FrameSec);
+
                 // 区段式音频拼贴：仅替换受影响的 time range，其余保持旧音频不变
                 var stitchedAudio = StitchAudio(rendered.Audio, piece.CachedAudio,
-                    rendered.StartTime, rate, mAffectedStartTime, mAffectedEndTime);
+                    rendered.StartTime, rate, stitchStart, stitchEnd);
 
                 // 缓存旧音频供下次拼贴
                 piece.CachedAudio = stitchedAudio;
@@ -295,6 +348,7 @@ sealed record RenderResultEx(float[] Audio, double StartTime, int SampleRate,
         double frameSec = (double)hop / sr;
         int head = DiffSingerFrames.HeadFrames;
         int numMelBins = models.NumMelBins;
+        piece.FrameSec = frameSec;
 
         string partLang = snapshot.PartProperties.GetString(KeyLanguage, string.Empty);
         string speaker = snapshot.PartProperties.GetString(KeySpeaker, mConfig.Speakers.Count > 0 ? mConfig.Speakers[0] : string.Empty);
@@ -366,16 +420,18 @@ sealed record RenderResultEx(float[] Audio, double StartTime, int SampleRate,
             langs[i + 1] = models.TryGetLanguage(PhonemeLang(phones[i].Symbol), out var lid) ? lid : 0;
         }
 
-        // 逐帧 note 音高回退
+        // 逐帧 note 音高回退（直接按 note 时间区间查找，不受 -/+ 延音符无音素的影响）
         var framePitch = new double[nFrames];
-        int fi = 0;
-        for (int seg = 0; seg < nTokens; seg++)
         {
-            int ni = seg == 0 ? phones[0].NoteIndex
-                : seg == nTokens - 1 ? phones[^1].NoteIndex
-                : phones[seg - 1].NoteIndex;
-            int pitch = notes[ni].Pitch;
-            for (int k = 0; k < durations[seg]; k++) framePitch[fi++] = pitch;
+            int ni = 0;
+            for (int f = 0; f < nFrames; f++)
+            {
+                double t = frameTimes[f];
+                // 找出包含当前帧的 note（延音符未产生音素，但 pitch 仍需跟随它的音高）
+                while (ni < notes.Count - 1 && t >= notes[ni + 1].StartTime)
+                    ni++;
+                framePitch[f] = notes[ni].Pitch;
+            }
         }
 
         // —— 自动音高预测（仅在 needPitchPredict 时跑；否则复用缓存）——
@@ -459,6 +515,7 @@ void AddF(string name, float[] data, int[] dims)
         AddF("f0", f0, new[] { 1, nFrames });
 
         // —— variance：使用（缓存的）预测 + 用户 delta 合成喂声学 ——
+        // 回显显示最终喂声学的值（预测+包络加权），包络修改时实参同步跟随
         var varReadback = new Dictionary<string, IReadOnlyList<Point>>();
         foreach (var spec in Variances)
         {
@@ -467,11 +524,12 @@ void AddF(string name, float[] data, int[] dims)
                 ? auto.Evaluator.Evaluate(frameTimes)
                 : null;
 
+            float[] combined = CombineVariance(spec, predicted, user, nFrames);
             if (ac.InputMetadata.ContainsKey(spec.Key))
-                AddF(spec.Key, CombineVariance(spec, predicted, user, nFrames), new[] { 1, nFrames });
+                AddF(spec.Key, combined, new[] { 1, nFrames });
 
             if (spec.Use(mConfig) && spec.Predict(mConfig) && predicted != null)
-                varReadback[spec.Key] = BuildReadbackSegment(spec, predicted, frameTimes, nFrames);
+                varReadback[spec.Key] = BuildReadbackSegment(spec, combined, frameTimes, nFrames, spHeadFrames, spTailFrames);
         }
 
         // —— gender / velocity ——
@@ -554,6 +612,61 @@ void AddF(string name, float[] data, int[] dims)
             melDims, finalMel);
     }
 
+    // —— Refine the stitch (transition) range using phoneme boundaries ——
+    // Editing [C] in [A B C D] re-renders [B C D]; the transitions then take place inside B and D.
+    // Leading transition: boundary phoneme start + 3 frames; trailing transition: boundary phoneme end - 3 frames.
+    static void RefineStitchRange(IReadOnlyList<SynthesizedPhoneme> newPhonemes,
+        ref double stitchStart, ref double stitchEnd, double frameSec)
+    {
+        if (double.IsNaN(stitchStart) || double.IsNaN(stitchEnd))
+            return; // a NaN bound is left untouched
+        double margin = 3 * frameSec;
+
+        // Leading boundary: take the phoneme *before* the one containing stitchStart (the retraced neighbour) and use its start + 3 frames
+        int startIdx = -1;
+        for (int i = 0; i < newPhonemes.Count; i++)
+        {
+            if (newPhonemes[i].StartTime <= stitchStart && newPhonemes[i].EndTime >= stitchStart)
+            { startIdx = i; break; }
+        }
+        // If the phoneme found is the first one (no predecessor), use it itself
+        if (startIdx >= 0)
+        {
+            int boundaryIdx = startIdx > 0 ? startIdx - 1 : startIdx;
+            // But if the start phoneme is already inside the edited region (stitchStart > its start), the previous phoneme is the boundary
+            // If the start phoneme begins after stitchStart we landed on the first phoneme past the boundary, so its predecessor is the boundary
+            if (newPhonemes[startIdx].StartTime > stitchStart && startIdx > 0)
+                boundaryIdx = startIdx - 1; // NOTE(review): the search loop only accepts StartTime <= stitchStart, so this branch looks unreachable — confirm
+            stitchStart = newPhonemes[boundaryIdx].StartTime + margin;
+            // Keep the point inside this phoneme: if start + margin passes end - margin, fall back to start + half a margin
+            if (stitchStart > newPhonemes[boundaryIdx].EndTime - margin)
+                stitchStart = newPhonemes[boundaryIdx].StartTime + margin * 0.5;
+        }
+
+        // Trailing boundary: take the phoneme *after* the one containing stitchEnd (the retraced neighbour) and use its end - 3 frames
+        int endIdx = -1;
+        for (int i = newPhonemes.Count - 1; i >= 0; i--)
+        {
+            if (newPhonemes[i].StartTime <= stitchEnd && newPhonemes[i].EndTime >= stitchEnd)
+            { endIdx = i; break; }
+        }
+        if (endIdx >= 0)
+        {
+            int boundaryIdx = endIdx < newPhonemes.Count - 1 ? endIdx + 1 : endIdx;
+            if (newPhonemes[endIdx].EndTime < stitchEnd && endIdx < newPhonemes.Count - 1)
+                boundaryIdx = endIdx + 1; // NOTE(review): the search loop only accepts EndTime >= stitchEnd, so this branch looks unreachable — confirm
+            stitchEnd = newPhonemes[boundaryIdx].EndTime - margin;
+            if (stitchEnd < newPhonemes[boundaryIdx].StartTime + margin)
+                stitchEnd = newPhonemes[boundaryIdx].EndTime - margin * 0.5;
+        }
+
+        if (stitchStart >= stitchEnd) // refined range collapsed: report it as NaN/NaN (presumably treated as "no valid range" by the stitcher — confirm)
+        {
+            stitchStart = double.NaN;
+            stitchEnd = double.NaN;
+        }
+    }
+
     // —— 区段式 mel 拼贴 ——
     // 将新 mel 的「受影响的 time range」替换到旧 mel 中，边界做 3 帧交叉过渡。
     // 旧 mel 为 null 或区间无效时直接返回新 mel。
@@ -755,13 +868,29 @@ Func<double, double> GenderConvert()
     // VELC convert（OpenUtau DiffSingerRenderer）：对数标度，100 = 原速，每 +100 速度 ×2。
     static double SpeedConvert(double x) => Math.Pow(2, (x - 100) / 100);
 
-    // 回显段：纯预测值（不含用户编辑），clamp 到声学值域，逐帧 (全局秒, 值)。
-    static List<Point> BuildReadbackSegment(VarianceSpec spec, float[] predicted, double[] frameTimes, int n)
+    // 回显段：最终值（含用户包络），clamp 到声学值域，逐帧 (全局秒, 值)。
+    // 整个 SP 段做透明度过渡：外边界 → 声学最小值（背景=不可见），内边界 → 100%（实际值）。
+    // 注意：pitch 回显直接排除 SP 帧（不画），此处用渐变使曲线从背景平滑浮现/消失。
+    static List<Point> BuildReadbackSegment(VarianceSpec spec, float[] finalValues, double[] frameTimes, int n,
+        int headSpFrames = 0, int tailSpFrames = 0)
     {
+        float fadeTarget = (float)spec.AcousticMin; // 声学最小值 = 曲线不可见的背景值
         var points = new List<Point>(n);
         for (int f = 0; f < n; f++)
         {
-            float x = f < predicted.Length ? predicted[f] : predicted[^1];
+            float x = f < finalValues.Length ? finalValues[f] : finalValues[^1];
+            // 前 SP：整个 SP 段从外边界（f=0）到内边界线性渐入（0% → 100%）
+            if (f < headSpFrames)
+            {
+                float t = (float)(f + 1) / headSpFrames;
+                x = x * t + fadeTarget * (1 - t);
+            }
+            // 后 SP：整个 SP 段从内边界到外边界线性渐出（100% → 0%）
+            if (f >= n - tailSpFrames)
+            {
+                float t = (float)(n - f) / tailSpFrames;
+                x = x * t + fadeTarget * (1 - t);
+            }
             points.Add(new Point(frameTimes[f], Math.Clamp(x, spec.AcousticMin, spec.AcousticMax)));
         }
         return points;
@@ -780,7 +909,9 @@ static string PickVowelSymbol(VoiceModels models, string lang)
     public IReadOnlyList<IReadOnlyList<Point>> SynthesizedPitch
         => mPieces.Where(p => p.PitchReadback.Count > 0).Select(p => p.PitchReadback).ToList();
 
-    // 回显产物（数据线程发布、可跨线程读）：按声明的回显轨 key 聚合各 piece 的纯预测段（每 piece 一段、段间断开）。
+    // 回显产物（数据线程发布、可跨线程读）：按声明的回显轨 key 聚合各 piece 的预测段。
+    // 每 piece 整段作为一个 segment，用多 GradientStop 的 LinearGradientBrush 实现像素级平滑透明度。
+    // 透明度轮廓：head SP 0%→25%, body 25%, tail SP 25%→0%，无段间边界。
     public IReadOnlyMap<string, SynthesizedParameter> SynthesizedParameters
     {
         get
@@ -789,11 +920,68 @@ public IReadOnlyMap<string, SynthesizedParameter> SynthesizedParameters
             foreach (var kvp in mReadbackConfigs)
             {
                 var segments = new List<IReadOnlyList<Point>>();
+                var stopSets = new List<IReadOnlyList<Point>>();
                 foreach (var piece in mPieces)
-                    if (piece.VarianceReadback.TryGetValue(kvp.Key, out var segment) && segment.Count > 0)
-                        segments.Add(segment);
+                {
+                    if (!piece.VarianceReadback.TryGetValue(kvp.Key, out var allPoints) || allPoints.Count == 0)
+                        continue;
+
+                    var phones = piece.CachedPhones;
+                    if (phones == null || phones.Count == 0 || allPoints.Count < 2)
+                    {
+                        segments.Add(allPoints);
+                        stopSets.Add([new(0, 0.25), new(1, 0.25)]);
+                        continue;
+                    }
+
+                    double bodyStart = phones[0].StartTime;
+                    double bodyEnd = phones[^1].EndTime;
+                    int headEnd = 0, bodyEndIdx = allPoints.Count;
+                    while (headEnd < allPoints.Count && allPoints[headEnd].X < bodyStart) headEnd++;
+                    int bodyStartIdx = headEnd;
+                    while (bodyEndIdx > 0 && allPoints[bodyEndIdx - 1].X > bodyEnd) bodyEndIdx--;
+                    int total = allPoints.Count;
+                    bool hasHead = headEnd > 0;
+                    bool hasTail = bodyEndIdx < total;
+
+                    // 整段作为一个 segment
+                    segments.Add(allPoints);
+
+                    // 构造 GradientStop，按有无 head/tail 调整
+                    var stops = new List<Point>(4);
+                    if (hasHead)
+                    {
+                        stops.Add(new(0.0, 0.0));
+                        double headRatio = (double)headEnd / total;
+                        stops.Add(new(Math.Clamp(headRatio, 0, 1), 0.25));
+                    }
+                    else
+                    {
+                        stops.Add(new(0.0, 0.25));
+                    }
+
+                    double bodyEndRatio = (double)bodyEndIdx / total;
+                    if (hasTail)
+                    {
+                        stops.Add(new(Math.Clamp(bodyEndRatio, 0, 1), 0.25));
+                        stops.Add(new(1.0, 0.0));
+                    }
+                    else
+                    {
+                        stops.Add(new(1.0, 0.25));
+                    }
+
+                    // 移除相邻等偏移的退化 stop
+                    var dedup = new List<Point>(stops.Count);
+                    for (int i = 0; i < stops.Count; i++)
+                        if (i == stops.Count - 1 || Math.Abs(stops[i].X - stops[i + 1].X) > 0.0001)
+                            dedup.Add(stops[i]);
+                    while (dedup.Count < 2) dedup.Add(new(1, 0.25));
+
+                    stopSets.Add(dedup);
+                }
                 if (segments.Count > 0)
-                    map.Add(kvp.Key, new SynthesizedParameter { Segments = segments });
+                    map.Add(kvp.Key, new SynthesizedParameter { Segments = segments, SegmentOpacityStops = stopSets });
             }
             return map;
         }
@@ -971,7 +1159,14 @@ void UnsubscribeNote(ILiveNote note)
 
     void MarkAllDirtyAndResegment()
     {
-        foreach (var piece in mPieces) { piece.Dirty = true; piece.Failed = false; }
+        foreach (var piece in mPieces)
+        {
+            piece.Dirty = true; piece.Failed = false;
+            // Part properties (language, etc.) changed → drop the phoneme/variance caches so a fresh G2P result is used
+            piece.CachedPhones = null;
+            piece.CachedVarianceCurves = default;
+            // The pitch cache is kept
+        }
         mNeedResegment = true;
     }
 
@@ -1023,7 +1218,8 @@ sealed class Piece
         public float[]? CachedMel;
         public int[]? CachedMelDims;
         public float[]? CachedAudio;
-        // piece 级 RedrawPitch 请求标记
         public bool RedrawPitchRequested;
+        // 帧时长（秒），用于过渡区间计算
+        public double FrameSec;
     }
 }
diff --git a/DiffSingerVoiceEngine.cs b/DiffSingerVoiceEngine.cs
index cfa1a3b..888c8c5 100644
--- a/DiffSingerVoiceEngine.cs
+++ b/DiffSingerVoiceEngine.cs
@@ -135,6 +135,46 @@ public ObjectConfig GetNotePropertyConfig(INotePropertyContext context)
         foreach (var kvp in baseConfig.Properties)
             props.Add(kvp.Key, kvp.Value);
 
+        // —— 音素编辑器 ——
+        var phonemesJson = context.NoteProperties.GetString("_phonemes", "[]");
+        var phonemeEntries = ParsePhonemeJson(phonemesJson);
+        if (phonemeEntries.Count == 0 && c.Languages.Count > 0 && !double.IsNaN(selStart))
+        {
+            // 未自定义音素时，尝试从会话获取默认 G2P 音素
+            double midTime = (selStart + selEnd) / 2;
+            var session = FindSessionByVoiceId(voiceId);
+            if (session != null)
+            {
+                // 从上下文快照取段落语言（保证段落语言变更后能传新值给 G2P）
+                string snapPartLang = context.PartProperties.GetString(DiffSingerDeclarations.KeyLanguage, string.Empty);
+                phonemeEntries = session.GetDefaultPhonemesForNote(midTime, snapPartLang);
+            }
+        }
+        if (c.Languages.Count > 0)
+        {
+            props.Add("_phoneme_editor", new PhonemeEditorConfig
+            {
+                DisplayText = L.Tr("Phonemes"),
+                Phonemes = phonemeEntries,
+                AvailableLanguages = c.Languages,
+                LanguageDataKey = DiffSingerDeclarations.KeyLanguage,
+                CanDeleteConsonant = phonemeEntries.Count(e => !e.IsVowel) > 1,
+                CanDeleteVowel = phonemeEntries.Count(e => e.IsVowel) > 1,
+                OnChanged = _ =>
+                {
+                    // 音素被编辑 → 触发区间重渲染
+                    var s = FindSessionByVoiceId(voiceId);
+                    if (s != null)
+                    {
+                        if (hasSelection)
+                            s.RequestRetakeScoped(selStart, selEnd);
+                        else
+                            s.RequestRetake();
+                    }
+                },
+            });
+        }
+
         // 追加 Retake 和 Redraw Pitch 按钮（使用选中音符的区间而非全曲）
         props.Add("_retake", new ButtonConfig
         {
@@ -276,6 +316,38 @@ static void EnsureDefaultDirectory()
         catch { }
     }
 
+    // Parses the phoneme JSON stored on a note: [{"s":"ja/b","v":false},...]. Best-effort: malformed input returns the entries parsed so far.
+    static List<PhonemeEntry> ParsePhonemeJson(string json)
+    {
+        var result = new List<PhonemeEntry>();
+        if (string.IsNullOrEmpty(json) || json.Length < 2) return result;
+        try
+        {
+            // Minimal hand-rolled scan (avoids adding a JSON dependency): visit each {...} object after the opening '['
+            int i = json.IndexOf('[');
+            if (i < 0) return result;
+            while ((i = json.IndexOf('{', i)) >= 0)
+            {
+                int close = json.IndexOf('}', i);
+                if (close < 0) break; // unterminated object: stop cleanly instead of relying on Substring throwing
+                string entry = json.Substring(i, close - i + 1);
+                var phoneme = new PhonemeEntry();
+                int sIdx = entry.IndexOf("\"s\":\"", StringComparison.Ordinal); // ordinal: this is a machine-readable key, not linguistic text
+                if (sIdx >= 0)
+                {
+                    sIdx += 5; // skip the 5-character key prefix to reach the first character of the symbol
+                    int eIdx = entry.IndexOf('"', sIdx);
+                    if (eIdx > sIdx) phoneme.Symbol = entry.Substring(sIdx, eIdx - sIdx);
+                }
+                phoneme.IsVowel = entry.Contains("\"v\":true") || entry.Contains("\"v\": true");
+                result.Add(phoneme);
+                i = close + 1; // continue scanning after this object
+            }
+        }
+        catch { } // last-resort guard kept on purpose: an unexpected failure still returns what was parsed so far
+        return result;
+    }
+
     // 不可变扫描结果，整体替换发布：get 侧读引用、扫描侧建好新实例后一次性换上，无需锁。
     sealed record State(
         IReadOnlyOrderedMap<string, VoiceSourceInfo> Infos,

From ef0b20c2a99e363c25eaa075353c0918c392b1dd Mon Sep 17 00:00:00 2001
From: tachengP <2638591622@qq.com>
Date: Sun, 21 Jun 2026 19:32:27 +0800
Subject: [PATCH 3/3] =?UTF-8?q?=E4=BF=AE=E5=A4=8DDirectML=E5=B9=B6?=
 =?UTF-8?q?=E8=A1=8C=EF=BC=8C=E4=BF=AE=E5=A4=8D=E5=BB=B6=E9=9F=B3=E7=AC=A6?=
 =?UTF-8?q?=E3=80=90+=E3=80=91=E3=80=90-=E3=80=91=E5=8F=B7=E5=B7=A5?=
 =?UTF-8?q?=E4=BD=9Cbug=EF=BC=8C=E8=AE=BE=E7=BD=AE=E6=96=B0=E5=A2=9E?=
 =?UTF-8?q?=E6=9C=80=E5=A4=A7=E6=B8=B2=E6=9F=93=E4=BB=BB=E5=8A=A1=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                    |  1 +
 DiffSingerModels.cs           | 14 ++++++++++++++
 DiffSingerPhonemizer.cs       | 31 ++++++++++++++++++++++---------
 DiffSingerPitch.cs            |  4 ++--
 DiffSingerPredictor.cs        | 13 +++++++++++++
 DiffSingerSynthesisSession.cs | 12 +++++++++---
 DiffSingerVariance.cs         |  4 ++--
 DiffSingerVoiceEngine.cs      | 26 +++++++++++++++++++++++++-
 8 files changed, 88 insertions(+), 17 deletions(-)

diff --git a/.gitignore b/.gitignore
index 04d0096..e1b0701 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,4 @@ obj/
 # 打包产物
 *.tlx
 /build/
+/tools/CheckModel
diff --git a/DiffSingerModels.cs b/DiffSingerModels.cs
index f863c3b..d38453b 100644
--- a/DiffSingerModels.cs
+++ b/DiffSingerModels.cs
@@ -120,9 +120,23 @@ public sealed class VoiceModels : IDisposable
     readonly Dictionary<string, DiffSingerPredictor?> mPredictors = new(StringComparer.Ordinal);
     readonly object mPredictorLock = new();
 
+    readonly object mAcousticLock = new();
+    readonly object mVocoderLock = new();
+
     public InferenceSession Acoustic { get; }
     public InferenceSession Vocoder { get; }
 
+    // Thread-safe wrapper for acoustic inference (the DirectML EP requires Run calls to be serialized)
+    public IDisposableReadOnlyCollection<DisposableNamedOnnxValue> RunAcoustic(List<NamedOnnxValue> inputs)
+    {
+        lock (mAcousticLock) return Acoustic.Run(inputs); // mAcousticLock is held for the whole Run call
+    }
+
+    public IDisposableReadOnlyCollection<DisposableNamedOnnxValue> RunVocoder(List<NamedOnnxValue> inputs)
+    {
+        lock (mVocoderLock) return Vocoder.Run(inputs); // vocoder Run calls are serialized under mVocoderLock (separate from the acoustic lock)
+    }
+
     public int HiddenSize => mConfig.HiddenSize;
     public int HopSize => mConfig.HopSize;
     public int SampleRate => mConfig.SampleRate;
diff --git a/DiffSingerPhonemizer.cs b/DiffSingerPhonemizer.cs
index 03ac60b..95fdb8b 100644
--- a/DiffSingerPhonemizer.cs
+++ b/DiffSingerPhonemizer.cs
@@ -53,15 +53,30 @@ public static List<PhonemeSpan> Phonemize(
             noteSymbolCount[i] = symbols.Length;
 
             // 连音符：不产生新音素，只延展前音的时长（通过 group 自然吸收）
-            // (+) 额外在起始点插入一个空 vowel 边界组，强制对齐
+            // 连音符：- 不产生边界（前组自然吸收），+ 拆前组末音素到独立组强制 dur 边界
             if (symbols.Length == 0 && (lyric == "-" || lyric == "+"))
             {
                 if (lyric == "+")
                 {
-                    // 插入一个空韵核组作为强制对齐边界
-                    groups.Add(new Group(note.StartTime, note.Pitch));
+                    // 把前一个非空组的最后一个音素拆分到 + 组（仅当 >1 音素，否则退化为空组边界）
+                    string moved = "AP";
+                    bool splitted = false;
+                    for (int gi = groups.Count - 1; gi >= 0; gi--)
+                    {
+                        if (groups[gi].Phonemes.Count > 1)
+                        {
+                            int lastIdx = groups[gi].Phonemes.Count - 1;
+                            moved = groups[gi].Phonemes[lastIdx];
+                            groups[gi].Phonemes.RemoveAt(lastIdx);
+                            splitted = true;
+                            break;
+                        }
+                    }
+                    var g = new Group(note.StartTime, note.Pitch);
+                    if (splitted) g.Phonemes.Add(moved);
+                    groups.Add(g);
                 }
-                // - 则完全不做任何事，前组自然吸收时长
+                // - 则完全不做任何事，前组自然吸收时长，不影响 dur
                 notePhIndex.Add(notePhIndex[^1]);
                 continue;
             }
@@ -262,7 +277,7 @@ static double[] RunDur(DiffSingerPredictor dur, long[] tokens, long[] langs,
         if (dur.Linguistic.InputMetadata.ContainsKey("languages"))
             lingInputs.Add(Nv("languages", langs, nTokens));
 
-        using var lingOut = dur.Linguistic.Run(lingInputs);
+        using var lingOut = dur.RunLinguistic(lingInputs);
         var enc = lingOut.First(v => v.Name == "encoder_out").AsTensor<float>();
         var mask = lingOut.First(v => v.Name == "x_masks").AsTensor<bool>();
         var encDense = new DenseTensor<float>(enc.ToArray(), enc.Dimensions.ToArray());
@@ -272,15 +287,13 @@ static double[] RunDur(DiffSingerPredictor dur, long[] tokens, long[] langs,
         var spk = new float[nTokens * hidden];
         for (int i = 0; i < nTokens; i++) Array.Copy(emb, 0, spk, i * hidden, hidden);
 
-        var durModel = dur.Model("dur");
-        var durInputs = new List<NamedOnnxValue>
+        using var durOut = dur.RunModel("dur", new List<NamedOnnxValue>
         {
             NamedOnnxValue.CreateFromTensor("encoder_out", encDense),
             NamedOnnxValue.CreateFromTensor("x_masks", maskDense),
             Nv("ph_midi", phMidi.Select(x => (long)x).ToArray(), nTokens),
             NamedOnnxValue.CreateFromTensor("spk_embed", new DenseTensor<float>(spk, new[] { 1, nTokens, hidden })),
-        };
-        using var durOut = durModel.Run(durInputs);
+        });
         return durOut.First(v => v.Name == "ph_dur_pred").AsTensor<float>().Select(x => (double)x).ToArray();
     }
 
diff --git a/DiffSingerPitch.cs b/DiffSingerPitch.cs
index 9e2dee8..ceccb79 100644
--- a/DiffSingerPitch.cs
+++ b/DiffSingerPitch.cs
@@ -52,7 +52,7 @@ public static class DiffSingerPitch
             var langs = phones.Select(p => v.LangId(PhonemeLanguage(p.Symbol))).Prepend(0L).Append(0L).ToArray();
             lingInputs.Add(NvL("languages", langs, nTokens));
         }
-        using var lingOut = v.Linguistic.Run(lingInputs);
+        using var lingOut = v.RunLinguistic(lingInputs);
         var enc = lingOut.First(o => o.Name == "encoder_out").AsTensor<float>();
         var encDense = new DenseTensor<float>(enc.ToArray(), enc.Dimensions.ToArray());
 
@@ -95,7 +95,7 @@ public static class DiffSingerPitch
             inputs.Add(NamedOnnxValue.CreateFromTensor("note_rest",
                 new DenseTensor<bool>(noteRest, new[] { 1, noteRest.Length })));
 
-        using var outputs = model.Run(inputs);
+        using var outputs = v.RunModel("pitch", inputs);
         return outputs.First().AsTensor<float>().ToArray();
     }
 
diff --git a/DiffSingerPredictor.cs b/DiffSingerPredictor.cs
index 270a657..0457298 100644
--- a/DiffSingerPredictor.cs
+++ b/DiffSingerPredictor.cs
@@ -27,12 +27,25 @@ public sealed class DiffSingerPredictor : IDisposable
     readonly Dictionary<string, Dictionary<string, string[]>> mEntryCache = new(StringComparer.Ordinal);
     readonly Dictionary<string, string> mSymbolTypes = new(StringComparer.Ordinal);  // symbol → type（合并 dsdict）
     readonly object mLock = new();
+    // 推理锁：DirectML EP 的 InferenceSession.Run() 非线程安全，串行化所有 Run 调用。
+    readonly object mRunLock = new();
 
     public InferenceSession Linguistic { get; }
     public int HiddenSize => mHidden;
     // linguistic 是否吃 word_div/word_dur（dsdur/dsvariance 词边界；dspitch 用已知 ph_dur）。
     public bool LinguisticUsesWordBoundary { get; }
 
+    // Thread-safe inference wrapper (the DirectML EP requires Run calls to be serialized)
+    public IDisposableReadOnlyCollection<DisposableNamedOnnxValue> RunLinguistic(List<NamedOnnxValue> inputs)
+    {
+        lock (mRunLock) return Linguistic.Run(inputs); // mRunLock is held for the whole Run call
+    }
+
+    public IDisposableReadOnlyCollection<DisposableNamedOnnxValue> RunModel(string role, List<NamedOnnxValue> inputs)
+    {
+        lock (mRunLock) return mModels[role].Run(inputs); // same lock as RunLinguistic; NOTE(review): mModels[role] presumably throws for an unknown role — confirm
+    }
+
     public DiffSingerPredictor(string dir, Func<string, InferenceSession> load)
     {
         mDir = dir;
diff --git a/DiffSingerSynthesisSession.cs b/DiffSingerSynthesisSession.cs
index 2547440..25c9745 100644
--- a/DiffSingerSynthesisSession.cs
+++ b/DiffSingerSynthesisSession.cs
@@ -48,14 +48,17 @@ enum RenderMode { Normal, Retake }
     double mAffectedStartTime = double.NaN;
     double mAffectedEndTime = double.NaN;
 
+    readonly SemaphoreSlim mRenderSemaphore;
+
     public DiffSingerSynthesisSession(VoicebankConfig config, ISynthesisContext context,
-        string voiceId, DiffSingerModelCache modelCache, int samplingSteps)
+        string voiceId, DiffSingerModelCache modelCache, int samplingSteps, SemaphoreSlim renderSemaphore)
     {
         mConfig = config;
         mContext = context;
         mVoiceId = voiceId;
         mModelCache = modelCache;
         mSamplingSteps = samplingSteps;
+        mRenderSemaphore = renderSemaphore;
 
         mAutomationConfigs = BuildAutomationConfigs(config);
         mReadbackConfigs = BuildReadbackConfigs(config);
@@ -218,6 +221,8 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can
         StatusChanged?.Invoke();
 
         var report = new Progress<double>(p => { piece.Progress = p; StatusChanged?.Invoke(); });
+
+        await mRenderSemaphore.WaitAsync(cancellation);
         try
         {
             var rendered = await Task.Run(() => Render(snapshot, piece.Notes, piece, report, cancellation), CancellationToken.None);
@@ -271,6 +276,7 @@ public async Task SynthesizeNext(SynthesisSegment segment, CancellationToken can
         }
         finally
         {
+            mRenderSemaphore.Release();
             piece.Synthesizing = false;
             StatusChanged?.Invoke();
         }
@@ -556,7 +562,7 @@ void AddF(string name, float[] data, int[] dims)
         }
 
         // —— 声学模型：产 mel ——
-        using var melOut = ac.Run(inputs);
+        using var melOut = models.RunAcoustic(inputs);
         var melTensor = melOut.First(v => v.Name == "mel").AsTensor<float>();
         var melDims = melTensor.Dimensions.ToArray();
         var newMel = melTensor.ToArray();
@@ -591,7 +597,7 @@ void AddF(string name, float[] data, int[] dims)
         };
         if (voc.InputMetadata.ContainsKey("f0"))
             vInputs.Add(NamedOnnxValue.CreateFromTensor("f0", new DenseTensor<float>(f0, new[] { 1, nFrames })));
-        using var wavOut = voc.Run(vInputs);
+        using var wavOut = models.RunVocoder(vInputs);
         var audio = wavOut.First(v => v.Name == "waveform").AsTensor<float>().ToArray();
         progress?.Report(1.0);
 
diff --git a/DiffSingerVariance.cs b/DiffSingerVariance.cs
index 3c07e23..c74bb8f 100644
--- a/DiffSingerVariance.cs
+++ b/DiffSingerVariance.cs
@@ -59,7 +59,7 @@ public static VarianceCurves Predict(
         }
         if (v.Linguistic.InputMetadata.ContainsKey("languages"))
             lingInputs.Add(NvL("languages", langs, nTokens));
-        using var lingOut = v.Linguistic.Run(lingInputs);
+        using var lingOut = v.RunLinguistic(lingInputs);
         var enc = lingOut.First(o => o.Name == "encoder_out").AsTensor<float>();
         var encDense = new DenseTensor<float>(enc.ToArray(), enc.Dimensions.ToArray());
 
@@ -101,7 +101,7 @@ void Channel(bool predict, string name)
                 new DenseTensor<float>(spk, new[] { 1, totalFrames, hidden })));
         }
 
-        using var outputs = model.Run(inputs);
+        using var outputs = v.RunModel("variance", inputs);
         float[]? Out(bool predict, string name)
             => predict ? outputs.First(o => o.Name == name).AsTensor<float>().ToArray() : null;
         return new VarianceCurves(
diff --git a/DiffSingerVoiceEngine.cs b/DiffSingerVoiceEngine.cs
index 888c8c5..c60055c 100644
--- a/DiffSingerVoiceEngine.cs
+++ b/DiffSingerVoiceEngine.cs
@@ -18,6 +18,7 @@ public sealed class DiffSingerVoiceEngine : IVoiceEngine, IExtensionSettings
     const string KeyVoicebankDirs = "voicebank_dirs";
     const string KeyExecutionProvider = "execution_provider";
     const string KeySamplingSteps = "sampling_steps";
+    const string KeyMaxConcurrentRenderings = "max_concurrent_renderings";
 
     public IReadOnlyOrderedMap<string, VoiceSourceInfo> VoiceSourceInfos => mState.Infos;
 
@@ -33,6 +34,18 @@ public void Destroy()
         mModelCache = null;
     }
 
+    // —— 并发渲染限流（DirectML 多轨并发需要限制同时渲染的轨数）——
+    SemaphoreSlim mRenderSemaphore = new(1, 1);
+
+    internal void UpdateRenderSemaphore(int maxConcurrent)
+    {
+        // Do NOT dispose the replaced semaphore: existing sessions captured it at construction and would get ObjectDisposedException on WaitAsync/Release.
+        maxConcurrent = Math.Max(1, maxConcurrent); // SemaphoreSlim requires maxCount >= 1
+        Interlocked.Exchange(ref mRenderSemaphore, new SemaphoreSlim(maxConcurrent, maxConcurrent)); // the new limit applies to sessions created from now on
+    }
+
+    internal SemaphoreSlim RenderSemaphore => mRenderSemaphore;
+
     // —— 会话注册表（引擎级，跨会话共享）——
     readonly List<WeakReference<DiffSingerSynthesisSession>> mSessions = new();
 
@@ -79,7 +92,7 @@ public ISynthesisSession CreateSession(string voiceId, ISynthesisContext context
         // 推理走引擎级模型缓存（懒加载、按 voiceId 共享）；声明面（轨/面板）已上移到引擎方法、建会话前即填好。
         var config = ConfigFor(voiceId)!;
         var samplingSteps = mSettings.GetInt(KeySamplingSteps, 20);
-        var session = new DiffSingerSynthesisSession(config, context, voiceId, EnsureModelCache(), samplingSteps);
+        var session = new DiffSingerSynthesisSession(config, context, voiceId, EnsureModelCache(), samplingSteps, mRenderSemaphore);
         RegisterSession(session);
         return session;
     }
@@ -266,6 +279,15 @@ public ObjectConfig GetSettingsConfig(IExtensionSettingsContext context)
                     DefaultValue = 20, MinValue = 1, MaxValue = 1000, IsInteger = true,
                 }
             },
+            {
+                // DirectML 最大同时渲染轨数：默认 1（串行安全），提高可让 CPU 端并行加速。
+                KeyMaxConcurrentRenderings,
+                new SliderConfig
+                {
+                    DisplayText = L.Tr("Max concurrent renderings"),
+                    DefaultValue = 1, MinValue = 1, MaxValue = 8, IsInteger = true,
+                }
+            },
         };
         return new ObjectConfig { Properties = properties };
     }
@@ -273,6 +295,8 @@ public ObjectConfig GetSettingsConfig(IExtensionSettingsContext context)
     public void ApplySettings(PropertyObject settings)
     {
         mSettings = settings;
+        var maxConcurrent = mSettings.GetInt(KeyMaxConcurrentRenderings, 1); // falls back to 1 when the setting is absent
+        UpdateRenderSemaphore(maxConcurrent); // the render limiter is rebuilt on every settings apply
         Rescan();
     }