Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions docs/Model Support.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
[ERNIE](#ernie) | DiT | 2026 | Baidu | 8B | Minimal | Modern, intelligent, good quality, fast |
[HiDream O1](#hidream-o1) | "Pixel UiT" | 2026 | HiDream | 8B | Minimal | Modern, intelligent, fast, decent quality |
[Lens](#lens) | MMDiT | 2026 | Microsoft | 4B | Minimal | Modern, lightweight |
[PixelDiT](#pixeldit) | Pixel DiT | 2026 | NVIDIA | 1.3B | Minimal | Modern, fast, pixel-space |

Old or bad options are also tracked, listed via [Obscure Model Support](/docs/Obscure%20Model%20Support.md):

Expand Down Expand Up @@ -640,6 +641,21 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended
- **Steps:** For Turbo, `4` is recommended, `8` works well. For Base, `20` as normal.
- **Resolution:** Side length `1440` is the official default, but 1024 is a reasonable option. It retains coherence down to about 512 and up to about 2048.

# PixelDiT

- NVIDIA's [PixelDiT](<https://huggingface.co/Comfy-Org/PixelDiT>) is supported in SwarmUI!
- The smaller FP8 model can be downloaded here: [Comfy-Org/PixelDiT - fp8](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_mxfp8.safetensors>)
- Or the larger BF16 version: [Comfy-Org/PixelDiT - bf16](<https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/diffusion_models/pixeldit_1300m_1024px_bf16.safetensors>)
- Save in `diffusion_models`
- It does not use a VAE
- Uses the Gemma 2 2B text encoder, which will be downloaded and handled automatically
- **Parameters:**
- **Sampler:** Default is fine.
- **Scheduler:** Default is fine.
- **CFG Scale:** `4` is recommended.
- **Steps:** `30` is recommended.
- **Resolution:** Side length `1024` is the standard.

# Video Models

- Video models are documented in [Video Model Support](/docs/Video%20Model%20Support.md).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,9 @@ public static void AssignValuesFromRaw(JObject rawObjectInfo)
],
Schedulers = ["normal///Normal", "karras///Karras", "exponential///Exponential", "simple///Simple", "ddim_uniform///DDIM Uniform", "sgm_uniform///SGM Uniform", "turbo///Turbo (for turbo models, max 10 steps)", "align_your_steps///Align Your Steps (Model-specific behavior)", "beta///Beta", "linear_quadratic///Linear Quadratic (Mochi)", "ltxv///LTX-Video", "ltxv-image///LTXV-Image", "kl_optimal///KL Optimal (Nvidia AYS)", "flux2///Flux.2"];

/// <summary>Builds the list of selectable PiD decoder entries for the given session: every model visible to that session whose compat class ID is "pid", sorted by model name.
/// Each entry has the form "pidmodel-{name}///PiD Model: {name}" (presumably "value///display label", matching the other option lists in this class).</summary>
public static List<string> PidUpscaleModels(Session session)
{
    IEnumerable<string> pidEntries =
        from model in Program.MainSDModels.ListModelsFor(session)
        where model.ModelClass?.CompatClass?.ID == "pid"
        orderby model.Name
        select $"pidmodel-{model.Name}///PiD Model: {model.Name}";
    return [.. pidEntries];
}

public static List<string> IPAdapterModels = ["None"], IPAdapterWeightTypes = ["standard", "prompt is more important", "style transfer"];

public static List<string> GligenModels = ["None"], YoloModels = [], StyleModels = ["None"], SetClipDevices = ["cpu"];
Expand Down Expand Up @@ -752,7 +755,7 @@ public override void OnInit()
));
RefinerUpscaleMethod = T2IParamTypes.Register<string>(new("Refiner Upscale Method", "How to upscale the image, if upscaling is used.",
"pixel-lanczos", Group: T2IParamTypes.GroupRefiners, OrderPriority: -1, FeatureFlag: "comfyui", ChangeWeight: 1,
GetValues: (_) => UpscalerModels, DependNonDefault: T2IParamTypes.RefinerUpscale.Type.ID
GetValues: (session) => [.. UpscalerModels, .. PidUpscaleModels(session)], DependNonDefault: T2IParamTypes.RefinerUpscale.Type.ID
));
RefinerSamplerParam = T2IParamTypes.Register<string>(new("Refiner Sampler", SamplerParam.Type.Description + "\nThis is an override to only affect the Refine/Upscale stage.",
"euler", Toggleable: true, FeatureFlag: "comfyui", Group: T2IParamTypes.GroupRefinerOverrides, OrderPriority: -2,
Expand Down
13 changes: 10 additions & 3 deletions src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ public JArray FinalImageOut
/// <summary>If true, the generator is currently working on the refiner stage.</summary>
public bool IsRefinerStage = false;

/// <summary>If true, the generator is currently working on the pixel-decoder stage (for example, while a PiD pixel-decoder model is being loaded after the refiner), rather than the base or refiner stage.</summary>
public bool IsPixelDecoderStage = false;

/// <summary>If true, the generator is currently working on Image2Video.</summary>
public bool IsImageToVideo = false;

Expand Down Expand Up @@ -959,7 +962,7 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent
}
}
// TODO: Registry of model default preferences instead of this
else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens())
else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens() || IsPixelDiT() || IsPiD())
{
defscheduler ??= "simple";
}
Expand Down Expand Up @@ -2518,7 +2521,7 @@ public bool ShouldZeroNegative()
}

/// <summary>Creates a "CLIPTextEncode" or equivalent node for the given input, applying prompt-given conditioning modifiers as relevant.</summary>
public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, bool isPositive, string firstId = null, bool isRefiner = false, bool isVideo = false, bool isVideoSwap = false)
public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, bool isPositive, string firstId = null, bool isRefiner = false, bool isVideo = false, bool isVideoSwap = false, bool isPixelDecoder = false)
{
PromptRegion regionalizer = new(prompt);
string globalPromptText = regionalizer.GlobalPrompt;
Expand All @@ -2534,7 +2537,11 @@ public JArray CreateConditioning(string prompt, JArray clip, T2IModel model, boo
{
globalPromptText = $"{globalPromptText} {regionalizer.RefinerPrompt}";
}
else if (!isVideo && !isRefiner && !string.IsNullOrWhiteSpace(regionalizer.BasePrompt))
else if (isPixelDecoder && !string.IsNullOrWhiteSpace(regionalizer.PixelDecoderPrompt))
{
globalPromptText = $"{globalPromptText} {regionalizer.PixelDecoderPrompt}";
}
else if (!isVideo && !isRefiner && !isPixelDecoder && !string.IsNullOrWhiteSpace(regionalizer.BasePrompt))
{
globalPromptText = $"{globalPromptText} {regionalizer.BasePrompt}";
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ public bool IsKontext()
/// <summary>Checks whether the current model belongs to the Chroma Radiance compat class.</summary>
public bool IsChromaRadiance()
{
    return IsModelCompatClass(T2IModelClassSorter.CompatChromaRadiance);
}

/// <summary>Checks whether the current model belongs to the PixelDiT compat class.</summary>
public bool IsPixelDiT()
{
    return IsModelCompatClass(T2IModelClassSorter.CompatPixelDiT);
}

/// <summary>Checks whether the current model belongs to the PiD compat class.</summary>
public bool IsPiD()
{
    return IsModelCompatClass(T2IModelClassSorter.CompatPiD);
}

/// <summary>Checks whether the current model belongs to the HiDream-i1 compat class.</summary>
public bool IsHiDream()
{
    return IsModelCompatClass(T2IModelClassSorter.CompatHiDreamI1);
}

Expand Down Expand Up @@ -401,7 +407,7 @@ public WGNodeData EmptyImage(int width, int height, int batchSize, string id = n
["width"] = width
}, id), frames);
}
else if (IsChromaRadiance() || IsZetaChroma())
else if (IsChromaRadiance() || IsZetaChroma() || IsPixelDiT())
{
return resultImage(CreateNode("EmptyChromaRadianceLatentImage", new JObject()
{
Expand Down Expand Up @@ -657,6 +663,11 @@ public string GetGemma2Model()
return RequireClipModel("gemma_2_2b_fp16.safetensors", "https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/resolve/main/split_files/text_encoders/gemma_2_2b_fp16.safetensors", "29761442862f8d064d3f854bb6fabf4379dcff511a7f6ba9405a00bd0f7e2dbd", T2IParamTypes.GemmaModel);
}

/// <summary>Requests the Gemma 2 2B "it elm" FP8-scaled text encoder (hosted in the Comfy-Org PixelDiT repository) through RequireClipModel, associated with the Gemma model parameter, and returns whatever RequireClipModel yields for it.</summary>
public string GetGemma2_2bElmModel()
{
    const string fileName = "gemma_2_2b_it_elm_fp8_scaled.safetensors";
    const string downloadUrl = "https://huggingface.co/Comfy-Org/PixelDiT/resolve/main/text_encoders/gemma_2_2b_it_elm_fp8_scaled.safetensors";
    // NOTE(review): presumably the expected SHA-256 of the downloaded file - confirm against RequireClipModel.
    const string expectedHash = "87692b2ab1714028e29910ea645d96db656505ca0805051048d2298b225c02d1";
    return RequireClipModel(fileName, downloadUrl, expectedHash, T2IParamTypes.GemmaModel);
}

public string GetGemma3_12bModel()
{
return RequireClipModel("gemma_3_12B_it.safetensors", "https://huggingface.co/Comfy-Org/ltx-2/resolve/main/split_files/text_encoders/gemma_3_12B_it_fp4_mixed.safetensors", "aaca463d11e6d8d2a4bdb0d6299214c15ef78a3f73e0ef8113d5a9d0219b3f6d", T2IParamTypes.GemmaModel);
Expand Down Expand Up @@ -907,7 +918,7 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)
{
dtype = "default";
}
else if (IsZImage() || IsZetaChroma() || IsAnima() || IsLens()) // Model is small and dense, so trust user preferred download format
else if (IsZImage() || IsZetaChroma() || IsAnima() || IsLens() || IsPixelDiT() || IsPiD()) // Model is small and dense, so trust user preferred download format
{
dtype = "default";
}
Expand Down Expand Up @@ -1139,6 +1150,11 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)
helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFluxVAE, "flux-1", "flux-ae");
}
}
else if (IsPixelDiT() || IsPiD())
{
helpers.LoadClip("pixeldit", helpers.GetGemma2_2bElmModel());
LoadingVAE = CreateVAELoader("pixel_space");
}
else if (IsHiDream())
{
string loaderType = "QuadrupleCLIPLoader";
Expand Down
73 changes: 68 additions & 5 deletions src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorSteps.cs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,11 @@ public static void Register()
(g.LoadingModel, g.LoadingClip) = g.LoadLorasForConfinement(0, g.LoadingModel, g.LoadingClip);
if (g.IsRefinerStage)
{
(g.LoadingModel, g.LoadingClip) = g.LoadLorasForConfinement(1, g.LoadingModel, g.LoadingClip);
(g.LoadingModel, g.LoadingClip) = g.LoadLorasForConfinement(T2IParamInput.SectionID_Refiner, g.LoadingModel, g.LoadingClip);
}
else if (g.IsPixelDecoderStage)
{
(g.LoadingModel, g.LoadingClip) = g.LoadLorasForConfinement(T2IParamInput.SectionID_PixelDecoder, g.LoadingModel, g.LoadingClip);
}
else if (g.IsImageToVideoSwap)
{
Expand Down Expand Up @@ -1452,10 +1456,10 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
prompt = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.Prompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, true, isRefiner: true);
negPrompt = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.NegativePrompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, false, isRefiner: true);
bool doSave = g.UserInput.Get(T2IParamTypes.OutputIntermediateImages, false);
bool doUspcale = g.UserInput.TryGet(T2IParamTypes.RefinerUpscale, out double refineUpscale) && refineUpscale != 1;
bool doUpscale = g.UserInput.TryGet(T2IParamTypes.RefinerUpscale, out double refineUpscale) && refineUpscale != 1;
string upscaleMethod = g.UserInput.Get(ComfyUIBackendExtension.RefinerUpscaleMethod, "None");
// TODO: Better same-VAE check
bool doPixelUpscale = doUspcale && (upscaleMethod.StartsWith("pixel-") || upscaleMethod.StartsWith("model-"));
bool doPixelUpscale = doUpscale && (upscaleMethod.StartsWith("pixel-") || upscaleMethod.StartsWith("model-"));
int width = (int)Math.Round(g.UserInput.GetImageWidth() * refineUpscale);
int height = (int)Math.Round(g.UserInput.GetImageHeight() * refineUpscale);
width = (width / 16) * 16; // avoid unworkable output sizes
Expand Down Expand Up @@ -1517,7 +1521,7 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
g.CurrentMedia = decoded.EncodeToLatent(g.CurrentVae, "25");
}
}
if (doUspcale && upscaleMethod.StartsWith("latent-"))
if (doUpscale && upscaleMethod.StartsWith("latent-"))
{
g.CurrentMedia = g.CurrentMedia.AsLatentImage(g.CurrentVae);
g.CreateNode("LatentUpscaleBy", new JObject()
Expand All @@ -1530,7 +1534,7 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
g.CurrentMedia.Width = width;
g.CurrentMedia.Height = height;
}
else if (doUspcale && upscaleMethod.StartsWith("latentmodel-"))
else if (doUpscale && upscaleMethod.StartsWith("latentmodel-"))
{
g.CreateNode("LatentUpscaleModelLoader", new JObject()
{
Expand Down Expand Up @@ -1598,6 +1602,65 @@ JArray doMaskShrinkApply(WorkflowGenerator g, JArray imgIn)
explicitSampler: explicitSampler, explicitScheduler: explicitScheduler, sectionId: T2IParamInput.SectionID_Refiner);
g.CurrentMedia = g.CurrentMedia.WithPath(["23", 0]);
g.IsRefinerStage = false;
if (doUpscale && upscaleMethod.StartsWith("pidmodel-"))
{
string pidModelName = upscaleMethod.After("pidmodel-");
string pidMatched = T2IParamTypes.GetBestModelInList(pidModelName, Program.MainSDModels.ListModelNamesFor(g.UserInput.SourceSession));
if (pidMatched is not null && pidMatched.EndsWith(".safetensors"))
{
pidMatched = pidMatched.BeforeLast('.');
}
T2IModel pidModel = pidMatched is null ? null : Program.MainSDModels.GetModel(pidMatched);
if (pidModel is null || pidModel.ModelClass?.CompatClass?.ID != "pid")
{
throw new SwarmUserErrorException($"Refiner Upscale Method is set to PiD model '{pidModelName}', but that model could not be found or is not a valid PiD model.");
}
string pidLatentFormat = g.IsSD3() ? "sd3" : (g.IsFlux() || g.IsAnyFlux2() || g.IsZImage() || g.IsZetaChroma()) ? "flux" : null;
if (pidLatentFormat is null)
{
throw new SwarmUserErrorException($"PiD model requires the refiner model's VAE to be Flux.1, Flux.2, or SD3, but model '{refineModel.Name}' is '{refineModel.ModelClass?.CompatClass?.ID ?? "unknown"}'.");
}
JArray refinedLatent = g.CurrentMedia.Path;
int pidWidth = g.UserInput.GetImageWidth() * 4;
int pidHeight = g.UserInput.GetImageHeight() * 4;
pidWidth = (pidWidth / 16) * 16;
pidHeight = (pidHeight / 16) * 16;
T2IModel refinerFinalModel = g.FinalLoadedModel;
List<T2IModel> refinerFinalModelList = g.FinalLoadedModelList;
g.FinalLoadedModel = pidModel;
g.FinalLoadedModelList = [pidModel];
g.NoVAEOverride = true;
g.IsPixelDecoderStage = true;
(g.FinalLoadedModel, g.CurrentModel, g.CurrentTextEnc, g.CurrentVae) = g.CreateModelLoader(pidModel, "PixelDecoder", sectionId: T2IParamInput.SectionID_PixelDecoder);
g.IsPixelDecoderStage = false;
g.NoVAEOverride = false;
JArray pidPos = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.Prompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, true, isPixelDecoder: true);
JArray pidNeg = g.CreateConditioning(g.UserInput.Get(T2IParamTypes.NegativePrompt), g.CurrentTextEnc.Path, g.FinalLoadedModel, false, isPixelDecoder: true);
string pidCond = g.CreateNode("PiDConditioning", new JObject()
{
["positive"] = pidPos,
["latent"] = refinedLatent,
["latent_format"] = pidLatentFormat,
["degrade_sigma"] = 0.0
});
string pidEmptyLatent = g.CreateNode("EmptyChromaRadianceLatentImage", new JObject()
{
["batch_size"] = g.UserInput.Get(T2IParamTypes.BatchSize, 1),
["width"] = pidWidth,
["height"] = pidHeight
});
int pidSteps = g.UserInput.GetNullable(T2IParamTypes.Steps, T2IParamInput.SectionID_PixelDecoder, false) ?? 4;
double pidCfg = g.UserInput.GetNullable(T2IParamTypes.CFGScale, T2IParamInput.SectionID_PixelDecoder, false) ?? 1.0;
string pidSampler = g.UserInput.Get(ComfyUIBackendExtension.SamplerParam, null, sectionId: T2IParamInput.SectionID_PixelDecoder, includeBase: false);
string pidScheduler = g.UserInput.Get(ComfyUIBackendExtension.SchedulerParam, null, sectionId: T2IParamInput.SectionID_PixelDecoder, includeBase: false);
string pidSampled = g.CreateKSampler(g.CurrentModel.Path, [pidCond, 0], pidNeg, [pidEmptyLatent, 0], pidCfg, pidSteps, 0, 10000,
g.UserInput.Get(T2IParamTypes.Seed) + 2, false, true, defsampler: "lcm", defscheduler: "simple", explicitSampler: pidSampler, explicitScheduler: pidScheduler, sectionId: T2IParamInput.SectionID_PixelDecoder);
g.CurrentMedia = g.CurrentMedia.WithPath([pidSampled, 0], WGNodeData.DT_LATENT_IMAGE, pidModel.ModelClass?.CompatClass);
g.CurrentMedia.Width = pidWidth;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for the Refiner Upscale, since target size is user-specified, follow user specified size by way of doing a post-rescale in pixel space, see how ImageUpscaleWithModel does it above

g.CurrentMedia.Height = pidHeight;
g.FinalLoadedModel = refinerFinalModel;
g.FinalLoadedModelList = refinerFinalModelList;
}
}
}, -4);
#endregion
Expand Down
Loading
Loading