From 38225de58bdba9b2efa8cc56500f626854f42a89 Mon Sep 17 00:00:00 2001
From: Akshan Krithick
Date: Fri, 10 Apr 2026 02:59:34 -0700
Subject: [PATCH] Fix HunyuanVideo 1.5 I2V by preprocessing image at pixel resolution instead of latent resolution

---
 .../hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py
index 791dec073524..1d33c2ae188f 100644
--- a/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py
+++ b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py
@@ -611,7 +611,7 @@ def prepare_cond_latents_and_mask(
             tuple: (cond_latents_concat, mask_concat) - both are zero tensors for t2v
         """
 
-        batch, channels, frames, height, width = latents.shape
+        batch, channels, frames, latent_height, latent_width = latents.shape
 
         image_latents = self._get_image_latents(
             vae=self.vae,
@@ -626,7 +626,7 @@ def prepare_cond_latents_and_mask(
         latent_condition[:, :, 1:, :, :] = 0
         latent_condition = latent_condition.to(device=device, dtype=dtype)
 
-        latent_mask = torch.zeros(batch, 1, frames, height, width, dtype=dtype, device=device)
+        latent_mask = torch.zeros(batch, 1, frames, latent_height, latent_width, dtype=dtype, device=device)
         latent_mask[:, :, 0, :, :] = 1.0
 
         return latent_condition, latent_mask