From 38225de58bdba9b2efa8cc56500f626854f42a89 Mon Sep 17 00:00:00 2001
From: Akshan Krithick <akshan3@uw.edu>
Date: Fri, 10 Apr 2026 02:59:34 -0700
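Subject: [PATCH] Fix HunyuanVideo 1.5 I2V by preprocessing image at pixel
 resolution instead of latent resolution

In prepare_cond_latents_and_mask, unpack latents.shape into
latent_height/latent_width instead of height/width, and build latent_mask
from those names. Unpacking the latent shape therefore no longer rebinds
`height` and `width` inside the method; those names presumably carry the
pixel-space size that is forwarded to the image preprocessing call (its
arguments are outside the context shown in this diff).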
---
 .../hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py
index 791dec073524..1d33c2ae188f 100644
--- a/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py
+++ b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py
@@ -611,7 +611,7 @@ def prepare_cond_latents_and_mask(
             tuple: (cond_latents_concat, mask_concat) - both are zero tensors for t2v
         """
 
-        batch, channels, frames, height, width = latents.shape
+        batch, channels, frames, latent_height, latent_width = latents.shape
 
         image_latents = self._get_image_latents(
             vae=self.vae,
@@ -626,7 +626,7 @@ def prepare_cond_latents_and_mask(
         latent_condition[:, :, 1:, :, :] = 0
         latent_condition = latent_condition.to(device=device, dtype=dtype)
 
-        latent_mask = torch.zeros(batch, 1, frames, height, width, dtype=dtype, device=device)
+        latent_mask = torch.zeros(batch, 1, frames, latent_height, latent_width, dtype=dtype, device=device)
         latent_mask[:, :, 0, :, :] = 1.0
 
         return latent_condition, latent_mask