diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index d2fd04068248..8ac9700239e6 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -456,6 +456,8 @@ "HeliosPyramidDistilledAutoBlocks", "HeliosPyramidDistilledModularPipeline", "HeliosPyramidModularPipeline", + "HunyuanVideo15AutoBlocks", + "HunyuanVideo15ModularPipeline", "LTXAutoBlocks", "LTXModularPipeline", "QwenImageAutoBlocks", @@ -1239,6 +1241,8 @@ HeliosPyramidDistilledAutoBlocks, HeliosPyramidDistilledModularPipeline, HeliosPyramidModularPipeline, + HunyuanVideo15AutoBlocks, + HunyuanVideo15ModularPipeline, LTXAutoBlocks, LTXModularPipeline, QwenImageAutoBlocks, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index c4891d1c0f7d..b7137249fe16 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -88,6 +88,10 @@ "QwenImageLayeredModularPipeline", "QwenImageLayeredAutoBlocks", ] + _import_structure["hunyuan_video1_5"] = [ + "HunyuanVideo15AutoBlocks", + "HunyuanVideo15ModularPipeline", + ] _import_structure["ltx"] = [ "LTXAutoBlocks", "LTXModularPipeline", @@ -123,6 +127,10 @@ HeliosPyramidDistilledModularPipeline, HeliosPyramidModularPipeline, ) + from .hunyuan_video1_5 import ( + HunyuanVideo15AutoBlocks, + HunyuanVideo15ModularPipeline, + ) from .ltx import LTXAutoBlocks, LTXModularPipeline from .modular_pipeline import ( AutoPipelineBlocks, diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/__init__.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/__init__.py new file mode 100644 index 000000000000..f2716e416229 --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/__init__.py @@ -0,0 +1,55 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = 
{} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["modular_blocks_hunyuan_video1_5"] = [ + "HunyuanVideo15AutoBlocks", + "HunyuanVideo15Blocks", + "HunyuanVideo15Image2VideoBlocks", + ] + _import_structure["modular_pipeline"] = ["HunyuanVideo15ModularPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .modular_blocks_hunyuan_video1_5 import ( + HunyuanVideo15AutoBlocks, + HunyuanVideo15Blocks, + HunyuanVideo15Image2VideoBlocks, + ) + from .modular_pipeline import HunyuanVideo15ModularPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/before_denoise.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/before_denoise.py new file mode 100644 index 000000000000..c5fff8f8eca9 --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/before_denoise.py @@ -0,0 +1,305 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect + +import numpy as np +import torch + +from ...models import HunyuanVideo15Transformer3DModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import logging +from ...utils.torch_utils import randn_tensor +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .modular_pipeline import HunyuanVideo15ModularPipeline + + +logger = logging.get_logger(__name__) + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: int | None = None, + device: str | torch.device | None = None, + timesteps: list[int] | None = None, + sigmas: list[float] | None = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`list[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. 
If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`list[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class HunyuanVideo15TextInputStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "Input processing step that determines batch_size" + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), + InputParam.template("prompt_embeds"), + InputParam.template("batch_size", default=None), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("batch_size", type_hint=int), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + block_state.batch_size = getattr(block_state, "batch_size", None) or block_state.prompt_embeds.shape[0] + self.set_block_state(state, block_state) + return components, state + + +class HunyuanVideo15SetTimestepsStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)] + + @property + def description(self) -> str: + return "Step that sets the scheduler's timesteps for inference" + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("timesteps", type_hint=torch.Tensor), + OutputParam("num_inference_steps", type_hint=int), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: 
PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + + sigmas = block_state.sigmas + if sigmas is None: + sigmas = np.linspace(1.0, 0.0, block_state.num_inference_steps + 1)[:-1] + + block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps( + components.scheduler, block_state.num_inference_steps, device, sigmas=sigmas + ) + + self.set_block_state(state, block_state) + return components, state + + +class HunyuanVideo15PrepareLatentsStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "Prepare latents, conditioning latents, mask, and image_embeds for T2V" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ComponentSpec("transformer", HunyuanVideo15Transformer3DModel)] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("height"), + InputParam.template("width"), + InputParam("num_frames", type_hint=int, default=121), + InputParam.template("latents"), + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size", required=True, default=None), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("latents", type_hint=torch.Tensor, description="Pure noise latents"), + OutputParam("cond_latents_concat", type_hint=torch.Tensor), + OutputParam("mask_concat", type_hint=torch.Tensor), + OutputParam("image_embeds", type_hint=torch.Tensor), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + dtype = components.transformer.dtype + + height = block_state.height + width = block_state.width + if height is None and width is None: + height, width = 
components.video_processor.calculate_default_height_width( + components.default_aspect_ratio[1], components.default_aspect_ratio[0], components.target_size + ) + + batch_size = block_state.batch_size * block_state.num_videos_per_prompt + num_frames = block_state.num_frames + + latents = block_state.latents + if latents is not None: + latents = latents.to(device=device, dtype=dtype) + else: + shape = ( + batch_size, + components.num_channels_latents, + (num_frames - 1) // components.vae_scale_factor_temporal + 1, + int(height) // components.vae_scale_factor_spatial, + int(width) // components.vae_scale_factor_spatial, + ) + if isinstance(block_state.generator, list) and len(block_state.generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + latents = randn_tensor(shape, generator=block_state.generator, device=device, dtype=dtype) + + block_state.latents = latents + + b, c, f, h, w = latents.shape + block_state.cond_latents_concat = torch.zeros(b, c, f, h, w, dtype=dtype, device=device) + block_state.mask_concat = torch.zeros(b, 1, f, h, w, dtype=dtype, device=device) + + block_state.image_embeds = torch.zeros( + block_state.batch_size, + components.vision_num_semantic_tokens, + components.vision_states_dim, + dtype=dtype, + device=device, + ) + + self.set_block_state(state, block_state) + return components, state + + +class HunyuanVideo15Image2VideoPrepareLatentsStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return ( + "Prepare I2V conditioning from image_latents and image_embeds. " + "Expects pure noise `latents` from HunyuanVideo15PrepareLatentsStep. " + "Builds cond_latents_concat and mask_concat for the denoiser." 
+ ) + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ComponentSpec("transformer", HunyuanVideo15Transformer3DModel)] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("image_latents", type_hint=torch.Tensor, required=True), + InputParam("image_embeds", type_hint=torch.Tensor, required=True), + InputParam.template("latents", required=True), + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), + InputParam.template("batch_size", required=True, default=None), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("cond_latents_concat", type_hint=torch.Tensor), + OutputParam("mask_concat", type_hint=torch.Tensor), + OutputParam("image_embeds", type_hint=torch.Tensor), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + dtype = components.transformer.dtype + + batch_size = block_state.batch_size * block_state.num_videos_per_prompt + + b, c, f, h, w = block_state.latents.shape + + latent_condition = block_state.image_latents.to(device=device, dtype=dtype) + latent_condition = latent_condition.repeat(batch_size, 1, f, 1, 1) + latent_condition[:, :, 1:, :, :] = 0 + block_state.cond_latents_concat = latent_condition + + latent_mask = torch.zeros(b, 1, f, h, w, dtype=dtype, device=device) + latent_mask[:, :, 0, :, :] = 1.0 + block_state.mask_concat = latent_mask + + image_embeds = block_state.image_embeds.to(device=device, dtype=dtype) + if image_embeds.shape[0] == 1 and batch_size > 1: + image_embeds = image_embeds.repeat(batch_size, 1, 1) + block_state.image_embeds = image_embeds + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/decoders.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/decoders.py new 
file mode 100644 index 000000000000..f6b9eb68559f --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/decoders.py @@ -0,0 +1,70 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch + +from ...configuration_utils import FrozenDict +from ...models import AutoencoderKLHunyuanVideo15 +from ...pipelines.hunyuan_video1_5.image_processor import HunyuanVideo15ImageProcessor +from ...utils import logging +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam + + +logger = logging.get_logger(__name__) + + +class HunyuanVideo15VaeDecoderStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("vae", AutoencoderKLHunyuanVideo15), + ComponentSpec( + "video_processor", + HunyuanVideo15ImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def description(self) -> str: + return "Step that decodes the denoised latents into videos" + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("latents", required=True), + InputParam.template("output_type", default="np"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam.template("videos"), + ] + + 
@torch.no_grad() + def __call__(self, components, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + latents = block_state.latents.to(components.vae.dtype) / components.vae.config.scaling_factor + video = components.vae.decode(latents, return_dict=False)[0] + block_state.videos = components.video_processor.postprocess_video(video, output_type=block_state.output_type) + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/denoise.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/denoise.py new file mode 100644 index 000000000000..033cd60e29de --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/denoise.py @@ -0,0 +1,353 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import torch + +from ...configuration_utils import FrozenDict +from ...guiders import ClassifierFreeGuidance +from ...models import HunyuanVideo15Transformer3DModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import logging +from ..modular_pipeline import ( + BlockState, + LoopSequentialPipelineBlocks, + ModularPipelineBlocks, + PipelineState, +) +from ..modular_pipeline_utils import ComponentSpec, InputParam +from .modular_pipeline import HunyuanVideo15ModularPipeline + + +logger = logging.get_logger(__name__) + + +class HunyuanVideo15LoopBeforeDenoiser(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "Step within the denoising loop that prepares the latent input" + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("latents", required=True), + InputParam("cond_latents_concat", required=True, type_hint=torch.Tensor), + InputParam("mask_concat", required=True, type_hint=torch.Tensor), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + block_state.latent_model_input = torch.cat( + [block_state.latents, block_state.cond_latents_concat, block_state.mask_concat], dim=1 + ) + return components, block_state + + +class HunyuanVideo15LoopDenoiser(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + def __init__(self, guider_input_fields=None): + if guider_input_fields is None: + guider_input_fields = { + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + "encoder_attention_mask": ("prompt_embeds_mask", "negative_prompt_embeds_mask"), + "encoder_hidden_states_2": ("prompt_embeds_2", "negative_prompt_embeds_2"), + "encoder_attention_mask_2": ("prompt_embeds_mask_2", "negative_prompt_embeds_mask_2"), + } + if not isinstance(guider_input_fields, dict): + raise ValueError(f"guider_input_fields must be a dictionary but is 
{type(guider_input_fields)}") + self._guider_input_fields = guider_input_fields + super().__init__() + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 7.5}), + default_creation_method="from_config", + ), + ComponentSpec("transformer", HunyuanVideo15Transformer3DModel), + ] + + @property + def description(self) -> str: + return "Step within the denoising loop that denoises the latents with guidance" + + @property + def inputs(self) -> list[InputParam]: + inputs = [ + InputParam.template("attention_kwargs"), + InputParam.template("num_inference_steps", required=True, default=None), + InputParam("image_embeds", type_hint=torch.Tensor), + ] + for value in self._guider_input_fields.values(): + if isinstance(value, tuple): + inputs.append(InputParam(name=value[0], required=True, type_hint=torch.Tensor)) + for neg_name in value[1:]: + inputs.append(InputParam(name=neg_name, type_hint=torch.Tensor)) + else: + inputs.append(InputParam(name=value, required=True, type_hint=torch.Tensor)) + return inputs + + @torch.no_grad() + def __call__( + self, components: HunyuanVideo15ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor + ) -> PipelineState: + timestep = t.expand(block_state.latent_model_input.shape[0]).to(block_state.latent_model_input.dtype) + + # Step 1: Collect model inputs + guider_inputs = { + input_name: tuple(getattr(block_state, v) for v in value) + if isinstance(value, tuple) + else getattr(block_state, value) + for input_name, value in self._guider_input_fields.items() + } + + # Step 2: Update guider state + components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t) + + # Step 3: Prepare batched inputs + guider_state = components.guider.prepare_inputs(guider_inputs) + + # Step 4: Run denoiser for each batch + for guider_state_batch in guider_state: + 
components.guider.prepare_models(components.transformer) + + cond_kwargs = {input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()} + + context_name = getattr(guider_state_batch, components.guider._identifier_key) + with components.transformer.cache_context(context_name): + guider_state_batch.noise_pred = components.transformer( + hidden_states=block_state.latent_model_input, + image_embeds=block_state.image_embeds, + timestep=timestep, + attention_kwargs=block_state.attention_kwargs, + return_dict=False, + **cond_kwargs, + )[0] + + components.guider.cleanup_models(components.transformer) + + # Step 5: Combine predictions + block_state.noise_pred = components.guider(guider_state)[0] + + return components, block_state + + +class HunyuanVideo15LoopAfterDenoiser(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)] + + @property + def description(self) -> str: + return "Step within the denoising loop that updates the latents" + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + latents_dtype = block_state.latents.dtype + block_state.latents = components.scheduler.step( + block_state.noise_pred, t, block_state.latents, return_dict=False + )[0] + + if block_state.latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + block_state.latents = block_state.latents.to(latents_dtype) + + return components, block_state + + +class HunyuanVideo15DenoiseLoopWrapper(LoopSequentialPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "Pipeline block that iteratively denoises the latents over timesteps" + + @property + def loop_expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler), + 
ComponentSpec("transformer", HunyuanVideo15Transformer3DModel), + ] + + @property + def loop_inputs(self) -> list[InputParam]: + return [ + InputParam.template("timesteps", required=True), + InputParam.template("num_inference_steps", required=True, default=None), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + block_state.num_warmup_steps = max( + len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order, 0 + ) + + with self.progress_bar(total=block_state.num_inference_steps) as progress_bar: + for i, t in enumerate(block_state.timesteps): + components, block_state = self.loop_step(components, block_state, i=i, t=t) + if i == len(block_state.timesteps) - 1 or ( + (i + 1) > block_state.num_warmup_steps and (i + 1) % components.scheduler.order == 0 + ): + progress_bar.update() + + self.set_block_state(state, block_state) + return components, state + + +class HunyuanVideo15DenoiseStep(HunyuanVideo15DenoiseLoopWrapper): + block_classes = [ + HunyuanVideo15LoopBeforeDenoiser, + HunyuanVideo15LoopDenoiser(), + HunyuanVideo15LoopAfterDenoiser, + ] + block_names = ["before_denoiser", "denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoises the latents.\n" + "At each iteration:\n" + " - `HunyuanVideo15LoopBeforeDenoiser`\n" + " - `HunyuanVideo15LoopDenoiser`\n" + " - `HunyuanVideo15LoopAfterDenoiser`\n" + "This block supports text-to-video tasks." 
+ ) + + +class HunyuanVideo15Image2VideoLoopDenoiser(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + def __init__(self, guider_input_fields=None): + if guider_input_fields is None: + guider_input_fields = { + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + "encoder_attention_mask": ("prompt_embeds_mask", "negative_prompt_embeds_mask"), + "encoder_hidden_states_2": ("prompt_embeds_2", "negative_prompt_embeds_2"), + "encoder_attention_mask_2": ("prompt_embeds_mask_2", "negative_prompt_embeds_mask_2"), + } + if not isinstance(guider_input_fields, dict): + raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}") + self._guider_input_fields = guider_input_fields + super().__init__() + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 7.5}), + default_creation_method="from_config", + ), + ComponentSpec("transformer", HunyuanVideo15Transformer3DModel), + ] + + @property + def description(self) -> str: + return "I2V denoiser with MeanFlow timestep_r support" + + @property + def inputs(self) -> list[InputParam]: + inputs = [ + InputParam.template("attention_kwargs"), + InputParam.template("num_inference_steps", required=True, default=None), + InputParam("image_embeds", type_hint=torch.Tensor), + InputParam.template("timesteps", required=True), + ] + for value in self._guider_input_fields.values(): + if isinstance(value, tuple): + inputs.append(InputParam(name=value[0], required=True, type_hint=torch.Tensor)) + for neg_name in value[1:]: + inputs.append(InputParam(name=neg_name, type_hint=torch.Tensor)) + else: + inputs.append(InputParam(name=value, required=True, type_hint=torch.Tensor)) + return inputs + + @torch.no_grad() + def __call__( + self, components: HunyuanVideo15ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor + ) -> PipelineState: + timestep = 
t.expand(block_state.latent_model_input.shape[0]).to(block_state.latent_model_input.dtype) + + # MeanFlow timestep_r (lines 855-862) + if components.transformer.config.use_meanflow: + if i == len(block_state.timesteps) - 1: + timestep_r = torch.tensor([0.0], device=timestep.device) + else: + timestep_r = block_state.timesteps[i + 1] + timestep_r = timestep_r.expand(block_state.latents.shape[0]).to(block_state.latents.dtype) + else: + timestep_r = None + + guider_inputs = { + input_name: tuple(getattr(block_state, v) for v in value) + if isinstance(value, tuple) + else getattr(block_state, value) + for input_name, value in self._guider_input_fields.items() + } + + components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t) + guider_state = components.guider.prepare_inputs(guider_inputs) + + for guider_state_batch in guider_state: + components.guider.prepare_models(components.transformer) + + cond_kwargs = {input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()} + + context_name = getattr(guider_state_batch, components.guider._identifier_key) + with components.transformer.cache_context(context_name): + guider_state_batch.noise_pred = components.transformer( + hidden_states=block_state.latent_model_input, + image_embeds=block_state.image_embeds, + timestep=timestep, + timestep_r=timestep_r, + attention_kwargs=block_state.attention_kwargs, + return_dict=False, + **cond_kwargs, + )[0] + + components.guider.cleanup_models(components.transformer) + + block_state.noise_pred = components.guider(guider_state)[0] + + return components, block_state + + +class HunyuanVideo15Image2VideoDenoiseStep(HunyuanVideo15DenoiseLoopWrapper): + block_classes = [ + HunyuanVideo15LoopBeforeDenoiser, + HunyuanVideo15Image2VideoLoopDenoiser(), + HunyuanVideo15LoopAfterDenoiser, + ] + block_names = ["before_denoiser", "denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step 
for image-to-video with MeanFlow support.\n" + "At each iteration:\n" + " - `HunyuanVideo15LoopBeforeDenoiser`\n" + " - `HunyuanVideo15Image2VideoLoopDenoiser`\n" + " - `HunyuanVideo15LoopAfterDenoiser`" + ) diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/encoders.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/encoders.py new file mode 100644 index 000000000000..0922186a8bda --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/encoders.py @@ -0,0 +1,445 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import re + +import torch +from transformers import ( + ByT5Tokenizer, + Qwen2_5_VLTextModel, + Qwen2TokenizerFast, + SiglipImageProcessor, + SiglipVisionModel, + T5EncoderModel, +) + +from ...configuration_utils import FrozenDict +from ...guiders import ClassifierFreeGuidance +from ...models import AutoencoderKLHunyuanVideo15 +from ...pipelines.hunyuan_video1_5.image_processor import HunyuanVideo15ImageProcessor +from ...utils import logging +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .modular_pipeline import HunyuanVideo15ModularPipeline + + +logger = logging.get_logger(__name__) + + +def format_text_input(prompt, system_message): + return [ + [{"role": "system", "content": system_message}, {"role": "user", "content": p if p else " "}] for p in prompt + ] + + +def extract_glyph_texts(prompt): + pattern = r"“(.*?)”|\"(.*?)\"" + matches = re.findall(pattern, prompt) + result = [match[0] or match[1] for match in matches] + result = list(dict.fromkeys(result)) if len(result) > 1 else result + if result: + formatted_result = ". ".join([f'Text "{text}"' for text in result]) + ". " + else: + formatted_result = None + return formatted_result + + +def _get_mllm_prompt_embeds( + text_encoder, + tokenizer, + prompt, + device, + tokenizer_max_length=1000, + num_hidden_layers_to_skip=2, + # fmt: off + system_message="You are a helpful assistant. Describe the video by detailing the following aspects: \ + 1. The main content and theme of the video. \ + 2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects. \ + 3. Actions, events, behaviors temporal relationships, physical movement changes of the objects. \ + 4. background environment, light, style and atmosphere. \ + 5. 
camera angles, movements, and transitions used in the video.", + # fmt: on + crop_start=108, +): + prompt = [prompt] if isinstance(prompt, str) else prompt + prompt = format_text_input(prompt, system_message) + + text_inputs = tokenizer.apply_chat_template( + prompt, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + padding="max_length", + max_length=tokenizer_max_length + crop_start, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids.to(device=device) + prompt_attention_mask = text_inputs.attention_mask.to(device=device) + + prompt_embeds = text_encoder( + input_ids=text_input_ids, + attention_mask=prompt_attention_mask, + output_hidden_states=True, + ).hidden_states[-(num_hidden_layers_to_skip + 1)] + + if crop_start is not None and crop_start > 0: + prompt_embeds = prompt_embeds[:, crop_start:] + prompt_attention_mask = prompt_attention_mask[:, crop_start:] + + return prompt_embeds, prompt_attention_mask + + +def _get_byt5_prompt_embeds(tokenizer, text_encoder, prompt, device, tokenizer_max_length=256): + prompt = [prompt] if isinstance(prompt, str) else prompt + glyph_texts = [extract_glyph_texts(p) for p in prompt] + + prompt_embeds_list = [] + prompt_embeds_mask_list = [] + + for glyph_text in glyph_texts: + if glyph_text is None: + glyph_text_embeds = torch.zeros( + (1, tokenizer_max_length, text_encoder.config.d_model), device=device, dtype=text_encoder.dtype + ) + glyph_text_embeds_mask = torch.zeros((1, tokenizer_max_length), device=device, dtype=torch.int64) + else: + txt_tokens = tokenizer( + glyph_text, + padding="max_length", + max_length=tokenizer_max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ).to(device) + + glyph_text_embeds = text_encoder( + input_ids=txt_tokens.input_ids, + attention_mask=txt_tokens.attention_mask.float(), + )[0] + glyph_text_embeds = glyph_text_embeds.to(device=device) + glyph_text_embeds_mask = 
txt_tokens.attention_mask.to(device=device) + + prompt_embeds_list.append(glyph_text_embeds) + prompt_embeds_mask_list.append(glyph_text_embeds_mask) + + return torch.cat(prompt_embeds_list, dim=0), torch.cat(prompt_embeds_mask_list, dim=0) + + +class HunyuanVideo15TextEncoderStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "Dual text encoder step using Qwen2.5-VL (MLLM) and ByT5 (glyph text)" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("text_encoder", Qwen2_5_VLTextModel), + ComponentSpec("tokenizer", Qwen2TokenizerFast), + ComponentSpec("text_encoder_2", T5EncoderModel), + ComponentSpec("tokenizer_2", ByT5Tokenizer), + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 7.5}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("prompt", required=False), + InputParam.template("negative_prompt"), + InputParam.template("prompt_embeds", required=False), + InputParam.template("prompt_embeds_mask", required=False), + InputParam.template("negative_prompt_embeds"), + InputParam.template("negative_prompt_embeds_mask"), + InputParam("prompt_embeds_2", type_hint=torch.Tensor), + InputParam("prompt_embeds_mask_2", type_hint=torch.Tensor), + InputParam("negative_prompt_embeds_2", type_hint=torch.Tensor), + InputParam("negative_prompt_embeds_mask_2", type_hint=torch.Tensor), + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), + OutputParam("prompt_embeds_2", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields"), + 
OutputParam("prompt_embeds_mask_2", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields"), + OutputParam("negative_prompt_embeds_2", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields"), + OutputParam("negative_prompt_embeds_mask_2", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields"), + ] + + @staticmethod + def encode_prompt( + components, + prompt, + device=None, + dtype=None, + batch_size=1, + num_videos_per_prompt=1, + prompt_embeds=None, + prompt_embeds_mask=None, + prompt_embeds_2=None, + prompt_embeds_mask_2=None, + ): + device = device or components._execution_device + dtype = dtype or components.text_encoder.dtype + + if prompt is None: + prompt = [""] * batch_size + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt_embeds is None: + prompt_embeds, prompt_embeds_mask = _get_mllm_prompt_embeds( + tokenizer=components.tokenizer, + text_encoder=components.text_encoder, + prompt=prompt, + device=device, + tokenizer_max_length=components.tokenizer_max_length, + system_message=components.system_message, + crop_start=components.prompt_template_encode_start_idx, + ) + + if prompt_embeds_2 is None: + prompt_embeds_2, prompt_embeds_mask_2 = _get_byt5_prompt_embeds( + tokenizer=components.tokenizer_2, + text_encoder=components.text_encoder_2, + prompt=prompt, + device=device, + tokenizer_max_length=components.tokenizer_2_max_length, + ) + + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1).view( + batch_size * num_videos_per_prompt, seq_len, -1 + ) + prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_videos_per_prompt, 1).view( + batch_size * num_videos_per_prompt, seq_len + ) + + _, seq_len_2, _ = prompt_embeds_2.shape + prompt_embeds_2 = prompt_embeds_2.repeat(1, num_videos_per_prompt, 1).view( + batch_size * num_videos_per_prompt, seq_len_2, -1 + ) + prompt_embeds_mask_2 = prompt_embeds_mask_2.repeat(1, num_videos_per_prompt, 1).view( + batch_size * 
num_videos_per_prompt, seq_len_2 + ) + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + prompt_embeds_mask = prompt_embeds_mask.to(dtype=dtype, device=device) + prompt_embeds_2 = prompt_embeds_2.to(dtype=dtype, device=device) + prompt_embeds_mask_2 = prompt_embeds_mask_2.to(dtype=dtype, device=device) + + return prompt_embeds, prompt_embeds_mask, prompt_embeds_2, prompt_embeds_mask_2 + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + dtype = components.transformer.dtype + + prompt = block_state.prompt + negative_prompt = block_state.negative_prompt + num_videos_per_prompt = block_state.num_videos_per_prompt + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + elif getattr(block_state, "prompt_embeds", None) is not None: + batch_size = block_state.prompt_embeds.shape[0] + else: + batch_size = 1 + + ( + block_state.prompt_embeds, + block_state.prompt_embeds_mask, + block_state.prompt_embeds_2, + block_state.prompt_embeds_mask_2, + ) = self.encode_prompt( + components, + prompt=prompt, + device=device, + dtype=dtype, + batch_size=batch_size, + num_videos_per_prompt=num_videos_per_prompt, + prompt_embeds=getattr(block_state, "prompt_embeds", None), + prompt_embeds_mask=getattr(block_state, "prompt_embeds_mask", None), + prompt_embeds_2=getattr(block_state, "prompt_embeds_2", None), + prompt_embeds_mask_2=getattr(block_state, "prompt_embeds_mask_2", None), + ) + + if components.requires_unconditional_embeds: + ( + block_state.negative_prompt_embeds, + block_state.negative_prompt_embeds_mask, + block_state.negative_prompt_embeds_2, + block_state.negative_prompt_embeds_mask_2, + ) = self.encode_prompt( + components, + prompt=negative_prompt, + device=device, + dtype=dtype, + batch_size=batch_size, 
+ num_videos_per_prompt=num_videos_per_prompt, + prompt_embeds=getattr(block_state, "negative_prompt_embeds", None), + prompt_embeds_mask=getattr(block_state, "negative_prompt_embeds_mask", None), + prompt_embeds_2=getattr(block_state, "negative_prompt_embeds_2", None), + prompt_embeds_mask_2=getattr(block_state, "negative_prompt_embeds_mask_2", None), + ) + + state.set("batch_size", batch_size) + + self.set_block_state(state, block_state) + return components, state + + +def retrieve_latents( + encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +class HunyuanVideo15VaeEncoderStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "VAE Encoder step that encodes an input image into latent space for image-to-video generation" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("vae", AutoencoderKLHunyuanVideo15), + ComponentSpec( + "video_processor", + HunyuanVideo15ImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("image", required=True), + InputParam.template("height"), + InputParam.template("width"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam( + "image_latents", + type_hint=torch.Tensor, + description="Encoded image latents from the VAE encoder", + ), + OutputParam("height", type_hint=int, 
description="Target height resolved from image"), + OutputParam("width", type_hint=int, description="Target width resolved from image"), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + + image = block_state.image + height = block_state.height + width = block_state.width + if height is None or width is None: + height, width = components.video_processor.calculate_default_height_width( + height=image.size[1], width=image.size[0], target_size=components.target_size + ) + image = components.video_processor.resize(image, height=height, width=width, resize_mode="crop") + + vae_dtype = components.vae.dtype + image_tensor = components.video_processor.preprocess(image, height=height, width=width).to( + device=device, dtype=vae_dtype + ) + image_tensor = image_tensor.unsqueeze(2) + image_latents = retrieve_latents(components.vae.encode(image_tensor), sample_mode="argmax") + image_latents = image_latents * components.vae.config.scaling_factor + + block_state.image_latents = image_latents + block_state.height = height + block_state.width = width + state.set("image", image) + + self.set_block_state(state, block_state) + return components, state + + +class HunyuanVideo15ImageEncoderStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "Siglip image encoder step that produces image_embeds for image-to-video generation" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("image_encoder", SiglipVisionModel), + ComponentSpec("feature_extractor", SiglipImageProcessor), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("image", required=True), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam( + "image_embeds", + 
type_hint=torch.Tensor, + description="Image embeddings from the Siglip vision encoder", + ), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + + image_encoder_dtype = next(components.image_encoder.parameters()).dtype + image_inputs = components.feature_extractor.preprocess( + images=block_state.image, do_resize=True, return_tensors="pt", do_convert_rgb=True + ) + image_inputs = image_inputs.to(device=device, dtype=image_encoder_dtype) + image_embeds = components.image_encoder(**image_inputs).last_hidden_state + + block_state.image_embeds = image_embeds + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/modular_blocks_hunyuan_video1_5.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/modular_blocks_hunyuan_video1_5.py new file mode 100644 index 000000000000..6c8bb475dac1 --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/modular_blocks_hunyuan_video1_5.py @@ -0,0 +1,583 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam +from .before_denoise import ( + HunyuanVideo15Image2VideoPrepareLatentsStep, + HunyuanVideo15PrepareLatentsStep, + HunyuanVideo15SetTimestepsStep, + HunyuanVideo15TextInputStep, +) +from .decoders import HunyuanVideo15VaeDecoderStep +from .denoise import HunyuanVideo15DenoiseStep, HunyuanVideo15Image2VideoDenoiseStep +from .encoders import ( + HunyuanVideo15ImageEncoderStep, + HunyuanVideo15TextEncoderStep, + HunyuanVideo15VaeEncoderStep, +) + + +logger = logging.get_logger(__name__) + + +# auto_docstring +class HunyuanVideo15CoreDenoiseStep(SequentialPipelineBlocks): + """ + Denoise block that takes encoded conditions and runs the denoising process. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`HunyuanVideo15Transformer3DModel`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + batch_size (`int`, *optional*): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_frames (`int`, *optional*, defaults to 121): + TODO: Add description. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. 
+ attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask (`Tensor`): + TODO: Add description. + negative_prompt_embeds_mask (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_2 (`Tensor`): + TODO: Add description. + negative_prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask_2 (`Tensor`): + TODO: Add description. + negative_prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "hunyuan-video-1.5" + block_classes = [ + HunyuanVideo15TextInputStep, + HunyuanVideo15SetTimestepsStep, + HunyuanVideo15PrepareLatentsStep, + HunyuanVideo15DenoiseStep, + ] + block_names = ["input", "set_timesteps", "prepare_latents", "denoise"] + + @property + def description(self): + return "Denoise block that takes encoded conditions and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] + + +# auto_docstring +class HunyuanVideo15Blocks(SequentialPipelineBlocks): + """ + Modular pipeline blocks for HunyuanVideo 1.5 text-to-video. + + Components: + text_encoder (`Qwen2_5_VLTextModel`) tokenizer (`Qwen2TokenizerFast`) text_encoder_2 (`T5EncoderModel`) + tokenizer_2 (`ByT5Tokenizer`) guider (`ClassifierFreeGuidance`) scheduler (`FlowMatchEulerDiscreteScheduler`) + transformer (`HunyuanVideo15Transformer3DModel`) vae (`AutoencoderKLHunyuanVideo15`) video_processor + (`HunyuanVideo15ImageProcessor`) + + Inputs: + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + prompt_embeds (`Tensor`, *optional*): + text embeddings used to guide the image generation. Can be generated from text_encoder step. 
+ prompt_embeds_mask (`Tensor`, *optional*): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_frames (`int`, *optional*, defaults to 121): + TODO: Add description. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + output_type (`str`, *optional*, defaults to np): + Output format: 'pil', 'np', 'pt'. + + Outputs: + videos (`list`): + The generated videos. 
+ """ + + model_name = "hunyuan-video-1.5" + block_classes = [ + HunyuanVideo15TextEncoderStep, + HunyuanVideo15CoreDenoiseStep, + HunyuanVideo15VaeDecoderStep, + ] + block_names = ["text_encoder", "denoise", "decode"] + + @property + def description(self): + return "Modular pipeline blocks for HunyuanVideo 1.5 text-to-video." + + @property + def outputs(self): + return [OutputParam.template("videos")] + + +# auto_docstring +class HunyuanVideo15Image2VideoCoreDenoiseStep(SequentialPipelineBlocks): + """ + Denoise block for image-to-video that takes encoded conditions and runs the denoising process. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`HunyuanVideo15Transformer3DModel`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + batch_size (`int`, *optional*): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_frames (`int`, *optional*, defaults to 121): + TODO: Add description. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + image_latents (`Tensor`): + TODO: Add description. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. 
+ prompt_embeds_mask (`Tensor`): + TODO: Add description. + negative_prompt_embeds_mask (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_2 (`Tensor`): + TODO: Add description. + negative_prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask_2 (`Tensor`): + TODO: Add description. + negative_prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "hunyuan-video-1.5" + block_classes = [ + HunyuanVideo15TextInputStep, + HunyuanVideo15SetTimestepsStep, + HunyuanVideo15PrepareLatentsStep, + HunyuanVideo15Image2VideoPrepareLatentsStep, + HunyuanVideo15Image2VideoDenoiseStep, + ] + block_names = ["input", "set_timesteps", "prepare_latents", "prepare_i2v_latents", "denoise"] + + @property + def description(self): + return "Denoise block for image-to-video that takes encoded conditions and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] + + +# auto_docstring +class HunyuanVideo15AutoVaeEncoderStep(AutoPipelineBlocks): + """ + VAE encoder step that encodes the image input into its latent representation. + This is an auto pipeline block that works for image-to-video tasks. + - `HunyuanVideo15VaeEncoderStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + vae (`AutoencoderKLHunyuanVideo15`) video_processor (`HunyuanVideo15ImageProcessor`) + + Inputs: + image (`Image | list`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. 
+ + Outputs: + image_latents (`Tensor`): + Encoded image latents from the VAE encoder + height (`int`): + Target height resolved from image + width (`int`): + Target width resolved from image + """ + + model_name = "hunyuan-video-1.5" + block_classes = [HunyuanVideo15VaeEncoderStep] + block_names = ["vae_encoder"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "VAE encoder step that encodes the image input into its latent representation.\n" + "This is an auto pipeline block that works for image-to-video tasks.\n" + " - `HunyuanVideo15VaeEncoderStep` is used when `image` is provided.\n" + " - If `image` is not provided, step will be skipped." + ) + + +# auto_docstring +class HunyuanVideo15AutoImageEncoderStep(AutoPipelineBlocks): + """ + Siglip image encoder step that produces image_embeds. + This is an auto pipeline block that works for image-to-video tasks. + - `HunyuanVideo15ImageEncoderStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + image_encoder (`SiglipVisionModel`) feature_extractor (`SiglipImageProcessor`) + + Inputs: + image (`Image | list`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. + + Outputs: + image_embeds (`Tensor`): + Image embeddings from the Siglip vision encoder + """ + + model_name = "hunyuan-video-1.5" + block_classes = [HunyuanVideo15ImageEncoderStep] + block_names = ["image_encoder"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "Siglip image encoder step that produces image_embeds.\n" + "This is an auto pipeline block that works for image-to-video tasks.\n" + " - `HunyuanVideo15ImageEncoderStep` is used when `image` is provided.\n" + " - If `image` is not provided, step will be skipped." 
+ ) + + +# auto_docstring +class HunyuanVideo15AutoCoreDenoiseStep(AutoPipelineBlocks): + """ + Auto denoise block that selects the appropriate denoise pipeline based on inputs. + - `HunyuanVideo15Image2VideoCoreDenoiseStep` is used when `image_latents` is provided. + - `HunyuanVideo15CoreDenoiseStep` is used otherwise (text-to-video). + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`HunyuanVideo15Transformer3DModel`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + num_inference_steps (`int`): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_frames (`int`, *optional*, defaults to 121): + TODO: Add description. + latents (`Tensor`): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + image_latents (`Tensor`, *optional*): + TODO: Add description. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask (`Tensor`): + TODO: Add description. + negative_prompt_embeds_mask (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_2 (`Tensor`): + TODO: Add description. + negative_prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask_2 (`Tensor`): + TODO: Add description. 
+ negative_prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "hunyuan-video-1.5" + block_classes = [HunyuanVideo15Image2VideoCoreDenoiseStep, HunyuanVideo15CoreDenoiseStep] + block_names = ["image2video", "text2video"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Auto denoise block that selects the appropriate denoise pipeline based on inputs.\n" + " - `HunyuanVideo15Image2VideoCoreDenoiseStep` is used when `image_latents` is provided.\n" + " - `HunyuanVideo15CoreDenoiseStep` is used otherwise (text-to-video)." + ) + + +# auto_docstring +class HunyuanVideo15AutoBlocks(SequentialPipelineBlocks): + """ + Auto blocks for HunyuanVideo 1.5 that support both text-to-video and image-to-video workflows. + + Supported workflows: + - `text2video`: requires `prompt` + - `image2video`: requires `image`, `prompt` + + Components: + text_encoder (`Qwen2_5_VLTextModel`) tokenizer (`Qwen2TokenizerFast`) text_encoder_2 (`T5EncoderModel`) + tokenizer_2 (`ByT5Tokenizer`) guider (`ClassifierFreeGuidance`) vae (`AutoencoderKLHunyuanVideo15`) + video_processor (`HunyuanVideo15ImageProcessor`) image_encoder (`SiglipVisionModel`) feature_extractor + (`SiglipImageProcessor`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer + (`HunyuanVideo15Transformer3DModel`) + + Inputs: + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + prompt_embeds (`Tensor`, *optional*): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`, *optional*): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. 
Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + image (`Image | list`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + num_inference_steps (`int`): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + num_frames (`int`, *optional*, defaults to 121): + TODO: Add description. + latents (`Tensor`): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + image_latents (`Tensor`, *optional*): + TODO: Add description. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + output_type (`str`, *optional*, defaults to np): + Output format: 'pil', 'np', 'pt'. + + Outputs: + videos (`list`): + The generated videos. 
+ """ + + model_name = "hunyuan-video-1.5" + block_classes = [ + HunyuanVideo15TextEncoderStep, + HunyuanVideo15AutoVaeEncoderStep, + HunyuanVideo15AutoImageEncoderStep, + HunyuanVideo15AutoCoreDenoiseStep, + HunyuanVideo15VaeDecoderStep, + ] + block_names = ["text_encoder", "vae_encoder", "image_encoder", "denoise", "decode"] + _workflow_map = { + "text2video": {"prompt": True}, + "image2video": {"image": True, "prompt": True}, + } + + @property + def description(self): + return "Auto blocks for HunyuanVideo 1.5 that support both text-to-video and image-to-video workflows." + + @property + def outputs(self): + return [OutputParam.template("videos")] + + +# auto_docstring +class HunyuanVideo15Image2VideoBlocks(SequentialPipelineBlocks): + """ + Modular pipeline blocks for HunyuanVideo 1.5 image-to-video. + + Components: + text_encoder (`Qwen2_5_VLTextModel`) tokenizer (`Qwen2TokenizerFast`) text_encoder_2 (`T5EncoderModel`) + tokenizer_2 (`ByT5Tokenizer`) guider (`ClassifierFreeGuidance`) vae (`AutoencoderKLHunyuanVideo15`) + video_processor (`HunyuanVideo15ImageProcessor`) image_encoder (`SiglipVisionModel`) feature_extractor + (`SiglipImageProcessor`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer + (`HunyuanVideo15Transformer3DModel`) + + Inputs: + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + prompt_embeds (`Tensor`, *optional*): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`, *optional*): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. 
Can be generated from text_encoder step. + prompt_embeds_2 (`Tensor`, *optional*): + Secondary text embeddings from text_encoder_2. Can be generated from text_encoder step. + prompt_embeds_mask_2 (`Tensor`, *optional*): + mask for the secondary text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_2 (`Tensor`, *optional*): + Secondary negative text embeddings from text_encoder_2. Can be generated from text_encoder step. + negative_prompt_embeds_mask_2 (`Tensor`, *optional*): + mask for the secondary negative text embeddings. Can be generated from text_encoder step. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. + image (`Image | list`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + batch_size (`int`, *optional*): + Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can + be generated in input step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + num_frames (`int`, *optional*, defaults to 121): + The number of frames in the generated video. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + image_latents (`Tensor`): + Latents of the encoded reference image. Can be generated from vae_encoder step. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + output_type (`str`, *optional*, defaults to np): + Output format: 'pil', 'np', 'pt'. + + Outputs: + videos (`list`): + The generated videos. 
+ """ + + model_name = "hunyuan-video-1.5" + block_classes = [ + HunyuanVideo15TextEncoderStep, + HunyuanVideo15AutoVaeEncoderStep, + HunyuanVideo15AutoImageEncoderStep, + HunyuanVideo15Image2VideoCoreDenoiseStep, + HunyuanVideo15VaeDecoderStep, + ] + block_names = ["text_encoder", "vae_encoder", "image_encoder", "denoise", "decode"] + + @property + def description(self): + return "Modular pipeline blocks for HunyuanVideo 1.5 image-to-video." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/modular_pipeline.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/modular_pipeline.py new file mode 100644 index 000000000000..5b23d8699905 --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/modular_pipeline.py @@ -0,0 +1,90 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...loaders import HunyuanVideoLoraLoaderMixin +from ...utils import logging +from ..modular_pipeline import ModularPipeline + + +logger = logging.get_logger(__name__) + + +class HunyuanVideo15ModularPipeline( + ModularPipeline, + HunyuanVideoLoraLoaderMixin, +): + """ + A ModularPipeline for HunyuanVideo 1.5. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. 
+ """ + + default_blocks_name = "HunyuanVideo15AutoBlocks" + + @property + def vae_scale_factor_spatial(self): + return self.vae.spatial_compression_ratio if getattr(self, "vae", None) else 16 + + @property + def vae_scale_factor_temporal(self): + return self.vae.temporal_compression_ratio if getattr(self, "vae", None) else 4 + + @property + def num_channels_latents(self): + return self.vae.config.latent_channels if getattr(self, "vae", None) else 32 + + @property + def target_size(self): + return self.transformer.config.target_size if getattr(self, "transformer", None) else 640 + + @property + def default_aspect_ratio(self): + return (16, 9) + + @property + def vision_num_semantic_tokens(self): + return 729 + + @property + def vision_states_dim(self): + return self.transformer.config.image_embed_dim if getattr(self, "transformer", None) else 1152 + + @property + def tokenizer_max_length(self): + return 1000 + + @property + def tokenizer_2_max_length(self): + return 256 + + # fmt: off + @property + def system_message(self): + return "You are a helpful assistant. Describe the video by detailing the following aspects: \ + 1. The main content and theme of the video. \ + 2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects. \ + 3. Actions, events, behaviors temporal relationships, physical movement changes of the objects. \ + 4. background environment, light, style and atmosphere. \ + 5. camera angles, movements, and transitions used in the video." 
+ # fmt: on + + @property + def prompt_template_encode_start_idx(self): + return 108 + + @property + def requires_unconditional_embeds(self): + if hasattr(self, "guider") and self.guider is not None: + return self.guider._enabled and self.guider.num_conditions > 1 + return False diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index ace89f0d6f91..d00bf716a78f 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -132,6 +132,7 @@ def _helios_pyramid_map_fn(config_dict=None): ("z-image", _create_default_map_fn("ZImageModularPipeline")), ("helios", _create_default_map_fn("HeliosModularPipeline")), ("helios-pyramid", _helios_pyramid_map_fn), + ("hunyuan-video-1.5", _create_default_map_fn("HunyuanVideo15ModularPipeline")), ("ltx", _create_default_map_fn("LTXModularPipeline")), ] ) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 0d4d6d97a05b..938115408314 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -242,6 +242,36 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class HunyuanVideo15AutoBlocks(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class HunyuanVideo15ModularPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def 
from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class LTXAutoBlocks(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/modular_pipelines/hunyuan_video1_5/__init__.py b/tests/modular_pipelines/hunyuan_video1_5/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/modular_pipelines/hunyuan_video1_5/test_modular_pipeline_hunyuan_video1_5.py b/tests/modular_pipelines/hunyuan_video1_5/test_modular_pipeline_hunyuan_video1_5.py new file mode 100644 index 000000000000..6d776eaa1a11 --- /dev/null +++ b/tests/modular_pipelines/hunyuan_video1_5/test_modular_pipeline_hunyuan_video1_5.py @@ -0,0 +1,83 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from diffusers.modular_pipelines import HunyuanVideo15AutoBlocks, HunyuanVideo15ModularPipeline + +from ..test_modular_pipelines_common import ModularPipelineTesterMixin + + +HUNYUANVIDEO15_WORKFLOWS = { + "text2video": [ + ("text_encoder", "HunyuanVideo15TextEncoderStep"), + ("denoise.input", "HunyuanVideo15TextInputStep"), + ("denoise.set_timesteps", "HunyuanVideo15SetTimestepsStep"), + ("denoise.prepare_latents", "HunyuanVideo15PrepareLatentsStep"), + ("denoise.denoise", "HunyuanVideo15DenoiseStep"), + ("decode", "HunyuanVideo15VaeDecoderStep"), + ], + "image2video": [ + ("text_encoder", "HunyuanVideo15TextEncoderStep"), + ("vae_encoder", "HunyuanVideo15VaeEncoderStep"), + ("image_encoder", "HunyuanVideo15ImageEncoderStep"), + ("denoise.input", "HunyuanVideo15TextInputStep"), + ("denoise.set_timesteps", "HunyuanVideo15SetTimestepsStep"), + ("denoise.prepare_latents", "HunyuanVideo15PrepareLatentsStep"), + ("denoise.prepare_i2v_latents", "HunyuanVideo15Image2VideoPrepareLatentsStep"), + ("denoise.denoise", "HunyuanVideo15Image2VideoDenoiseStep"), + ("decode", "HunyuanVideo15VaeDecoderStep"), + ], +} + + +class TestHunyuanVideo15ModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = HunyuanVideo15ModularPipeline + pipeline_blocks_class = HunyuanVideo15AutoBlocks + pretrained_model_name_or_path = "akshan-main/tiny-hunyuanvideo1_5-modular-pipe" + + params = frozenset(["prompt", "height", "width", "num_frames"]) + batch_params = frozenset(["prompt"]) + optional_params = frozenset(["num_inference_steps", "num_videos_per_prompt", "latents"]) + expected_workflow_blocks = HUNYUANVIDEO15_WORKFLOWS + output_name = "videos" + + def get_dummy_inputs(self, seed=0): + generator = self.get_generator(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "height": 32, + "width": 32, + "num_frames": 9, + "output_type": "pt", + } + return inputs + + 
@pytest.mark.skip(reason="num_videos_per_prompt") + def test_num_images_per_prompt(self): + pass + + @pytest.mark.skip(reason="VAE causal attention mask does not support batch>1 decode") + def test_inference_batch_consistent(self): + pass + + @pytest.mark.skip(reason="VAE causal attention mask does not support batch>1 decode") + def test_inference_batch_single_identical(self): + pass + + def test_float16_inference(self): + super().test_float16_inference(expected_max_diff=0.1)