diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index d2fd04068248..8ac9700239e6 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -456,6 +456,8 @@ "HeliosPyramidDistilledAutoBlocks", "HeliosPyramidDistilledModularPipeline", "HeliosPyramidModularPipeline", + "HunyuanVideo15AutoBlocks", + "HunyuanVideo15ModularPipeline", "LTXAutoBlocks", "LTXModularPipeline", "QwenImageAutoBlocks", @@ -1239,6 +1241,8 @@ HeliosPyramidDistilledAutoBlocks, HeliosPyramidDistilledModularPipeline, HeliosPyramidModularPipeline, + HunyuanVideo15AutoBlocks, + HunyuanVideo15ModularPipeline, LTXAutoBlocks, LTXModularPipeline, QwenImageAutoBlocks, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index c4891d1c0f7d..b7137249fe16 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -88,6 +88,10 @@ "QwenImageLayeredModularPipeline", "QwenImageLayeredAutoBlocks", ] + _import_structure["hunyuan_video1_5"] = [ + "HunyuanVideo15AutoBlocks", + "HunyuanVideo15ModularPipeline", + ] _import_structure["ltx"] = [ "LTXAutoBlocks", "LTXModularPipeline", @@ -123,6 +127,10 @@ HeliosPyramidDistilledModularPipeline, HeliosPyramidModularPipeline, ) + from .hunyuan_video1_5 import ( + HunyuanVideo15AutoBlocks, + HunyuanVideo15ModularPipeline, + ) from .ltx import LTXAutoBlocks, LTXModularPipeline from .modular_pipeline import ( AutoPipelineBlocks, diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/__init__.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/__init__.py new file mode 100644 index 000000000000..f2716e416229 --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/__init__.py @@ -0,0 +1,55 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = 
{} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["modular_blocks_hunyuan_video1_5"] = [ + "HunyuanVideo15AutoBlocks", + "HunyuanVideo15Blocks", + "HunyuanVideo15Image2VideoBlocks", + ] + _import_structure["modular_pipeline"] = ["HunyuanVideo15ModularPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .modular_blocks_hunyuan_video1_5 import ( + HunyuanVideo15AutoBlocks, + HunyuanVideo15Blocks, + HunyuanVideo15Image2VideoBlocks, + ) + from .modular_pipeline import HunyuanVideo15ModularPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/before_denoise.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/before_denoise.py new file mode 100644 index 000000000000..c5fff8f8eca9 --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/before_denoise.py @@ -0,0 +1,305 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect + +import numpy as np +import torch + +from ...models import HunyuanVideo15Transformer3DModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import logging +from ...utils.torch_utils import randn_tensor +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .modular_pipeline import HunyuanVideo15ModularPipeline + + +logger = logging.get_logger(__name__) + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: int | None = None, + device: str | torch.device | None = None, + timesteps: list[int] | None = None, + sigmas: list[float] | None = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`list[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. 
If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`list[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class HunyuanVideo15TextInputStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "Input processing step that determines batch_size" + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), + InputParam.template("prompt_embeds"), + InputParam.template("batch_size", default=None), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("batch_size", type_hint=int), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + block_state.batch_size = getattr(block_state, "batch_size", None) or block_state.prompt_embeds.shape[0] + self.set_block_state(state, block_state) + return components, state + + +class HunyuanVideo15SetTimestepsStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)] + + @property + def description(self) -> str: + return "Step that sets the scheduler's timesteps for inference" + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("timesteps", type_hint=torch.Tensor), + OutputParam("num_inference_steps", type_hint=int), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: 
PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + + sigmas = block_state.sigmas + if sigmas is None: + sigmas = np.linspace(1.0, 0.0, block_state.num_inference_steps + 1)[:-1] + + block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps( + components.scheduler, block_state.num_inference_steps, device, sigmas=sigmas + ) + + self.set_block_state(state, block_state) + return components, state + + +class HunyuanVideo15PrepareLatentsStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "Prepare latents, conditioning latents, mask, and image_embeds for T2V" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ComponentSpec("transformer", HunyuanVideo15Transformer3DModel)] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("height"), + InputParam.template("width"), + InputParam("num_frames", type_hint=int, default=121), + InputParam.template("latents"), + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size", required=True, default=None), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("latents", type_hint=torch.Tensor, description="Pure noise latents"), + OutputParam("cond_latents_concat", type_hint=torch.Tensor), + OutputParam("mask_concat", type_hint=torch.Tensor), + OutputParam("image_embeds", type_hint=torch.Tensor), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + dtype = components.transformer.dtype + + height = block_state.height + width = block_state.width + if height is None and width is None: + height, width = 
components.video_processor.calculate_default_height_width( + components.default_aspect_ratio[1], components.default_aspect_ratio[0], components.target_size + ) + + batch_size = block_state.batch_size * block_state.num_videos_per_prompt + num_frames = block_state.num_frames + + latents = block_state.latents + if latents is not None: + latents = latents.to(device=device, dtype=dtype) + else: + shape = ( + batch_size, + components.num_channels_latents, + (num_frames - 1) // components.vae_scale_factor_temporal + 1, + int(height) // components.vae_scale_factor_spatial, + int(width) // components.vae_scale_factor_spatial, + ) + if isinstance(block_state.generator, list) and len(block_state.generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + latents = randn_tensor(shape, generator=block_state.generator, device=device, dtype=dtype) + + block_state.latents = latents + + b, c, f, h, w = latents.shape + block_state.cond_latents_concat = torch.zeros(b, c, f, h, w, dtype=dtype, device=device) + block_state.mask_concat = torch.zeros(b, 1, f, h, w, dtype=dtype, device=device) + + block_state.image_embeds = torch.zeros( + block_state.batch_size, + components.vision_num_semantic_tokens, + components.vision_states_dim, + dtype=dtype, + device=device, + ) + + self.set_block_state(state, block_state) + return components, state + + +class HunyuanVideo15Image2VideoPrepareLatentsStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return ( + "Prepare I2V conditioning from image_latents and image_embeds. " + "Expects pure noise `latents` from HunyuanVideo15PrepareLatentsStep. " + "Builds cond_latents_concat and mask_concat for the denoiser." 
+ ) + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ComponentSpec("transformer", HunyuanVideo15Transformer3DModel)] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("image_latents", type_hint=torch.Tensor, required=True), + InputParam("image_embeds", type_hint=torch.Tensor, required=True), + InputParam.template("latents", required=True), + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), + InputParam.template("batch_size", required=True, default=None), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("cond_latents_concat", type_hint=torch.Tensor), + OutputParam("mask_concat", type_hint=torch.Tensor), + OutputParam("image_embeds", type_hint=torch.Tensor), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + dtype = components.transformer.dtype + + batch_size = block_state.batch_size * block_state.num_videos_per_prompt + + b, c, f, h, w = block_state.latents.shape + + latent_condition = block_state.image_latents.to(device=device, dtype=dtype) + latent_condition = latent_condition.repeat(batch_size, 1, f, 1, 1) + latent_condition[:, :, 1:, :, :] = 0 + block_state.cond_latents_concat = latent_condition + + latent_mask = torch.zeros(b, 1, f, h, w, dtype=dtype, device=device) + latent_mask[:, :, 0, :, :] = 1.0 + block_state.mask_concat = latent_mask + + image_embeds = block_state.image_embeds.to(device=device, dtype=dtype) + if image_embeds.shape[0] == 1 and batch_size > 1: + image_embeds = image_embeds.repeat(batch_size, 1, 1) + block_state.image_embeds = image_embeds + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/decoders.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/decoders.py new 
file mode 100644 index 000000000000..f6b9eb68559f --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/decoders.py @@ -0,0 +1,70 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch + +from ...configuration_utils import FrozenDict +from ...models import AutoencoderKLHunyuanVideo15 +from ...pipelines.hunyuan_video1_5.image_processor import HunyuanVideo15ImageProcessor +from ...utils import logging +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam + + +logger = logging.get_logger(__name__) + + +class HunyuanVideo15VaeDecoderStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("vae", AutoencoderKLHunyuanVideo15), + ComponentSpec( + "video_processor", + HunyuanVideo15ImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def description(self) -> str: + return "Step that decodes the denoised latents into videos" + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("latents", required=True), + InputParam.template("output_type", default="np"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam.template("videos"), + ] + + 
@torch.no_grad() + def __call__(self, components, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + latents = block_state.latents.to(components.vae.dtype) / components.vae.config.scaling_factor + video = components.vae.decode(latents, return_dict=False)[0] + block_state.videos = components.video_processor.postprocess_video(video, output_type=block_state.output_type) + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/denoise.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/denoise.py new file mode 100644 index 000000000000..033cd60e29de --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/denoise.py @@ -0,0 +1,353 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import torch + +from ...configuration_utils import FrozenDict +from ...guiders import ClassifierFreeGuidance +from ...models import HunyuanVideo15Transformer3DModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import logging +from ..modular_pipeline import ( + BlockState, + LoopSequentialPipelineBlocks, + ModularPipelineBlocks, + PipelineState, +) +from ..modular_pipeline_utils import ComponentSpec, InputParam +from .modular_pipeline import HunyuanVideo15ModularPipeline + + +logger = logging.get_logger(__name__) + + +class HunyuanVideo15LoopBeforeDenoiser(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "Step within the denoising loop that prepares the latent input" + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("latents", required=True), + InputParam("cond_latents_concat", required=True, type_hint=torch.Tensor), + InputParam("mask_concat", required=True, type_hint=torch.Tensor), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + block_state.latent_model_input = torch.cat( + [block_state.latents, block_state.cond_latents_concat, block_state.mask_concat], dim=1 + ) + return components, block_state + + +class HunyuanVideo15LoopDenoiser(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + def __init__(self, guider_input_fields=None): + if guider_input_fields is None: + guider_input_fields = { + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + "encoder_attention_mask": ("prompt_embeds_mask", "negative_prompt_embeds_mask"), + "encoder_hidden_states_2": ("prompt_embeds_2", "negative_prompt_embeds_2"), + "encoder_attention_mask_2": ("prompt_embeds_mask_2", "negative_prompt_embeds_mask_2"), + } + if not isinstance(guider_input_fields, dict): + raise ValueError(f"guider_input_fields must be a dictionary but is 
{type(guider_input_fields)}") + self._guider_input_fields = guider_input_fields + super().__init__() + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 7.5}), + default_creation_method="from_config", + ), + ComponentSpec("transformer", HunyuanVideo15Transformer3DModel), + ] + + @property + def description(self) -> str: + return "Step within the denoising loop that denoises the latents with guidance" + + @property + def inputs(self) -> list[InputParam]: + inputs = [ + InputParam.template("attention_kwargs"), + InputParam.template("num_inference_steps", required=True, default=None), + InputParam("image_embeds", type_hint=torch.Tensor), + ] + for value in self._guider_input_fields.values(): + if isinstance(value, tuple): + inputs.append(InputParam(name=value[0], required=True, type_hint=torch.Tensor)) + for neg_name in value[1:]: + inputs.append(InputParam(name=neg_name, type_hint=torch.Tensor)) + else: + inputs.append(InputParam(name=value, required=True, type_hint=torch.Tensor)) + return inputs + + @torch.no_grad() + def __call__( + self, components: HunyuanVideo15ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor + ) -> PipelineState: + timestep = t.expand(block_state.latent_model_input.shape[0]).to(block_state.latent_model_input.dtype) + + # Step 1: Collect model inputs + guider_inputs = { + input_name: tuple(getattr(block_state, v) for v in value) + if isinstance(value, tuple) + else getattr(block_state, value) + for input_name, value in self._guider_input_fields.items() + } + + # Step 2: Update guider state + components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t) + + # Step 3: Prepare batched inputs + guider_state = components.guider.prepare_inputs(guider_inputs) + + # Step 4: Run denoiser for each batch + for guider_state_batch in guider_state: + 
components.guider.prepare_models(components.transformer) + + cond_kwargs = {input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()} + + context_name = getattr(guider_state_batch, components.guider._identifier_key) + with components.transformer.cache_context(context_name): + guider_state_batch.noise_pred = components.transformer( + hidden_states=block_state.latent_model_input, + image_embeds=block_state.image_embeds, + timestep=timestep, + attention_kwargs=block_state.attention_kwargs, + return_dict=False, + **cond_kwargs, + )[0] + + components.guider.cleanup_models(components.transformer) + + # Step 5: Combine predictions + block_state.noise_pred = components.guider(guider_state)[0] + + return components, block_state + + +class HunyuanVideo15LoopAfterDenoiser(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)] + + @property + def description(self) -> str: + return "Step within the denoising loop that updates the latents" + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor): + latents_dtype = block_state.latents.dtype + block_state.latents = components.scheduler.step( + block_state.noise_pred, t, block_state.latents, return_dict=False + )[0] + + if block_state.latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + block_state.latents = block_state.latents.to(latents_dtype) + + return components, block_state + + +class HunyuanVideo15DenoiseLoopWrapper(LoopSequentialPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "Pipeline block that iteratively denoises the latents over timesteps" + + @property + def loop_expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler), + 
ComponentSpec("transformer", HunyuanVideo15Transformer3DModel), + ] + + @property + def loop_inputs(self) -> list[InputParam]: + return [ + InputParam.template("timesteps", required=True), + InputParam.template("num_inference_steps", required=True, default=None), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + block_state.num_warmup_steps = max( + len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order, 0 + ) + + with self.progress_bar(total=block_state.num_inference_steps) as progress_bar: + for i, t in enumerate(block_state.timesteps): + components, block_state = self.loop_step(components, block_state, i=i, t=t) + if i == len(block_state.timesteps) - 1 or ( + (i + 1) > block_state.num_warmup_steps and (i + 1) % components.scheduler.order == 0 + ): + progress_bar.update() + + self.set_block_state(state, block_state) + return components, state + + +class HunyuanVideo15DenoiseStep(HunyuanVideo15DenoiseLoopWrapper): + block_classes = [ + HunyuanVideo15LoopBeforeDenoiser, + HunyuanVideo15LoopDenoiser(), + HunyuanVideo15LoopAfterDenoiser, + ] + block_names = ["before_denoiser", "denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoises the latents.\n" + "At each iteration:\n" + " - `HunyuanVideo15LoopBeforeDenoiser`\n" + " - `HunyuanVideo15LoopDenoiser`\n" + " - `HunyuanVideo15LoopAfterDenoiser`\n" + "This block supports text-to-video tasks." 
+ ) + + +class HunyuanVideo15Image2VideoLoopDenoiser(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + def __init__(self, guider_input_fields=None): + if guider_input_fields is None: + guider_input_fields = { + "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"), + "encoder_attention_mask": ("prompt_embeds_mask", "negative_prompt_embeds_mask"), + "encoder_hidden_states_2": ("prompt_embeds_2", "negative_prompt_embeds_2"), + "encoder_attention_mask_2": ("prompt_embeds_mask_2", "negative_prompt_embeds_mask_2"), + } + if not isinstance(guider_input_fields, dict): + raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}") + self._guider_input_fields = guider_input_fields + super().__init__() + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 7.5}), + default_creation_method="from_config", + ), + ComponentSpec("transformer", HunyuanVideo15Transformer3DModel), + ] + + @property + def description(self) -> str: + return "I2V denoiser with MeanFlow timestep_r support" + + @property + def inputs(self) -> list[InputParam]: + inputs = [ + InputParam.template("attention_kwargs"), + InputParam.template("num_inference_steps", required=True, default=None), + InputParam("image_embeds", type_hint=torch.Tensor), + InputParam.template("timesteps", required=True), + ] + for value in self._guider_input_fields.values(): + if isinstance(value, tuple): + inputs.append(InputParam(name=value[0], required=True, type_hint=torch.Tensor)) + for neg_name in value[1:]: + inputs.append(InputParam(name=neg_name, type_hint=torch.Tensor)) + else: + inputs.append(InputParam(name=value, required=True, type_hint=torch.Tensor)) + return inputs + + @torch.no_grad() + def __call__( + self, components: HunyuanVideo15ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor + ) -> PipelineState: + timestep = 
t.expand(block_state.latent_model_input.shape[0]).to(block_state.latent_model_input.dtype) + + # MeanFlow timestep_r (lines 855-862) + if components.transformer.config.use_meanflow: + if i == len(block_state.timesteps) - 1: + timestep_r = torch.tensor([0.0], device=timestep.device) + else: + timestep_r = block_state.timesteps[i + 1] + timestep_r = timestep_r.expand(block_state.latents.shape[0]).to(block_state.latents.dtype) + else: + timestep_r = None + + guider_inputs = { + input_name: tuple(getattr(block_state, v) for v in value) + if isinstance(value, tuple) + else getattr(block_state, value) + for input_name, value in self._guider_input_fields.items() + } + + components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t) + guider_state = components.guider.prepare_inputs(guider_inputs) + + for guider_state_batch in guider_state: + components.guider.prepare_models(components.transformer) + + cond_kwargs = {input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()} + + context_name = getattr(guider_state_batch, components.guider._identifier_key) + with components.transformer.cache_context(context_name): + guider_state_batch.noise_pred = components.transformer( + hidden_states=block_state.latent_model_input, + image_embeds=block_state.image_embeds, + timestep=timestep, + timestep_r=timestep_r, + attention_kwargs=block_state.attention_kwargs, + return_dict=False, + **cond_kwargs, + )[0] + + components.guider.cleanup_models(components.transformer) + + block_state.noise_pred = components.guider(guider_state)[0] + + return components, block_state + + +class HunyuanVideo15Image2VideoDenoiseStep(HunyuanVideo15DenoiseLoopWrapper): + block_classes = [ + HunyuanVideo15LoopBeforeDenoiser, + HunyuanVideo15Image2VideoLoopDenoiser(), + HunyuanVideo15LoopAfterDenoiser, + ] + block_names = ["before_denoiser", "denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step 
for image-to-video with MeanFlow support.\n" + "At each iteration:\n" + " - `HunyuanVideo15LoopBeforeDenoiser`\n" + " - `HunyuanVideo15Image2VideoLoopDenoiser`\n" + " - `HunyuanVideo15LoopAfterDenoiser`" + ) diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/encoders.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/encoders.py new file mode 100644 index 000000000000..0922186a8bda --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/encoders.py @@ -0,0 +1,445 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import re + +import torch +from transformers import ( + ByT5Tokenizer, + Qwen2_5_VLTextModel, + Qwen2TokenizerFast, + SiglipImageProcessor, + SiglipVisionModel, + T5EncoderModel, +) + +from ...configuration_utils import FrozenDict +from ...guiders import ClassifierFreeGuidance +from ...models import AutoencoderKLHunyuanVideo15 +from ...pipelines.hunyuan_video1_5.image_processor import HunyuanVideo15ImageProcessor +from ...utils import logging +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .modular_pipeline import HunyuanVideo15ModularPipeline + + +logger = logging.get_logger(__name__) + + +def format_text_input(prompt, system_message): + return [ + [{"role": "system", "content": system_message}, {"role": "user", "content": p if p else " "}] for p in prompt + ] + + +def extract_glyph_texts(prompt): + pattern = r"“(.*?)”|\"(.*?)\"" + matches = re.findall(pattern, prompt) + result = [match[0] or match[1] for match in matches] + result = list(dict.fromkeys(result)) if len(result) > 1 else result + if result: + formatted_result = ". ".join([f'Text "{text}"' for text in result]) + ". " + else: + formatted_result = None + return formatted_result + + +def _get_mllm_prompt_embeds( + text_encoder, + tokenizer, + prompt, + device, + tokenizer_max_length=1000, + num_hidden_layers_to_skip=2, + # fmt: off + system_message="You are a helpful assistant. Describe the video by detailing the following aspects: \ + 1. The main content and theme of the video. \ + 2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects. \ + 3. Actions, events, behaviors temporal relationships, physical movement changes of the objects. \ + 4. background environment, light, style and atmosphere. \ + 5. 
camera angles, movements, and transitions used in the video.", + # fmt: on + crop_start=108, +): + prompt = [prompt] if isinstance(prompt, str) else prompt + prompt = format_text_input(prompt, system_message) + + text_inputs = tokenizer.apply_chat_template( + prompt, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + padding="max_length", + max_length=tokenizer_max_length + crop_start, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids.to(device=device) + prompt_attention_mask = text_inputs.attention_mask.to(device=device) + + prompt_embeds = text_encoder( + input_ids=text_input_ids, + attention_mask=prompt_attention_mask, + output_hidden_states=True, + ).hidden_states[-(num_hidden_layers_to_skip + 1)] + + if crop_start is not None and crop_start > 0: + prompt_embeds = prompt_embeds[:, crop_start:] + prompt_attention_mask = prompt_attention_mask[:, crop_start:] + + return prompt_embeds, prompt_attention_mask + + +def _get_byt5_prompt_embeds(tokenizer, text_encoder, prompt, device, tokenizer_max_length=256): + prompt = [prompt] if isinstance(prompt, str) else prompt + glyph_texts = [extract_glyph_texts(p) for p in prompt] + + prompt_embeds_list = [] + prompt_embeds_mask_list = [] + + for glyph_text in glyph_texts: + if glyph_text is None: + glyph_text_embeds = torch.zeros( + (1, tokenizer_max_length, text_encoder.config.d_model), device=device, dtype=text_encoder.dtype + ) + glyph_text_embeds_mask = torch.zeros((1, tokenizer_max_length), device=device, dtype=torch.int64) + else: + txt_tokens = tokenizer( + glyph_text, + padding="max_length", + max_length=tokenizer_max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ).to(device) + + glyph_text_embeds = text_encoder( + input_ids=txt_tokens.input_ids, + attention_mask=txt_tokens.attention_mask.float(), + )[0] + glyph_text_embeds = glyph_text_embeds.to(device=device) + glyph_text_embeds_mask = 
txt_tokens.attention_mask.to(device=device) + + prompt_embeds_list.append(glyph_text_embeds) + prompt_embeds_mask_list.append(glyph_text_embeds_mask) + + return torch.cat(prompt_embeds_list, dim=0), torch.cat(prompt_embeds_mask_list, dim=0) + + +class HunyuanVideo15TextEncoderStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "Dual text encoder step using Qwen2.5-VL (MLLM) and ByT5 (glyph text)" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("text_encoder", Qwen2_5_VLTextModel), + ComponentSpec("tokenizer", Qwen2TokenizerFast), + ComponentSpec("text_encoder_2", T5EncoderModel), + ComponentSpec("tokenizer_2", ByT5Tokenizer), + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 7.5}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("prompt", required=False), + InputParam.template("negative_prompt"), + InputParam.template("prompt_embeds", required=False), + InputParam.template("prompt_embeds_mask", required=False), + InputParam.template("negative_prompt_embeds"), + InputParam.template("negative_prompt_embeds_mask"), + InputParam("prompt_embeds_2", type_hint=torch.Tensor), + InputParam("prompt_embeds_mask_2", type_hint=torch.Tensor), + InputParam("negative_prompt_embeds_2", type_hint=torch.Tensor), + InputParam("negative_prompt_embeds_mask_2", type_hint=torch.Tensor), + InputParam.template("num_images_per_prompt", name="num_videos_per_prompt"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), + OutputParam("prompt_embeds_2", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields"), + 
OutputParam("prompt_embeds_mask_2", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields"), + OutputParam("negative_prompt_embeds_2", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields"), + OutputParam("negative_prompt_embeds_mask_2", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields"), + ] + + @staticmethod + def encode_prompt( + components, + prompt, + device=None, + dtype=None, + batch_size=1, + num_videos_per_prompt=1, + prompt_embeds=None, + prompt_embeds_mask=None, + prompt_embeds_2=None, + prompt_embeds_mask_2=None, + ): + device = device or components._execution_device + dtype = dtype or components.text_encoder.dtype + + if prompt is None: + prompt = [""] * batch_size + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt_embeds is None: + prompt_embeds, prompt_embeds_mask = _get_mllm_prompt_embeds( + tokenizer=components.tokenizer, + text_encoder=components.text_encoder, + prompt=prompt, + device=device, + tokenizer_max_length=components.tokenizer_max_length, + system_message=components.system_message, + crop_start=components.prompt_template_encode_start_idx, + ) + + if prompt_embeds_2 is None: + prompt_embeds_2, prompt_embeds_mask_2 = _get_byt5_prompt_embeds( + tokenizer=components.tokenizer_2, + text_encoder=components.text_encoder_2, + prompt=prompt, + device=device, + tokenizer_max_length=components.tokenizer_2_max_length, + ) + + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1).view( + batch_size * num_videos_per_prompt, seq_len, -1 + ) + prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_videos_per_prompt, 1).view( + batch_size * num_videos_per_prompt, seq_len + ) + + _, seq_len_2, _ = prompt_embeds_2.shape + prompt_embeds_2 = prompt_embeds_2.repeat(1, num_videos_per_prompt, 1).view( + batch_size * num_videos_per_prompt, seq_len_2, -1 + ) + prompt_embeds_mask_2 = prompt_embeds_mask_2.repeat(1, num_videos_per_prompt, 1).view( + batch_size * 
num_videos_per_prompt, seq_len_2 + ) + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + prompt_embeds_mask = prompt_embeds_mask.to(dtype=dtype, device=device) + prompt_embeds_2 = prompt_embeds_2.to(dtype=dtype, device=device) + prompt_embeds_mask_2 = prompt_embeds_mask_2.to(dtype=dtype, device=device) + + return prompt_embeds, prompt_embeds_mask, prompt_embeds_2, prompt_embeds_mask_2 + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + dtype = components.transformer.dtype + + prompt = block_state.prompt + negative_prompt = block_state.negative_prompt + num_videos_per_prompt = block_state.num_videos_per_prompt + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + elif getattr(block_state, "prompt_embeds", None) is not None: + batch_size = block_state.prompt_embeds.shape[0] + else: + batch_size = 1 + + ( + block_state.prompt_embeds, + block_state.prompt_embeds_mask, + block_state.prompt_embeds_2, + block_state.prompt_embeds_mask_2, + ) = self.encode_prompt( + components, + prompt=prompt, + device=device, + dtype=dtype, + batch_size=batch_size, + num_videos_per_prompt=num_videos_per_prompt, + prompt_embeds=getattr(block_state, "prompt_embeds", None), + prompt_embeds_mask=getattr(block_state, "prompt_embeds_mask", None), + prompt_embeds_2=getattr(block_state, "prompt_embeds_2", None), + prompt_embeds_mask_2=getattr(block_state, "prompt_embeds_mask_2", None), + ) + + if components.requires_unconditional_embeds: + ( + block_state.negative_prompt_embeds, + block_state.negative_prompt_embeds_mask, + block_state.negative_prompt_embeds_2, + block_state.negative_prompt_embeds_mask_2, + ) = self.encode_prompt( + components, + prompt=negative_prompt, + device=device, + dtype=dtype, + batch_size=batch_size, 
+ num_videos_per_prompt=num_videos_per_prompt, + prompt_embeds=getattr(block_state, "negative_prompt_embeds", None), + prompt_embeds_mask=getattr(block_state, "negative_prompt_embeds_mask", None), + prompt_embeds_2=getattr(block_state, "negative_prompt_embeds_2", None), + prompt_embeds_mask_2=getattr(block_state, "negative_prompt_embeds_mask_2", None), + ) + + state.set("batch_size", batch_size) + + self.set_block_state(state, block_state) + return components, state + + +def retrieve_latents( + encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +class HunyuanVideo15VaeEncoderStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "VAE Encoder step that encodes an input image into latent space for image-to-video generation" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("vae", AutoencoderKLHunyuanVideo15), + ComponentSpec( + "video_processor", + HunyuanVideo15ImageProcessor, + config=FrozenDict({"vae_scale_factor": 16}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("image", required=True), + InputParam.template("height"), + InputParam.template("width"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam( + "image_latents", + type_hint=torch.Tensor, + description="Encoded image latents from the VAE encoder", + ), + OutputParam("height", type_hint=int, 
description="Target height resolved from image"), + OutputParam("width", type_hint=int, description="Target width resolved from image"), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + + image = block_state.image + height = block_state.height + width = block_state.width + if height is None or width is None: + height, width = components.video_processor.calculate_default_height_width( + height=image.size[1], width=image.size[0], target_size=components.target_size + ) + image = components.video_processor.resize(image, height=height, width=width, resize_mode="crop") + + vae_dtype = components.vae.dtype + image_tensor = components.video_processor.preprocess(image, height=height, width=width).to( + device=device, dtype=vae_dtype + ) + image_tensor = image_tensor.unsqueeze(2) + image_latents = retrieve_latents(components.vae.encode(image_tensor), sample_mode="argmax") + image_latents = image_latents * components.vae.config.scaling_factor + + block_state.image_latents = image_latents + block_state.height = height + block_state.width = width + state.set("image", image) + + self.set_block_state(state, block_state) + return components, state + + +class HunyuanVideo15ImageEncoderStep(ModularPipelineBlocks): + model_name = "hunyuan-video-1.5" + + @property + def description(self) -> str: + return "Siglip image encoder step that produces image_embeds for image-to-video generation" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("image_encoder", SiglipVisionModel), + ComponentSpec("feature_extractor", SiglipImageProcessor), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("image", required=True), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam( + "image_embeds", + 
type_hint=torch.Tensor, + description="Image embeddings from the Siglip vision encoder", + ), + ] + + @torch.no_grad() + def __call__(self, components: HunyuanVideo15ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + + image_encoder_dtype = next(components.image_encoder.parameters()).dtype + image_inputs = components.feature_extractor.preprocess( + images=block_state.image, do_resize=True, return_tensors="pt", do_convert_rgb=True + ) + image_inputs = image_inputs.to(device=device, dtype=image_encoder_dtype) + image_embeds = components.image_encoder(**image_inputs).last_hidden_state + + block_state.image_embeds = image_embeds + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/modular_blocks_hunyuan_video1_5.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/modular_blocks_hunyuan_video1_5.py new file mode 100644 index 000000000000..6c8bb475dac1 --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/modular_blocks_hunyuan_video1_5.py @@ -0,0 +1,583 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import OutputParam +from .before_denoise import ( + HunyuanVideo15Image2VideoPrepareLatentsStep, + HunyuanVideo15PrepareLatentsStep, + HunyuanVideo15SetTimestepsStep, + HunyuanVideo15TextInputStep, +) +from .decoders import HunyuanVideo15VaeDecoderStep +from .denoise import HunyuanVideo15DenoiseStep, HunyuanVideo15Image2VideoDenoiseStep +from .encoders import ( + HunyuanVideo15ImageEncoderStep, + HunyuanVideo15TextEncoderStep, + HunyuanVideo15VaeEncoderStep, +) + + +logger = logging.get_logger(__name__) + + +# auto_docstring +class HunyuanVideo15CoreDenoiseStep(SequentialPipelineBlocks): + """ + Denoise block that takes encoded conditions and runs the denoising process. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`HunyuanVideo15Transformer3DModel`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + batch_size (`int`, *optional*): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_frames (`int`, *optional*, defaults to 121): + TODO: Add description. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. 
+ attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask (`Tensor`): + TODO: Add description. + negative_prompt_embeds_mask (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_2 (`Tensor`): + TODO: Add description. + negative_prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask_2 (`Tensor`): + TODO: Add description. + negative_prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "hunyuan-video-1.5" + block_classes = [ + HunyuanVideo15TextInputStep, + HunyuanVideo15SetTimestepsStep, + HunyuanVideo15PrepareLatentsStep, + HunyuanVideo15DenoiseStep, + ] + block_names = ["input", "set_timesteps", "prepare_latents", "denoise"] + + @property + def description(self): + return "Denoise block that takes encoded conditions and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] + + +# auto_docstring +class HunyuanVideo15Blocks(SequentialPipelineBlocks): + """ + Modular pipeline blocks for HunyuanVideo 1.5 text-to-video. + + Components: + text_encoder (`Qwen2_5_VLTextModel`) tokenizer (`Qwen2TokenizerFast`) text_encoder_2 (`T5EncoderModel`) + tokenizer_2 (`ByT5Tokenizer`) guider (`ClassifierFreeGuidance`) scheduler (`FlowMatchEulerDiscreteScheduler`) + transformer (`HunyuanVideo15Transformer3DModel`) vae (`AutoencoderKLHunyuanVideo15`) video_processor + (`HunyuanVideo15ImageProcessor`) + + Inputs: + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + prompt_embeds (`Tensor`, *optional*): + text embeddings used to guide the image generation. Can be generated from text_encoder step. 
+ prompt_embeds_mask (`Tensor`, *optional*): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_frames (`int`, *optional*, defaults to 121): + TODO: Add description. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + output_type (`str`, *optional*, defaults to np): + Output format: 'pil', 'np', 'pt'. + + Outputs: + videos (`list`): + The generated videos. 
+ """ + + model_name = "hunyuan-video-1.5" + block_classes = [ + HunyuanVideo15TextEncoderStep, + HunyuanVideo15CoreDenoiseStep, + HunyuanVideo15VaeDecoderStep, + ] + block_names = ["text_encoder", "denoise", "decode"] + + @property + def description(self): + return "Modular pipeline blocks for HunyuanVideo 1.5 text-to-video." + + @property + def outputs(self): + return [OutputParam.template("videos")] + + +# auto_docstring +class HunyuanVideo15Image2VideoCoreDenoiseStep(SequentialPipelineBlocks): + """ + Denoise block for image-to-video that takes encoded conditions and runs the denoising process. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`HunyuanVideo15Transformer3DModel`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + batch_size (`int`, *optional*): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_frames (`int`, *optional*, defaults to 121): + TODO: Add description. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + image_latents (`Tensor`): + TODO: Add description. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. 
+ prompt_embeds_mask (`Tensor`): + TODO: Add description. + negative_prompt_embeds_mask (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_2 (`Tensor`): + TODO: Add description. + negative_prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask_2 (`Tensor`): + TODO: Add description. + negative_prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "hunyuan-video-1.5" + block_classes = [ + HunyuanVideo15TextInputStep, + HunyuanVideo15SetTimestepsStep, + HunyuanVideo15PrepareLatentsStep, + HunyuanVideo15Image2VideoPrepareLatentsStep, + HunyuanVideo15Image2VideoDenoiseStep, + ] + block_names = ["input", "set_timesteps", "prepare_latents", "prepare_i2v_latents", "denoise"] + + @property + def description(self): + return "Denoise block for image-to-video that takes encoded conditions and runs the denoising process." + + @property + def outputs(self): + return [OutputParam.template("latents")] + + +# auto_docstring +class HunyuanVideo15AutoVaeEncoderStep(AutoPipelineBlocks): + """ + VAE encoder step that encodes the image input into its latent representation. + This is an auto pipeline block that works for image-to-video tasks. + - `HunyuanVideo15VaeEncoderStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + vae (`AutoencoderKLHunyuanVideo15`) video_processor (`HunyuanVideo15ImageProcessor`) + + Inputs: + image (`Image | list`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. 
+ + Outputs: + image_latents (`Tensor`): + Encoded image latents from the VAE encoder + height (`int`): + Target height resolved from image + width (`int`): + Target width resolved from image + """ + + model_name = "hunyuan-video-1.5" + block_classes = [HunyuanVideo15VaeEncoderStep] + block_names = ["vae_encoder"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "VAE encoder step that encodes the image input into its latent representation.\n" + "This is an auto pipeline block that works for image-to-video tasks.\n" + " - `HunyuanVideo15VaeEncoderStep` is used when `image` is provided.\n" + " - If `image` is not provided, step will be skipped." + ) + + +# auto_docstring +class HunyuanVideo15AutoImageEncoderStep(AutoPipelineBlocks): + """ + Siglip image encoder step that produces image_embeds. + This is an auto pipeline block that works for image-to-video tasks. + - `HunyuanVideo15ImageEncoderStep` is used when `image` is provided. + - If `image` is not provided, step will be skipped. + + Components: + image_encoder (`SiglipVisionModel`) feature_extractor (`SiglipImageProcessor`) + + Inputs: + image (`Image | list`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. + + Outputs: + image_embeds (`Tensor`): + Image embeddings from the Siglip vision encoder + """ + + model_name = "hunyuan-video-1.5" + block_classes = [HunyuanVideo15ImageEncoderStep] + block_names = ["image_encoder"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "Siglip image encoder step that produces image_embeds.\n" + "This is an auto pipeline block that works for image-to-video tasks.\n" + " - `HunyuanVideo15ImageEncoderStep` is used when `image` is provided.\n" + " - If `image` is not provided, step will be skipped." 
+ ) + + +# auto_docstring +class HunyuanVideo15AutoCoreDenoiseStep(AutoPipelineBlocks): + """ + Auto denoise block that selects the appropriate denoise pipeline based on inputs. + - `HunyuanVideo15Image2VideoCoreDenoiseStep` is used when `image_latents` is provided. + - `HunyuanVideo15CoreDenoiseStep` is used otherwise (text-to-video). + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`HunyuanVideo15Transformer3DModel`) guider + (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + num_inference_steps (`int`): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_frames (`int`, *optional*, defaults to 121): + TODO: Add description. + latents (`Tensor`): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + image_latents (`Tensor`, *optional*): + TODO: Add description. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + negative_prompt_embeds (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask (`Tensor`): + TODO: Add description. + negative_prompt_embeds_mask (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_2 (`Tensor`): + TODO: Add description. + negative_prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask_2 (`Tensor`): + TODO: Add description. 
+ negative_prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + model_name = "hunyuan-video-1.5" + block_classes = [HunyuanVideo15Image2VideoCoreDenoiseStep, HunyuanVideo15CoreDenoiseStep] + block_names = ["image2video", "text2video"] + block_trigger_inputs = ["image_latents", None] + + @property + def description(self): + return ( + "Auto denoise block that selects the appropriate denoise pipeline based on inputs.\n" + " - `HunyuanVideo15Image2VideoCoreDenoiseStep` is used when `image_latents` is provided.\n" + " - `HunyuanVideo15CoreDenoiseStep` is used otherwise (text-to-video)." + ) + + +# auto_docstring +class HunyuanVideo15AutoBlocks(SequentialPipelineBlocks): + """ + Auto blocks for HunyuanVideo 1.5 that support both text-to-video and image-to-video workflows. + + Supported workflows: + - `text2video`: requires `prompt` + - `image2video`: requires `image`, `prompt` + + Components: + text_encoder (`Qwen2_5_VLTextModel`) tokenizer (`Qwen2TokenizerFast`) text_encoder_2 (`T5EncoderModel`) + tokenizer_2 (`ByT5Tokenizer`) guider (`ClassifierFreeGuidance`) vae (`AutoencoderKLHunyuanVideo15`) + video_processor (`HunyuanVideo15ImageProcessor`) image_encoder (`SiglipVisionModel`) feature_extractor + (`SiglipImageProcessor`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer + (`HunyuanVideo15Transformer3DModel`) + + Inputs: + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + prompt_embeds (`Tensor`, *optional*): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`, *optional*): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. 
Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds_2 (`Tensor`, *optional*): + TODO: Add description. + negative_prompt_embeds_mask_2 (`Tensor`, *optional*): + TODO: Add description. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + image (`Image | list`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + batch_size (`int`): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + num_inference_steps (`int`): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + num_frames (`int`, *optional*, defaults to 121): + TODO: Add description. + latents (`Tensor`): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + image_latents (`Tensor`, *optional*): + TODO: Add description. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + output_type (`str`, *optional*, defaults to np): + Output format: 'pil', 'np', 'pt'. + + Outputs: + videos (`list`): + The generated videos. 
+ """ + + model_name = "hunyuan-video-1.5" + block_classes = [ + HunyuanVideo15TextEncoderStep, + HunyuanVideo15AutoVaeEncoderStep, + HunyuanVideo15AutoImageEncoderStep, + HunyuanVideo15AutoCoreDenoiseStep, + HunyuanVideo15VaeDecoderStep, + ] + block_names = ["text_encoder", "vae_encoder", "image_encoder", "denoise", "decode"] + _workflow_map = { + "text2video": {"prompt": True}, + "image2video": {"image": True, "prompt": True}, + } + + @property + def description(self): + return "Auto blocks for HunyuanVideo 1.5 that support both text-to-video and image-to-video workflows." + + @property + def outputs(self): + return [OutputParam.template("videos")] + + +# auto_docstring +class HunyuanVideo15Image2VideoBlocks(SequentialPipelineBlocks): + """ + Modular pipeline blocks for HunyuanVideo 1.5 image-to-video. + + Components: + text_encoder (`Qwen2_5_VLTextModel`) tokenizer (`Qwen2TokenizerFast`) text_encoder_2 (`T5EncoderModel`) + tokenizer_2 (`ByT5Tokenizer`) guider (`ClassifierFreeGuidance`) vae (`AutoencoderKLHunyuanVideo15`) + video_processor (`HunyuanVideo15ImageProcessor`) image_encoder (`SiglipVisionModel`) feature_extractor + (`SiglipImageProcessor`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer + (`HunyuanVideo15Transformer3DModel`) + + Inputs: + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + prompt_embeds (`Tensor`, *optional*): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`, *optional*): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. 
Can be generated from text_encoder step. + prompt_embeds_2 (`Tensor`, *optional*): + Secondary text embeddings from text_encoder_2. Can be generated from text_encoder step. + prompt_embeds_mask_2 (`Tensor`, *optional*): + mask for the secondary text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_2 (`Tensor`, *optional*): + Secondary negative text embeddings from text_encoder_2. Can be generated from text_encoder step. + negative_prompt_embeds_mask_2 (`Tensor`, *optional*): + mask for the secondary negative text embeddings. Can be generated from text_encoder step. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. + image (`Image | list`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + batch_size (`int`, *optional*): + Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt. Can + be generated in input step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + num_frames (`int`, *optional*, defaults to 121): + The number of frames in the generated video. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + image_latents (`Tensor`): + Latents of the encoded reference image. Can be generated from vae_encoder step. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + output_type (`str`, *optional*, defaults to np): + Output format: 'pil', 'np', 'pt'. + + Outputs: + videos (`list`): + The generated videos. 
+ """ + + model_name = "hunyuan-video-1.5" + block_classes = [ + HunyuanVideo15TextEncoderStep, + HunyuanVideo15AutoVaeEncoderStep, + HunyuanVideo15AutoImageEncoderStep, + HunyuanVideo15Image2VideoCoreDenoiseStep, + HunyuanVideo15VaeDecoderStep, + ] + block_names = ["text_encoder", "vae_encoder", "image_encoder", "denoise", "decode"] + + @property + def description(self): + return "Modular pipeline blocks for HunyuanVideo 1.5 image-to-video." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/hunyuan_video1_5/modular_pipeline.py b/src/diffusers/modular_pipelines/hunyuan_video1_5/modular_pipeline.py new file mode 100644 index 000000000000..5b23d8699905 --- /dev/null +++ b/src/diffusers/modular_pipelines/hunyuan_video1_5/modular_pipeline.py @@ -0,0 +1,90 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...loaders import HunyuanVideoLoraLoaderMixin +from ...utils import logging +from ..modular_pipeline import ModularPipeline + + +logger = logging.get_logger(__name__) + + +class HunyuanVideo15ModularPipeline( + ModularPipeline, + HunyuanVideoLoraLoaderMixin, +): + """ + A ModularPipeline for HunyuanVideo 1.5. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. 
+ """ + + default_blocks_name = "HunyuanVideo15AutoBlocks" + + @property + def vae_scale_factor_spatial(self): + return self.vae.spatial_compression_ratio if getattr(self, "vae", None) else 16 + + @property + def vae_scale_factor_temporal(self): + return self.vae.temporal_compression_ratio if getattr(self, "vae", None) else 4 + + @property + def num_channels_latents(self): + return self.vae.config.latent_channels if getattr(self, "vae", None) else 32 + + @property + def target_size(self): + return self.transformer.config.target_size if getattr(self, "transformer", None) else 640 + + @property + def default_aspect_ratio(self): + return (16, 9) + + @property + def vision_num_semantic_tokens(self): + return 729 + + @property + def vision_states_dim(self): + return self.transformer.config.image_embed_dim if getattr(self, "transformer", None) else 1152 + + @property + def tokenizer_max_length(self): + return 1000 + + @property + def tokenizer_2_max_length(self): + return 256 + + # fmt: off + @property + def system_message(self): + return "You are a helpful assistant. Describe the video by detailing the following aspects: \ + 1. The main content and theme of the video. \ + 2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects. \ + 3. Actions, events, behaviors temporal relationships, physical movement changes of the objects. \ + 4. background environment, light, style and atmosphere. \ + 5. camera angles, movements, and transitions used in the video." 
+ # fmt: on + + @property + def prompt_template_encode_start_idx(self): + return 108 + + @property + def requires_unconditional_embeds(self): + if hasattr(self, "guider") and self.guider is not None: + return self.guider._enabled and self.guider.num_conditions > 1 + return False diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index ace89f0d6f91..d00bf716a78f 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -132,6 +132,7 @@ def _helios_pyramid_map_fn(config_dict=None): ("z-image", _create_default_map_fn("ZImageModularPipeline")), ("helios", _create_default_map_fn("HeliosModularPipeline")), ("helios-pyramid", _helios_pyramid_map_fn), + ("hunyuan-video-1.5", _create_default_map_fn("HunyuanVideo15ModularPipeline")), ("ltx", _create_default_map_fn("LTXModularPipeline")), ] ) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 0d4d6d97a05b..938115408314 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -242,6 +242,36 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class HunyuanVideo15AutoBlocks(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class HunyuanVideo15ModularPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def 
from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class LTXAutoBlocks(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/modular_pipelines/hunyuan_video1_5/__init__.py b/tests/modular_pipelines/hunyuan_video1_5/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/modular_pipelines/hunyuan_video1_5/test_modular_pipeline_hunyuan_video1_5.py b/tests/modular_pipelines/hunyuan_video1_5/test_modular_pipeline_hunyuan_video1_5.py new file mode 100644 index 000000000000..6d776eaa1a11 --- /dev/null +++ b/tests/modular_pipelines/hunyuan_video1_5/test_modular_pipeline_hunyuan_video1_5.py @@ -0,0 +1,83 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from diffusers.modular_pipelines import HunyuanVideo15AutoBlocks, HunyuanVideo15ModularPipeline + +from ..test_modular_pipelines_common import ModularPipelineTesterMixin + + +HUNYUANVIDEO15_WORKFLOWS = { + "text2video": [ + ("text_encoder", "HunyuanVideo15TextEncoderStep"), + ("denoise.input", "HunyuanVideo15TextInputStep"), + ("denoise.set_timesteps", "HunyuanVideo15SetTimestepsStep"), + ("denoise.prepare_latents", "HunyuanVideo15PrepareLatentsStep"), + ("denoise.denoise", "HunyuanVideo15DenoiseStep"), + ("decode", "HunyuanVideo15VaeDecoderStep"), + ], + "image2video": [ + ("text_encoder", "HunyuanVideo15TextEncoderStep"), + ("vae_encoder", "HunyuanVideo15VaeEncoderStep"), + ("image_encoder", "HunyuanVideo15ImageEncoderStep"), + ("denoise.input", "HunyuanVideo15TextInputStep"), + ("denoise.set_timesteps", "HunyuanVideo15SetTimestepsStep"), + ("denoise.prepare_latents", "HunyuanVideo15PrepareLatentsStep"), + ("denoise.prepare_i2v_latents", "HunyuanVideo15Image2VideoPrepareLatentsStep"), + ("denoise.denoise", "HunyuanVideo15Image2VideoDenoiseStep"), + ("decode", "HunyuanVideo15VaeDecoderStep"), + ], +} + + +class TestHunyuanVideo15ModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = HunyuanVideo15ModularPipeline + pipeline_blocks_class = HunyuanVideo15AutoBlocks + pretrained_model_name_or_path = "akshan-main/tiny-hunyuanvideo1_5-modular-pipe" + + params = frozenset(["prompt", "height", "width", "num_frames"]) + batch_params = frozenset(["prompt"]) + optional_params = frozenset(["num_inference_steps", "num_videos_per_prompt", "latents"]) + expected_workflow_blocks = HUNYUANVIDEO15_WORKFLOWS + output_name = "videos" + + def get_dummy_inputs(self, seed=0): + generator = self.get_generator(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "height": 32, + "width": 32, + "num_frames": 9, + "output_type": "pt", + } + return inputs + + 
@pytest.mark.skip(reason="num_videos_per_prompt") + def test_num_images_per_prompt(self): + pass + + @pytest.mark.skip(reason="VAE causal attention mask does not support batch>1 decode") + def test_inference_batch_consistent(self): + pass + + @pytest.mark.skip(reason="VAE causal attention mask does not support batch>1 decode") + def test_inference_batch_single_identical(self): + pass + + def test_float16_inference(self): + super().test_float16_inference(expected_max_diff=0.1)