From a7841141721dfb2bfac48db19f7b365ad7665d8b Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Sat, 11 Jan 2025 08:55:07 +0530
Subject: [PATCH] Apply suggestions from code review

---
 docs/source/en/using-diffusers/text-img2vid.md                 | 2 +-
 src/diffusers/models/transformers/transformer_hunyuan_video.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/using-diffusers/text-img2vid.md b/docs/source/en/using-diffusers/text-img2vid.md
index b02c8b723803..92e740bb579d 100644
--- a/docs/source/en/using-diffusers/text-img2vid.md
+++ b/docs/source/en/using-diffusers/text-img2vid.md
@@ -70,7 +70,7 @@ export_to_video(video, "output.mp4", fps=8)
 > [!TIP]
 > HunyuanVideo is a 13B parameter model and requires a lot of memory. Refer to the HunyuanVideo [Quantization](../api/pipelines/hunyuan_video#quantization) guide to learn how to quantize the model. CogVideoX and LTX-Video are more lightweight options that can still generate high-quality videos.
 
-[HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo) features a dual-stream to single-stream diffusion transformer (DiT) for learning video and text tokens separately, and then subsequently concatenating the video and text tokens to combine their information. A single multimodal large language model (MLLM) serves as the text encoder, and videos are also spatio-temporally compressed with a 3D causal VAE.
+[HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo) features a dual-stream to single-stream diffusion transformer (DiT) for learning video and text tokens separately, and then subsequently concatenating the video and text tokens to combine their information. A single multimodal large language model (MLLM) serves as the text encoder, and videos are also spatio-temporally compressed with a 3D causal VAE.
 
 ```py
 import torch
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
index 910d24c8b0f9..f9069411fbdb 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -505,7 +505,7 @@ def forward(
 class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
     r"""
     A Transformer model for video-like data used in
-    [HunyuanVideo](https://huggingface.co/hunyuanvideo-community/HunyuanVideo).
+    [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo).
 
     Args:
         in_channels (`int`, defaults to `16`):
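
The `py` fence whose opening lines appear as context in the first hunk continues in the guide. Below is a minimal sketch of that usage, assuming diffusers' `HunyuanVideoPipeline` and `HunyuanVideoTransformer3DModel` API and the Diffusers-format checkpoint `hunyuanvideo-community/HunyuanVideo` (the patched link above points at the original `tencent/HunyuanVideo` weights, which are not in Diffusers format); the prompt and generation parameters are illustrative, not taken from the patch.

```py
# Minimal sketch, not part of the patch: loading HunyuanVideo with diffusers.
# The checkpoint id, prompt, and generation parameters are assumptions.
import torch
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

# Load the 13B transformer in bf16 to reduce memory (see the TIP in the patched doc
# about quantization for further savings).
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo", subfolder="transformer", torch_dtype=torch.bfloat16
)
pipe = HunyuanVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo", transformer=transformer, torch_dtype=torch.float16
)
pipe.vae.enable_tiling()  # tile the 3D causal VAE decode to keep memory bounded
pipe.to("cuda")

video = pipe(
    prompt="A cat walks on the grass, realistic style.",
    num_frames=61,
    num_inference_steps=30,
).frames[0]
export_to_video(video, "output.mp4", fps=15)
```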