Separate inference 720p video with 24G VRAM #597

Open · wants to merge 5 commits into base: main

46 changes: 46 additions & 0 deletions README.md
@@ -361,6 +361,52 @@ CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 scripts/inference.py config

:warning: **LIMITATION**: Sequence parallelism is not supported for Gradio deployment. For now, sequence parallelism only works when the dimension is divisible by the number of GPUs, so it may fail in some cases. We tested 4 GPUs for 720p and 2 GPUs for 480p.

### Separate Inference of 720p Video with 24G VRAM

The Open-Sora pipeline consists of three main components: the text_encoder, the VAE, and STDiT. By running inference for each component separately, the entire process fits within limited VRAM.

#### Step-by-Step Inference

1. First, run the text_encoder inference and save the text embedding.
2. If you are using a reference image, run the VAE encoder and save the reference latents (optional, image-to-video only).
3. Then, run STDiT with the saved text embedding and save the denoised latents.
4. Finally, run the VAE decoder on the saved latents. A minimal sketch of how the stages hand tensors to each other is shown below.
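
Each stage passes its output to the next one as tensors saved to disk, so each process can exit and release its VRAM before the next stage starts. The sketch below only illustrates the idea; the file paths, helper names, and the assumption that `text_encoder.encode()` returns a dict with `"y"` and `"mask"` entries are ours, not the actual scripts under `scripts/separate_inference/`.

```python
import torch

# Illustrative save locations only; the real scripts manage their own paths and arguments.
EMB_PATH = "save/text_embedding.pt"
REF_LATENT_PATH = "save/reference_latents.pt"
LATENT_PATH = "save/latents.pt"


def run_text_encoder_stage(text_encoder, prompts):
    """Stage 1: encode the prompts once, save the result, then exit so the large T5 is freed."""
    # encode() is assumed to return a dict with "y" (embeddings) and "mask" entries,
    # matching what the scheduler change below consumes as caption_embs / caption_emb_masks.
    model_args = text_encoder.encode(prompts)
    torch.save({"y": model_args["y"].cpu(), "mask": model_args["mask"].cpu()}, EMB_PATH)


def run_vae_encoder_stage(vae, reference_frames):
    """Stage 2 (optional, image-to-video): encode the reference image into latents."""
    torch.save(vae.encode(reference_frames).cpu(), REF_LATENT_PATH)


def run_vae_decoder_stage(vae):
    """Stage 4: decode the latents saved by the STDiT stage back into video frames."""
    latents = torch.load(LATENT_PATH)
    return vae.decode(latents)  # the real VAE wrapper may need extra args (e.g. num_frames)
```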

#### All-in-one script

The basic text2video script is as follows:

```bash
# text to video
./scripts/separate_inference.sh "0,1" 4s 720p "9:16" 7 "a beautiful waterfall"

# Parameter explanations:
# "0,1" : GPU indices to use (e.g., "0,1" uses GPUs 0 and 1)
# 4s : Duration of the video (4 seconds in this case)
# 720p : Resolution of the video (720p)
# "9:16" : Aspect ratio of the video (9:16 vertical format)
# 7 : Aesthetic score (aes) that the generation is conditioned on
# "a beautiful waterfall" : Text prompt describing the video content
```

The basic image2video script is as follows:

1. Generate the reference image with text2video, passing `1` as the frame count so that a single frame is produced.

```bash
# text to image
./scripts/separate_inference.sh "0,1" 1 1080p "9:16" 7 "a beautiful waterfall"
```

2. Generate the video using the reference image.

```bash
# image to video
./scripts/separate_inference.sh "0,1" 4s 720p "9:16" 7 "a beautiful waterfall. {\"reference_path\": \"path2reference.png\",\"mask_strategy\": \"0\"}"
```
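
Here the JSON appended to the prompt tells the STDiT stage how to condition on the image: `reference_path` should point to the frame generated in step 1 (replace `path2reference.png` with its actual path), and `mask_strategy` follows Open-Sora's usual mask-strategy syntax for injecting reference frames into the latent.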

:warning: **LIMITATION**: Because the text_encoder alone requires over 18GB of VRAM, this script currently only supports GPUs with more than 18GB of VRAM. To run it on GPUs with less VRAM, you will need to lower the inference precision of the text_encoder yourself.
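
For example, one way to roughly halve the text encoder's footprint is to load the T5 model in half precision. The snippet below is only a sketch using Hugging Face `transformers`; the checkpoint name, the 300-token limit, and where to hook this into Open-Sora's text_encoder wrapper are assumptions you would adapt to your setup.

```python
import torch
from transformers import AutoTokenizer, T5EncoderModel

# Assumed checkpoint: Open-Sora v1.2 uses a T5-XXL variant such as DeepFloyd/t5-v1_1-xxl.
MODEL_NAME = "DeepFloyd/t5-v1_1-xxl"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# float16 roughly halves the memory of fp32 weights; bfloat16 is an alternative on Ampere+ GPUs.
text_encoder = T5EncoderModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float16).to("cuda").eval()

with torch.no_grad():
    tokens = tokenizer(
        ["a beautiful waterfall"],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=300,  # assumed token limit; match whatever the project's wrapper uses
    ).to("cuda")
    caption_embs = text_encoder(**tokens).last_hidden_state  # (batch, seq_len, hidden)
```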

### GPT-4o Prompt Refinement

We find that GPT-4o can refine the prompt and improve the quality of the generated video. With this feature, you can also write the prompt in other languages (e.g., Chinese). To enable this feature, you need to prepare your OpenAI API key in the environment:
15 changes: 12 additions & 3 deletions opensora/schedulers/rf/__init__.py
@@ -42,16 +42,25 @@ def sample(
         mask=None,
         guidance_scale=None,
         progress=True,
+        caption_embs=None,
+        caption_emb_masks=None,
     ):
         # if no specific guidance scale is provided, use the default scale when initializing the scheduler
         if guidance_scale is None:
             guidance_scale = self.cfg_scale

         n = len(prompts)
         # text encoding
-        model_args = text_encoder.encode(prompts)
-        y_null = text_encoder.null(n)
-        model_args["y"] = torch.cat([model_args["y"], y_null], 0)
+        if text_encoder is not None:
+            model_args = text_encoder.encode(prompts)
+            y_null = text_encoder.null(n)
+            model_args["y"] = torch.cat([model_args["y"], y_null], 0)
+        else:
+            # use precomputed text embeddings from the separate text-encoder stage
+            model_args = dict(mask=caption_emb_masks)
+            y_null = model.y_embedder.y_embedding[None].repeat(n, 1, 1)[:, None]
+            model_args["y"] = torch.cat([caption_embs, y_null], 0)

         if additional_args is not None:
             model_args.update(additional_args)

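With this change, `sample` can run without a text encoder in the process: when `text_encoder is None`, the conditional branch uses the precomputed `caption_embs`/`caption_emb_masks`, and the unconditional branch for classifier-free guidance comes from the model's own learned `y_embedding` instead of `text_encoder.null(n)`. A minimal call sketch follows; the load path and the `"y"`/`"mask"` keys mirror the illustrative save format above and are not part of the PR, while the other arguments assume the existing `sample()` signature used by `scripts/inference.py`.

```python
import torch

def sample_with_saved_embeddings(scheduler, model, z, prompts, device,
                                 emb_path="save/text_embedding.pt"):
    """Drive the scheduler from embeddings saved by the separate text-encoder stage."""
    saved = torch.load(emb_path, map_location=device)
    return scheduler.sample(
        model,
        text_encoder=None,            # takes the new precomputed-embedding branch
        z=z,
        prompts=prompts,              # still required: the scheduler uses len(prompts)
        device=device,
        caption_embs=saved["y"],
        caption_emb_masks=saved["mask"],
    )
```
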
24 changes: 24 additions & 0 deletions scripts/separate_inference.sh
@@ -0,0 +1,24 @@
#!/bin/bash
set_default_params() {
    gpus=${1:-"0,1"}
    num_frames=${2:-"4s"}
    resolution=${3:-"720p"}
    aspect_ratio=${4:-"9:16"}
    aes=${5:-"7"}
    prompt=${6:-"Create a video featuring Will Smith enjoying a plate of spaghetti."}
}

set_default_params "$@"

export CUDA_VISIBLE_DEVICES=$gpus

# Count the requested GPUs to size the torchrun launch.
gpus="${gpus// /}"
IFS=',' read -ra gpu_array <<< "$gpus"
gpu_count=${#gpu_array[@]}

# Stage 1: text encoder inference (saves the text embeddings).
torchrun --nproc_per_node $gpu_count --master_port=23456 scripts/separate_inference/inference_text_encoder.py configs/opensora-v1-2/inference/sample.py --aes $aes --num-frames "$num_frames" --resolution "$resolution" --aspect-ratio "$aspect_ratio" --prompt "$prompt"
# Stage 2 (image-to-video only): VAE encoder inference (saves the reference latents).
if echo "$prompt" | grep -q "reference_path"; then
    torchrun --nproc_per_node $gpu_count --master_port=23456 scripts/separate_inference/inference_vae_encoder.py configs/opensora-v1-2/inference/sample.py --aes $aes --num-frames "$num_frames" --resolution "$resolution" --aspect-ratio "$aspect_ratio" --prompt "$prompt"
fi
# Stage 3: STDiT inference with the saved embeddings (saves the denoised latents).
torchrun --nproc_per_node $gpu_count --master_port=23456 scripts/separate_inference/inference_stdit.py configs/opensora-v1-2/inference/sample.py --aes $aes --num-frames "$num_frames" --resolution "$resolution" --aspect-ratio "$aspect_ratio" --prompt "$prompt"
# Stage 4: VAE decoder inference (turns the saved latents into the final video).
torchrun --nproc_per_node $gpu_count --master_port=23456 scripts/separate_inference/inference_vae_decoder.py configs/opensora-v1-2/inference/sample.py --aes $aes --num-frames "$num_frames" --resolution "$resolution" --aspect-ratio "$aspect_ratio" --prompt "$prompt"
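
Note that the VAE-encoder stage runs only when the prompt contains `reference_path` (the image-to-video case); the text-encoder, STDiT, and VAE-decoder stages always run, each as its own `torchrun` launch, so one component's VRAM is fully released before the next stage starts. If port 23456 is already taken on your machine, adjust `--master_port` accordingly.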