From fc83d365f2b781ab05aeb94b13f7e97417df7d97 Mon Sep 17 00:00:00 2001
From: wbhu
Date: Mon, 25 Nov 2024 12:31:30 +0800
Subject: [PATCH] [Release] v1.0.1

- improve the performance
- improve efficiency
---
 README.md             | 226 +++++++++++++++++++++++++++++++++++++++---
 app.py                |  38 ++++---
 depthcrafter/utils.py |  44 ++++++++
 run.py                |  52 +---------
 4 files changed, 279 insertions(+), 81 deletions(-)

diff --git a/README.md b/README.md
index 1ad4108..46d2684 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,210 @@
----
-title: DepthCrafter
-emoji: 🦀
-colorFrom: purple
-colorTo: pink
-sdk: gradio
-sdk_version: 4.44.0
-app_file: app.py
-pinned: true
-license: other
-thumbnail: >-
-  https://cdn-uploads.huggingface.co/production/uploads/657a7458afbb0117ba15c59f/n81BOzDvx-nPqGADmNWxD.png
-short_description: a super consistent video depth model
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
\ No newline at end of file
+## ___***DepthCrafter: Generating Consistent Long Depth Sequences for Open-world Videos***___
_**[Wenbo Hu<sup>1* †</sup>](https://wbhu.github.io),
[Xiangjun Gao<sup>2*</sup>](https://scholar.google.com/citations?user=qgdesEcAAAAJ&hl=en),
[Xiaoyu Li<sup>1* †</sup>](https://xiaoyu258.github.io),
[Sijie Zhao<sup>1</sup>](https://scholar.google.com/citations?user=tZ3dS3MAAAAJ&hl=en),
[Xiaodong Cun<sup>1</sup>](https://vinthony.github.io/academic),
[Yong Zhang<sup>1</sup>](https://yzhang2016.github.io),
[Long Quan<sup>2</sup>](https://home.cse.ust.hk/~quan),
[Ying Shan<sup>3, 1</sup>](https://scholar.google.com/citations?user=4oXBp9UAAAAJ&hl=en)**_

<sup>1</sup>Tencent AI Lab
<sup>2</sup>The Hong Kong University of Science and Technology
<sup>3</sup>ARC Lab, Tencent PCG

arXiv preprint, 2024

## 🔆 Introduction
🤗 If you find DepthCrafter useful, **please help ⭐ this repo**, which matters a lot for open-source projects. Thanks!

🔥 DepthCrafter generates temporally consistent long depth sequences with fine-grained details for open-world videos, without requiring additional information such as camera poses or optical flow.

- `[24-11-26]` 🚀🚀🚀 DepthCrafter v1.0.1 is now released, with improved quality and speed.
- `[24-10-19]` 🤗🤗🤗 DepthCrafter has now been integrated into [ComfyUI](https://github.com/akatz-ai/ComfyUI-DepthCrafter-Nodes)!
- `[24-10-08]` 🤗🤗🤗 DepthCrafter has now been integrated into [Nuke](https://github.com/Theo-SAMINADIN-td/NukeDepthCrafter), have a try!
- `[24-09-28]` Add full dataset inference and evaluation scripts for easier comparison. :-)
- `[24-09-25]` 🤗🤗🤗 Add Hugging Face online demo [DepthCrafter](https://huggingface.co/spaces/tencent/DepthCrafter).
- `[24-09-19]` Add scripts for preparing benchmark datasets.
- `[24-09-18]` Add point cloud sequence visualization.
- `[24-09-14]` 🔥🔥🔥 **DepthCrafter** is now released, have fun!

## 📦 Release Notes
- **DepthCrafter v1.0.1**:
  - Improved quality and speed; see the benchmark table below.
| Method | ms/frame↓ @1024×576 | Sintel (~50 frames) AbsRel↓ / δ₁↑ | Scannet (90 frames) AbsRel↓ / δ₁↑ | KITTI (110 frames) AbsRel↓ / δ₁↑ | Bonn (110 frames) AbsRel↓ / δ₁↑ |
| --- | --- | --- | --- | --- | --- |
| Marigold | 1070.29 | 0.532 / 0.515 | 0.166 / 0.769 | 0.149 / 0.796 | 0.091 / 0.931 |
| Depth-Anything-V2 | 180.46 | 0.367 / 0.554 | 0.135 / 0.822 | 0.140 / 0.804 | 0.106 / 0.921 |
| DepthCrafter (previous) | 1913.92 | 0.292 / 0.697 | 0.125 / 0.848 | 0.110 / 0.881 | 0.075 / 0.971 |
| DepthCrafter v1.0.1 | 465.84 | 0.270 / 0.697 | 0.123 / 0.856 | 0.104 / 0.896 | 0.071 / 0.972 |
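For reference, AbsRel and δ₁ above are the standard depth-estimation metrics, restated here for convenience (the exact alignment and evaluation protocol are the ones defined in the paper); $d_i$ and $d_i^{*}$ denote predicted and ground-truth depth over $N$ valid pixels:

```math
\mathrm{AbsRel} = \frac{1}{N}\sum_{i=1}^{N}\frac{\lvert d_i - d_i^{*}\rvert}{d_i^{*}},
\qquad
\delta_1 = \frac{1}{N}\sum_{i=1}^{N}\mathbf{1}\!\left[\max\!\left(\frac{d_i}{d_i^{*}},\,\frac{d_i^{*}}{d_i}\right) < 1.25\right]
```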
## 🎥 Visualization
We provide demos of unprojected point cloud sequences, together with the reference RGB and the estimated depth videos.
Please refer to our [project page](https://depthcrafter.github.io) for more details.

https://github.com/user-attachments/assets/62141cc8-04d0-458f-9558-fe50bc04cc21

## 🚀 Quick Start

### 🤖 Gradio Demo
- Online demo: [DepthCrafter](https://huggingface.co/spaces/tencent/DepthCrafter)
- Local demo:
  ```bash
  gradio app.py
  ```

### 🌟 Community Support
- [NukeDepthCrafter](https://github.com/Theo-SAMINADIN-td/NukeDepthCrafter):
  a plugin that lets you generate temporally consistent depth sequences inside Nuke,
  which is widely used in the VFX industry.
- [ComfyUI-Nodes](https://github.com/akatz-ai/ComfyUI-DepthCrafter-Nodes): nodes for creating consistent depth maps for your videos with DepthCrafter in ComfyUI.

### 🛠️ Installation
1. Clone this repo:
```bash
git clone https://github.com/Tencent/DepthCrafter.git
```
2. Install dependencies (please refer to [requirements.txt](requirements.txt)):
```bash
pip install -r requirements.txt
```

### 🤗 Model Zoo
[DepthCrafter](https://huggingface.co/tencent/DepthCrafter) is available on the Hugging Face Model Hub.

### 🏃‍♂️ Inference
For programmatic use, `run.py` is the reference entry point; a hedged Python sketch of the same flow is included just before the Citation section.

#### 1. High-resolution inference (requires a GPU with ~26GB memory for 1024×576 resolution):
- ~2.1 fps on A100, recommended for high-quality results:

  ```bash
  python run.py --video-path examples/example_01.mp4
  ```

#### 2. Low-resolution inference (requires a GPU with ~9GB memory for 512×256 resolution):
- ~8.6 fps on A100:

  ```bash
  python run.py --video-path examples/example_01.mp4 --max-res 512
  ```

## 🚀 Dataset Evaluation
Please check the `benchmark` folder.
- To create the datasets we use in the paper, run `dataset_extract/dataset_extract_${dataset_name}.py`.
- This produces `csv` files that record the relative paths of the extracted RGB videos and depth `npz` files. We also provide these `csv` files.
- Inference script for all datasets:
  ```bash
  bash benchmark/infer/infer.sh
  ```
  (Remember to replace `input_rgb_root` and `saved_root` with your own paths.)
- Evaluation script for all datasets:
  ```bash
  bash benchmark/eval/eval.sh
  ```
  (Remember to replace `pred_disp_root` and `gt_disp_root` with your own paths.)

## 🤝 Contributing
- Issues and pull requests are welcome.
- Contributions that optimize inference speed and memory usage, e.g., through model quantization, distillation, or other acceleration techniques, are especially welcome.
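Complementing the `run.py` commands in the Inference section above, the sketch below strings together the pieces that appear in this repo (`read_video_frames`, `vis_sequence_depth`, `save_video`, the DepthCrafter UNet, and `DepthCrafterPipeline`). The `from_pretrained` identifiers, the pipeline keyword arguments, the return unpacking, and the post-processing are illustrative assumptions rather than the exact calls made by `run.py`; treat `run.py` as the authoritative implementation.

```python
import torch

from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
from depthcrafter.utils import read_video_frames, save_video, vis_sequence_depth

# Assumed checkpoint locations: the DepthCrafter UNet from the Hugging Face Hub and an
# SVD base pipeline; see run.py for the authoritative paths, dtypes, and variants.
unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
    "tencent/DepthCrafter", torch_dtype=torch.float16
)
pipe = DepthCrafterPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",  # assumed base pipeline
    unet=unet,
    torch_dtype=torch.float16,
)
pipe.to("cuda")
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception:
    print("Xformers is not enabled")  # mirrors the fallback in run.py
pipe.enable_attention_slicing()

# Read and resample the input video; -1 keeps all frames / the original FPS
# (signature taken from depthcrafter/utils.py).
frames, fps = read_video_frames(
    "examples/example_01.mp4", process_length=-1, target_fps=-1, max_res=1024
)

# Sliding-window denoising; the keyword names and the v1.0.1 demo defaults
# (5 steps, CFG 1.0, window 110, overlap 25) are assumptions based on app.py.
with torch.inference_mode():
    res = pipe(
        frames,
        height=frames.shape[1],
        width=frames.shape[2],
        num_inference_steps=5,
        guidance_scale=1.0,
        window_size=110,
        overlap=25,
        output_type="np",
    ).frames[0]

# Reduce the 3-channel output to one channel and normalize to [0, 1] (assumed post-processing).
res = res.mean(-1)
res = (res - res.min()) / (res.max() - res.min())

# Colorize and save at the (possibly resampled) FPS; save_video argument order is assumed.
vis = vis_sequence_depth(res)
save_video(vis, "demo_output/example_01_vis.mp4", fps=fps)
```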
+ +## 📜 Citation +If you find this work helpful, please consider citing: +```bibtex +@article{hu2024-DepthCrafter, + author = {Hu, Wenbo and Gao, Xiangjun and Li, Xiaoyu and Zhao, Sijie and Cun, Xiaodong and Zhang, Yong and Quan, Long and Shan, Ying}, + title = {DepthCrafter: Generating Consistent Long Depth Sequences for Open-world Videos}, + journal = {arXiv preprint arXiv:2409.02095}, + year = {2024} + } +``` diff --git a/app.py b/app.py index b80ec41..26d9615 100644 --- a/app.py +++ b/app.py @@ -17,11 +17,11 @@ from depthcrafter.utils import read_video_frames, vis_sequence_depth, save_video examples = [ - ["examples/example_01.mp4", 10, 1.2, 1024, 60], - ["examples/example_02.mp4", 10, 1.2, 1024, 60], - ["examples/example_03.mp4", 10, 1.2, 1024, 60], - ["examples/example_04.mp4", 10, 1.2, 1024, 60], - ["examples/example_05.mp4", 10, 1.2, 1024, 60], + ["examples/example_01.mp4", 5, 1.0, 1024, -1, -1], + ["examples/example_02.mp4", 5, 1.0, 1024, -1, -1], + ["examples/example_03.mp4", 5, 1.0, 1024, -1, -1], + ["examples/example_04.mp4", 5, 1.0, 1024, -1, -1], + ["examples/example_05.mp4", 5, 1.0, 1024, -1, -1], ] @@ -39,18 +39,18 @@ pipe.to("cuda") -@spaces.GPU(duration=140) +@spaces.GPU(duration=120) def infer_depth( video: str, num_denoising_steps: int, guidance_scale: float, max_res: int = 1024, - process_length: int = 195, + process_length: int = -1, + target_fps: int = -1, # save_folder: str = "./demo_output", window_size: int = 110, overlap: int = 25, - target_fps: int = 15, seed: int = 42, track_time: bool = True, save_npz: bool = False, @@ -59,7 +59,6 @@ def infer_depth( pipe.enable_xformers_memory_efficient_attention() frames, target_fps = read_video_frames(video, process_length, target_fps, max_res) - print(f"==> video name: {video}, frames shape: {frames.shape}") # inference the depth map using the DepthCrafter pipeline with torch.inference_mode(): @@ -82,6 +81,7 @@ def infer_depth( vis = vis_sequence_depth(res) # save the depth map and visualization with the target FPS save_path = os.path.join(save_folder, os.path.splitext(os.path.basename(video))[0]) + print(f"==> saving results to {save_path}") os.makedirs(os.path.dirname(save_path), exist_ok=True) if save_npz: np.savez_compressed(save_path + ".npz", depth=res) @@ -155,14 +155,14 @@ def construct_demo(): label="num denoising steps", minimum=1, maximum=25, - value=10, + value=5, step=1, ) guidance_scale = gr.Slider( label="cfg scale", minimum=1.0, maximum=1.2, - value=1.2, + value=1.0, step=0.1, ) max_res = gr.Slider( @@ -174,11 +174,18 @@ def construct_demo(): ) process_length = gr.Slider( label="process length", - minimum=1, + minimum=-1, maximum=280, value=60, step=1, ) + process_target_fps = gr.Slider( + label="target FPS", + minimum=-1, + maximum=30, + value=15, + step=1, + ) generate_btn = gr.Button("Generate") with gr.Column(scale=2): pass @@ -191,6 +198,7 @@ def construct_demo(): guidance_scale, max_res, process_length, + process_target_fps, ], outputs=[output_video_1, output_video_2], fn=infer_depth, @@ -216,6 +224,7 @@ def construct_demo(): guidance_scale, max_res, process_length, + process_target_fps, ], outputs=[output_video_1, output_video_2], ) @@ -223,9 +232,8 @@ def construct_demo(): return depthcrafter_iface -demo = construct_demo() - if __name__ == "__main__": + demo = construct_demo() demo.queue() - # demo.launch(server_name="0.0.0.0", server_port=80, debug=True) + # demo.launch(server_name="0.0.0.0", server_port=12345, debug=True, share=False) demo.launch(share=True) diff --git a/depthcrafter/utils.py 
b/depthcrafter/utils.py index 977e21b..2ac50e8 100644 --- a/depthcrafter/utils.py +++ b/depthcrafter/utils.py @@ -5,6 +5,50 @@ import matplotlib.cm as cm import mediapy import torch +from decord import VideoReader, cpu + +dataset_res_dict = { + "sintel": [448, 1024], + "scannet": [640, 832], + "KITTI": [384, 1280], + "bonn": [512, 640], + "NYUv2": [448, 640], +} + + +def read_video_frames(video_path, process_length, target_fps, max_res, dataset="open"): + if dataset == "open": + print("==> processing video: ", video_path) + vid = VideoReader(video_path, ctx=cpu(0)) + print("==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:])) + original_height, original_width = vid.get_batch([0]).shape[1:3] + height = round(original_height / 64) * 64 + width = round(original_width / 64) * 64 + if max(height, width) > max_res: + scale = max_res / max(original_height, original_width) + height = round(original_height * scale / 64) * 64 + width = round(original_width * scale / 64) * 64 + else: + height = dataset_res_dict[dataset][0] + width = dataset_res_dict[dataset][1] + + vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height) + + fps = vid.get_avg_fps() if target_fps == -1 else target_fps + stride = round(vid.get_avg_fps() / fps) + stride = max(stride, 1) + frames_idx = list(range(0, len(vid), stride)) + print( + f"==> downsampled shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}, with stride: {stride}" + ) + if process_length != -1 and process_length < len(frames_idx): + frames_idx = frames_idx[:process_length] + print( + f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}" + ) + frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0 + + return frames, fps def save_video( diff --git a/run.py b/run.py index 25384de..bd44917 100644 --- a/run.py +++ b/run.py @@ -3,21 +3,12 @@ import numpy as np import torch -from decord import VideoReader, cpu from diffusers.training_utils import set_seed from fire import Fire from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter -from depthcrafter.utils import vis_sequence_depth, save_video - -dataset_res_dict = { - "sintel": [448, 1024], - "scannet": [640, 832], - "KITTI": [384, 1280], - "bonn": [512, 640], - "NYUv2": [448, 640], -} +from depthcrafter.utils import vis_sequence_depth, save_video, read_video_frames class DepthCrafterDemo: @@ -59,45 +50,6 @@ def __init__( print("Xformers is not enabled") self.pipe.enable_attention_slicing() - @staticmethod - def read_video_frames( - video_path, process_length, target_fps, max_res, dataset="open" - ): - if dataset == "open": - print("==> processing video: ", video_path) - vid = VideoReader(video_path, ctx=cpu(0)) - print( - "==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:]) - ) - original_height, original_width = vid.get_batch([0]).shape[1:3] - height = round(original_height / 64) * 64 - width = round(original_width / 64) * 64 - if max(height, width) > max_res: - scale = max_res / max(original_height, original_width) - height = round(original_height * scale / 64) * 64 - width = round(original_width * scale / 64) * 64 - else: - height = dataset_res_dict[dataset][0] - width = dataset_res_dict[dataset][1] - - vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height) - - fps = vid.get_avg_fps() if target_fps == -1 else target_fps - stride = round(vid.get_avg_fps() / fps) - stride = max(stride, 1) - frames_idx = list(range(0, 
len(vid), stride)) - print( - f"==> downsampled shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}, with stride: {stride}" - ) - if process_length != -1 and process_length < len(frames_idx): - frames_idx = frames_idx[:process_length] - print( - f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}" - ) - frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0 - - return frames, fps - def infer( self, video: str, @@ -116,7 +68,7 @@ def infer( ): set_seed(seed) - frames, target_fps = self.read_video_frames( + frames, target_fps = read_video_frames( video, process_length, target_fps,
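As a usage note on the `read_video_frames` helper that this patch relocates into `depthcrafter/utils.py` (a sketch under assumptions, not part of the patch; the clip path is hypothetical): passing one of the keys of `dataset_res_dict` pins the decode resolution to the per-dataset size, while the default `dataset="open"` rounds the input to multiples of 64 under `max_res`, and the `-1` sentinels keep all frames and the original FPS.

```python
from depthcrafter.utils import read_video_frames

# Benchmark mode: dataset="KITTI" selects the fixed 384x1280 size from dataset_res_dict,
# bypassing the max_res rounding used for open-world videos.
frames, fps = read_video_frames(
    "path/to/kitti_clip.mp4",  # hypothetical example path
    process_length=110,        # keep at most 110 frames, matching the benchmark table
    target_fps=-1,             # -1: keep the original frame rate (stride 1)
    max_res=1024,              # not used when a dataset is given, but must still be supplied
    dataset="KITTI",
)
print(frames.shape, fps)       # (T, 384, 1280, 3), float32 in [0, 1]
```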