From fc83d365f2b781ab05aeb94b13f7e97417df7d97 Mon Sep 17 00:00:00 2001
From: wbhu
Date: Mon, 25 Nov 2024 12:31:30 +0800
Subject: [PATCH] [Release] v1.0.1

- improve the performance
- improve efficiency
---
 README.md             | 226 +++++++++++++++++++++++++++++++++++++++---
 app.py                |  38 ++++---
 depthcrafter/utils.py |  44 ++++++++
 run.py                |  52 +---------
 4 files changed, 279 insertions(+), 81 deletions(-)

diff --git a/README.md b/README.md
index 1ad4108..46d2684 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,210 @@
----
-title: DepthCrafter
-emoji: 🦀
-colorFrom: purple
-colorTo: pink
-sdk: gradio
-sdk_version: 4.44.0
-app_file: app.py
-pinned: true
-license: other
-thumbnail: >-
-  https://cdn-uploads.huggingface.co/production/uploads/657a7458afbb0117ba15c59f/n81BOzDvx-nPqGADmNWxD.png
-short_description: a super consistent video depth model
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
\ No newline at end of file
+## ___***DepthCrafter: Generating Consistent Long Depth Sequences for Open-world Videos***___
_**[Wenbo Hu<sup>1* †</sup>](https://wbhu.github.io),
[Xiangjun Gao<sup>2*</sup>](https://scholar.google.com/citations?user=qgdesEcAAAAJ&hl=en),
[Xiaoyu Li<sup>1* †</sup>](https://xiaoyu258.github.io),
[Sijie Zhao<sup>1</sup>](https://scholar.google.com/citations?user=tZ3dS3MAAAAJ&hl=en),
[Xiaodong Cun<sup>1</sup>](https://vinthony.github.io/academic),
[Yong Zhang<sup>1</sup>](https://yzhang2016.github.io),
[Long Quan<sup>2</sup>](https://home.cse.ust.hk/~quan),
[Ying Shan<sup>3, 1</sup>](https://scholar.google.com/citations?user=4oXBp9UAAAAJ&hl=en)**_

<sup>1</sup>Tencent AI Lab
<sup>2</sup>The Hong Kong University of Science and Technology
<sup>3</sup>ARC Lab, Tencent PCG

arXiv preprint, 2024

## 🔆 Introduction
🤗 If you find DepthCrafter useful, **please help ⭐ this repo**, which matters a lot for open-source projects. Thanks!

🔥 DepthCrafter generates temporally consistent long depth sequences with fine-grained details for open-world videos, without requiring additional information such as camera poses or optical flow.

- `[24-11-26]` 🚀🚀🚀 DepthCrafter v1.0.1 is now released, with improved quality and speed.
- `[24-10-19]` 🤗🤗🤗 DepthCrafter has now been integrated into [ComfyUI](https://github.com/akatz-ai/ComfyUI-DepthCrafter-Nodes)!
- `[24-10-08]` 🤗🤗🤗 DepthCrafter has now been integrated into [Nuke](https://github.com/Theo-SAMINADIN-td/NukeDepthCrafter), have a try!
- `[24-09-28]` Add full dataset inference and evaluation scripts for easier comparison. :-)
- `[24-09-25]` 🤗🤗🤗 Add Hugging Face online demo [DepthCrafter](https://huggingface.co/spaces/tencent/DepthCrafter).
- `[24-09-19]` Add scripts for preparing benchmark datasets.
- `[24-09-18]` Add point cloud sequence visualization.
- `[24-09-14]` 🔥🔥🔥 **DepthCrafter** is now released, have fun!

## 📦 Release Notes
- **DepthCrafter v1.0.1**:
  - Improved quality and speed; see the benchmark table below.
| Method | ms/frame↓ @1024×576 | Sintel (~50 frames) AbsRel↓ / δ₁↑ | Scannet (90 frames) AbsRel↓ / δ₁↑ | KITTI (110 frames) AbsRel↓ / δ₁↑ | Bonn (110 frames) AbsRel↓ / δ₁↑ |
| --- | --- | --- | --- | --- | --- |
| Marigold | 1070.29 | 0.532 / 0.515 | 0.166 / 0.769 | 0.149 / 0.796 | 0.091 / 0.931 |
| Depth-Anything-V2 | 180.46 | 0.367 / 0.554 | 0.135 / 0.822 | 0.140 / 0.804 | 0.106 / 0.921 |
| DepthCrafter (previous) | 1913.92 | 0.292 / 0.697 | 0.125 / 0.848 | 0.110 / 0.881 | 0.075 / 0.971 |
| DepthCrafter v1.0.1 | 465.84 | 0.270 / 0.697 | 0.123 / 0.856 | 0.104 / 0.896 | 0.071 / 0.972 |
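For reference, AbsRel and δ₁ above are the standard depth-estimation metrics, restated here for convenience (the exact alignment and evaluation protocol are the ones defined in the paper); $d_i$ and $d_i^{*}$ denote predicted and ground-truth depth over $N$ valid pixels:

```math
\mathrm{AbsRel} = \frac{1}{N}\sum_{i=1}^{N}\frac{\lvert d_i - d_i^{*}\rvert}{d_i^{*}},
\qquad
\delta_1 = \frac{1}{N}\sum_{i=1}^{N}\mathbf{1}\!\left[\max\!\left(\frac{d_i}{d_i^{*}},\,\frac{d_i^{*}}{d_i}\right) < 1.25\right]
```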
## 🎥 Visualization
We provide demos of unprojected point cloud sequences, together with the reference RGB and the estimated depth videos.
Please refer to our [project page](https://depthcrafter.github.io) for more details.

https://github.com/user-attachments/assets/62141cc8-04d0-458f-9558-fe50bc04cc21

## 🚀 Quick Start

### 🤖 Gradio Demo
- Online demo: [DepthCrafter](https://huggingface.co/spaces/tencent/DepthCrafter)
- Local demo:
  ```bash
  gradio app.py
  ```

### 🌟 Community Support
- [NukeDepthCrafter](https://github.com/Theo-SAMINADIN-td/NukeDepthCrafter):
  a plugin that lets you generate temporally consistent depth sequences inside Nuke,
  which is widely used in the VFX industry.
- [ComfyUI-Nodes](https://github.com/akatz-ai/ComfyUI-DepthCrafter-Nodes): nodes for creating consistent depth maps for your videos with DepthCrafter in ComfyUI.

### 🛠️ Installation
1. Clone this repo:
```bash
git clone https://github.com/Tencent/DepthCrafter.git
```
2. Install dependencies (please refer to [requirements.txt](requirements.txt)):
```bash
pip install -r requirements.txt
```

### 🤗 Model Zoo
[DepthCrafter](https://huggingface.co/tencent/DepthCrafter) is available on the Hugging Face Model Hub.

### 🏃‍♂️ Inference
For programmatic use, `run.py` is the reference entry point; a hedged Python sketch of the same flow is included just before the Citation section.

#### 1. High-resolution inference (requires a GPU with ~26GB memory for 1024×576 resolution):
- ~2.1 fps on A100, recommended for high-quality results:

  ```bash
  python run.py --video-path examples/example_01.mp4
  ```

#### 2. Low-resolution inference (requires a GPU with ~9GB memory for 512×256 resolution):
- ~8.6 fps on A100:

  ```bash
  python run.py --video-path examples/example_01.mp4 --max-res 512
  ```

## 🚀 Dataset Evaluation
Please check the `benchmark` folder.
- To create the datasets we use in the paper, run `dataset_extract/dataset_extract_${dataset_name}.py`.
- This produces `csv` files that record the relative paths of the extracted RGB videos and depth `npz` files. We also provide these `csv` files.
- Inference script for all datasets:
  ```bash
  bash benchmark/infer/infer.sh
  ```
  (Remember to replace `input_rgb_root` and `saved_root` with your own paths.)
- Evaluation script for all datasets:
  ```bash
  bash benchmark/eval/eval.sh
  ```
  (Remember to replace `pred_disp_root` and `gt_disp_root` with your own paths.)

## 🤝 Contributing
- Issues and pull requests are welcome.
- Contributions that optimize inference speed and memory usage, e.g., through model quantization, distillation, or other acceleration techniques, are especially welcome.
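Complementing the `run.py` commands in the Inference section above, the sketch below strings together the pieces that appear in this repo (`read_video_frames`, `vis_sequence_depth`, `save_video`, the DepthCrafter UNet, and `DepthCrafterPipeline`). The `from_pretrained` identifiers, the pipeline keyword arguments, the return unpacking, and the post-processing are illustrative assumptions rather than the exact calls made by `run.py`; treat `run.py` as the authoritative implementation.

```python
import torch

from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
from depthcrafter.utils import read_video_frames, save_video, vis_sequence_depth

# Assumed checkpoint locations: the DepthCrafter UNet from the Hugging Face Hub and an
# SVD base pipeline; see run.py for the authoritative paths, dtypes, and variants.
unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
    "tencent/DepthCrafter", torch_dtype=torch.float16
)
pipe = DepthCrafterPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",  # assumed base pipeline
    unet=unet,
    torch_dtype=torch.float16,
)
pipe.to("cuda")
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception:
    print("Xformers is not enabled")  # mirrors the fallback in run.py
pipe.enable_attention_slicing()

# Read and resample the input video; -1 keeps all frames / the original FPS
# (signature taken from depthcrafter/utils.py).
frames, fps = read_video_frames(
    "examples/example_01.mp4", process_length=-1, target_fps=-1, max_res=1024
)

# Sliding-window denoising; the keyword names and the v1.0.1 demo defaults
# (5 steps, CFG 1.0, window 110, overlap 25) are assumptions based on app.py.
with torch.inference_mode():
    res = pipe(
        frames,
        height=frames.shape[1],
        width=frames.shape[2],
        num_inference_steps=5,
        guidance_scale=1.0,
        window_size=110,
        overlap=25,
        output_type="np",
    ).frames[0]

# Reduce the 3-channel output to one channel and normalize to [0, 1] (assumed post-processing).
res = res.mean(-1)
res = (res - res.min()) / (res.max() - res.min())

# Colorize and save at the (possibly resampled) FPS; save_video argument order is assumed.
vis = vis_sequence_depth(res)
save_video(vis, "demo_output/example_01_vis.mp4", fps=fps)
```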
+ +## 📜 Citation +If you find this work helpful, please consider citing: +```bibtex +@article{hu2024-DepthCrafter, + author = {Hu, Wenbo and Gao, Xiangjun and Li, Xiaoyu and Zhao, Sijie and Cun, Xiaodong and Zhang, Yong and Quan, Long and Shan, Ying}, + title = {DepthCrafter: Generating Consistent Long Depth Sequences for Open-world Videos}, + journal = {arXiv preprint arXiv:2409.02095}, + year = {2024} + } +``` diff --git a/app.py b/app.py index b80ec41..26d9615 100644 --- a/app.py +++ b/app.py @@ -17,11 +17,11 @@ from depthcrafter.utils import read_video_frames, vis_sequence_depth, save_video examples = [ - ["examples/example_01.mp4", 10, 1.2, 1024, 60], - ["examples/example_02.mp4", 10, 1.2, 1024, 60], - ["examples/example_03.mp4", 10, 1.2, 1024, 60], - ["examples/example_04.mp4", 10, 1.2, 1024, 60], - ["examples/example_05.mp4", 10, 1.2, 1024, 60], + ["examples/example_01.mp4", 5, 1.0, 1024, -1, -1], + ["examples/example_02.mp4", 5, 1.0, 1024, -1, -1], + ["examples/example_03.mp4", 5, 1.0, 1024, -1, -1], + ["examples/example_04.mp4", 5, 1.0, 1024, -1, -1], + ["examples/example_05.mp4", 5, 1.0, 1024, -1, -1], ] @@ -39,18 +39,18 @@ pipe.to("cuda") -@spaces.GPU(duration=140) +@spaces.GPU(duration=120) def infer_depth( video: str, num_denoising_steps: int, guidance_scale: float, max_res: int = 1024, - process_length: int = 195, + process_length: int = -1, + target_fps: int = -1, # save_folder: str = "./demo_output", window_size: int = 110, overlap: int = 25, - target_fps: int = 15, seed: int = 42, track_time: bool = True, save_npz: bool = False, @@ -59,7 +59,6 @@ def infer_depth( pipe.enable_xformers_memory_efficient_attention() frames, target_fps = read_video_frames(video, process_length, target_fps, max_res) - print(f"==> video name: {video}, frames shape: {frames.shape}") # inference the depth map using the DepthCrafter pipeline with torch.inference_mode(): @@ -82,6 +81,7 @@ def infer_depth( vis = vis_sequence_depth(res) # save the depth map and visualization with the target FPS save_path = os.path.join(save_folder, os.path.splitext(os.path.basename(video))[0]) + print(f"==> saving results to {save_path}") os.makedirs(os.path.dirname(save_path), exist_ok=True) if save_npz: np.savez_compressed(save_path + ".npz", depth=res) @@ -155,14 +155,14 @@ def construct_demo(): label="num denoising steps", minimum=1, maximum=25, - value=10, + value=5, step=1, ) guidance_scale = gr.Slider( label="cfg scale", minimum=1.0, maximum=1.2, - value=1.2, + value=1.0, step=0.1, ) max_res = gr.Slider( @@ -174,11 +174,18 @@ def construct_demo(): ) process_length = gr.Slider( label="process length", - minimum=1, + minimum=-1, maximum=280, value=60, step=1, ) + process_target_fps = gr.Slider( + label="target FPS", + minimum=-1, + maximum=30, + value=15, + step=1, + ) generate_btn = gr.Button("Generate") with gr.Column(scale=2): pass @@ -191,6 +198,7 @@ def construct_demo(): guidance_scale, max_res, process_length, + process_target_fps, ], outputs=[output_video_1, output_video_2], fn=infer_depth, @@ -216,6 +224,7 @@ def construct_demo(): guidance_scale, max_res, process_length, + process_target_fps, ], outputs=[output_video_1, output_video_2], ) @@ -223,9 +232,8 @@ def construct_demo(): return depthcrafter_iface -demo = construct_demo() - if __name__ == "__main__": + demo = construct_demo() demo.queue() - # demo.launch(server_name="0.0.0.0", server_port=80, debug=True) + # demo.launch(server_name="0.0.0.0", server_port=12345, debug=True, share=False) demo.launch(share=True) diff --git a/depthcrafter/utils.py 
b/depthcrafter/utils.py index 977e21b..2ac50e8 100644 --- a/depthcrafter/utils.py +++ b/depthcrafter/utils.py @@ -5,6 +5,50 @@ import matplotlib.cm as cm import mediapy import torch +from decord import VideoReader, cpu + +dataset_res_dict = { + "sintel": [448, 1024], + "scannet": [640, 832], + "KITTI": [384, 1280], + "bonn": [512, 640], + "NYUv2": [448, 640], +} + + +def read_video_frames(video_path, process_length, target_fps, max_res, dataset="open"): + if dataset == "open": + print("==> processing video: ", video_path) + vid = VideoReader(video_path, ctx=cpu(0)) + print("==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:])) + original_height, original_width = vid.get_batch([0]).shape[1:3] + height = round(original_height / 64) * 64 + width = round(original_width / 64) * 64 + if max(height, width) > max_res: + scale = max_res / max(original_height, original_width) + height = round(original_height * scale / 64) * 64 + width = round(original_width * scale / 64) * 64 + else: + height = dataset_res_dict[dataset][0] + width = dataset_res_dict[dataset][1] + + vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height) + + fps = vid.get_avg_fps() if target_fps == -1 else target_fps + stride = round(vid.get_avg_fps() / fps) + stride = max(stride, 1) + frames_idx = list(range(0, len(vid), stride)) + print( + f"==> downsampled shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}, with stride: {stride}" + ) + if process_length != -1 and process_length < len(frames_idx): + frames_idx = frames_idx[:process_length] + print( + f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}" + ) + frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0 + + return frames, fps def save_video( diff --git a/run.py b/run.py index 25384de..bd44917 100644 --- a/run.py +++ b/run.py @@ -3,21 +3,12 @@ import numpy as np import torch -from decord import VideoReader, cpu from diffusers.training_utils import set_seed from fire import Fire from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter -from depthcrafter.utils import vis_sequence_depth, save_video - -dataset_res_dict = { - "sintel": [448, 1024], - "scannet": [640, 832], - "KITTI": [384, 1280], - "bonn": [512, 640], - "NYUv2": [448, 640], -} +from depthcrafter.utils import vis_sequence_depth, save_video, read_video_frames class DepthCrafterDemo: @@ -59,45 +50,6 @@ def __init__( print("Xformers is not enabled") self.pipe.enable_attention_slicing() - @staticmethod - def read_video_frames( - video_path, process_length, target_fps, max_res, dataset="open" - ): - if dataset == "open": - print("==> processing video: ", video_path) - vid = VideoReader(video_path, ctx=cpu(0)) - print( - "==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:]) - ) - original_height, original_width = vid.get_batch([0]).shape[1:3] - height = round(original_height / 64) * 64 - width = round(original_width / 64) * 64 - if max(height, width) > max_res: - scale = max_res / max(original_height, original_width) - height = round(original_height * scale / 64) * 64 - width = round(original_width * scale / 64) * 64 - else: - height = dataset_res_dict[dataset][0] - width = dataset_res_dict[dataset][1] - - vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height) - - fps = vid.get_avg_fps() if target_fps == -1 else target_fps - stride = round(vid.get_avg_fps() / fps) - stride = max(stride, 1) - frames_idx = list(range(0, 
len(vid), stride)) - print( - f"==> downsampled shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}, with stride: {stride}" - ) - if process_length != -1 and process_length < len(frames_idx): - frames_idx = frames_idx[:process_length] - print( - f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}" - ) - frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0 - - return frames, fps - def infer( self, video: str, @@ -116,7 +68,7 @@ def infer( ): set_seed(seed) - frames, target_fps = self.read_video_frames( + frames, target_fps = read_video_frames( video, process_length, target_fps,
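As a usage note on the `read_video_frames` helper that this patch relocates into `depthcrafter/utils.py` (a sketch under assumptions, not part of the patch; the clip path is hypothetical): passing one of the keys of `dataset_res_dict` pins the decode resolution to the per-dataset size, while the default `dataset="open"` rounds the input to multiples of 64 under `max_res`, and the `-1` sentinels keep all frames and the original FPS.

```python
from depthcrafter.utils import read_video_frames

# Benchmark mode: dataset="KITTI" selects the fixed 384x1280 size from dataset_res_dict,
# bypassing the max_res rounding used for open-world videos.
frames, fps = read_video_frames(
    "path/to/kitti_clip.mp4",  # hypothetical example path
    process_length=110,        # keep at most 110 frames, matching the benchmark table
    target_fps=-1,             # -1: keep the original frame rate (stride 1)
    max_res=1024,              # not used when a dataset is given, but must still be supplied
    dataset="KITTI",
)
print(frames.shape, fps)       # (T, 384, 1280, 3), float32 in [0, 1]
```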