From 212423cc1679b6d3b2e272e408e25416da635d9e Mon Sep 17 00:00:00 2001
From: maturk
Date: Tue, 23 Jul 2024 15:42:54 +0300
Subject: [PATCH 1/6] benchmark

---
 examples/benchmark.py     | 146 ++++++++++++++++++++++++++++++++++++++
 examples/requirements.txt |   1 +
 2 files changed, 147 insertions(+)
 create mode 100644 examples/benchmark.py

diff --git a/examples/benchmark.py b/examples/benchmark.py
new file mode 100644
index 000000000..87b4fa853
--- /dev/null
+++ b/examples/benchmark.py
@@ -0,0 +1,146 @@
+# Benchmark script
+
+import glob
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass, field
+
+import GPUtil
+
+
+@dataclass
+class BenchmarkConfig:
+    """Baseline config"""
+
+    # trainer to run
+    trainer: str = "simple_trainer.py"
+    # path to data
+    data_dir: str = "data/360_v2"
+    # scenes to run
+    scenes: set[str] = (
+        "bicycle",
+        "bonsai",
+        "counter",
+        "garden",
+        "stump",
+        "kitchen",
+        "room",
+    )
+    # downscale factors
+    factors: set[str] = (4, 2, 2, 4, 4, 2, 2)
+    # exclude gpus
+    excluded_gpus: set = field(default_factory=set)
+    # result directory
+    result_dir: str = "results/baseline"
+    # dry run, useful for debugging
+    dry_run: bool = False
+    # extra model specific configs
+    model_configs: dict = field(default_factory=dict)
+
+
+# Configurations to run
+baseline_config = BenchmarkConfig(model_configs={"--max_steps": 1})
+baseline_config_absgrad = BenchmarkConfig(
+    result_dir="results/absgrad",
+    model_configs={"--absgrad": True, "--grow_grad2d": 0.0006},
+)
+baseline_config_antialiased = BenchmarkConfig(
+    result_dir="results/antialiased", model_configs={"--antialiased": True}
+)
+mcmc_config = BenchmarkConfig(
+    trainer="simple_trainer_mcmc.py",
+    result_dir="results/mcmc",
+    model_configs={"--max_steps": 30000},
+)
+
+configs_to_run = [
+    mcmc_config,
+    # baseline_config,
+    # baseline_config_absgrad,
+    # baseline_config_antialiased,
+]
+
+
+def train_scene(gpu, scene, factor, config):
+    # additional user set model configs
+    model_config_args = " ".join(f"{k} {v}" for k, v in config.model_configs.items())
+
+    # train without eval
+    cmd = f"OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES={gpu} python {config.trainer} --eval_steps -1 --disable_viewer --data_factor {factor} --data_dir {config.data_dir}/{scene} --result_dir {config.result_dir}/{scene} {model_config_args}"
+
+    print(cmd)
+    if not config.dry_run:
+        os.system(cmd)
+
+    # eval and render for all the ckpts
+    ckpts = glob.glob(f"{config.result_dir}/{scene}/ckpts/*.pt")
+    for ckpt in ckpts:
+        cmd = f"OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES={gpu} python {config.trainer} --disable_viewer --data_factor {factor} --data_dir {config.data_dir}//{scene} --result_dir {config.result_dir}/{scene} --ckpt {ckpt} {model_config_args}"
+        print(cmd)
+        if not config.dry_run:
+            os.system(cmd)
+
+    return True
+
+
+def worker(gpu, scene, factor, config):
+    print(f"Starting {config.trainer} job on GPU {gpu} with scene {scene}\n")
+    train_scene(gpu, scene, factor, config)
+    print(f"Finished {config.trainer} job on GPU {gpu} with scene {scene}\n")
+    # This worker function starts a job and returns when it's done.
+
+
+def dispatch_jobs(jobs, executor, config):
+    future_to_job = {}
+    reserved_gpus = set()  # GPUs that are slated for work but may not be active yet
+
+    while jobs or future_to_job:
+        # Get the list of available GPUs, not including those that are reserved.
+        all_available_gpus = set(
+            GPUtil.getAvailable(order="first", limit=10, maxMemory=0.1, maxLoad=0.1)
+        )
+        # all_available_gpus = set([0,1,2,3])
+        available_gpus = list(all_available_gpus - reserved_gpus - config.excluded_gpus)
+
+        # Launch new jobs on available GPUs
+        while available_gpus and jobs:
+            gpu = available_gpus.pop(0)
+            job = jobs.pop(0)
+            future = executor.submit(
+                worker, gpu, *job, config
+            )  # Unpacking job as arguments to worker
+            future_to_job[future] = (gpu, job)
+
+            reserved_gpus.add(gpu)  # Reserve this GPU until the job starts processing
+
+        # Check for completed jobs and remove them from the list of running jobs.
+        # Also, release the GPUs they were using.
+        done_futures = [future for future in future_to_job if future.done()]
+        for future in done_futures:
+            job = future_to_job.pop(
+                future
+            )  # Remove the job associated with the completed future
+            gpu = job[0]  # The GPU is the first element in each job tuple
+            reserved_gpus.discard(gpu)  # Release this GPU
+            print(f"Job {job} has finished., releasing GPU {gpu}")
+        # (Optional) You might want to introduce a small delay here to prevent this loop from spinning very fast
+        # when there are no GPUs available.
+        time.sleep(5)
+
+    print("All jobs have been processed.")
+
+
+def main():
+    """Launch batch_configs in serial"""
+    for config in configs_to_run:
+        # num jobs = num scenes to run for batch_config
+        jobs = list(zip(config.scenes, config.factors))
+        print(jobs)
+        # Using ThreadPoolExecutor to manage the thread pool
+        with ThreadPoolExecutor(max_workers=8) as executor:
+            dispatch_jobs(jobs, executor, config)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/requirements.txt b/examples/requirements.txt
index 73fa4f50a..462b6f71a 100644
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -16,3 +16,4 @@ opencv-python
 tyro
 Pillow
 tensorboard
+GPUtil

From 01f95f9b149f5e238afd8752fb6e6471d702a586 Mon Sep 17 00:00:00 2001
From: maturk
Date: Tue, 23 Jul 2024 15:49:09 +0300
Subject: [PATCH 2/6] update req

---
 examples/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/requirements.txt b/examples/requirements.txt
index 462b6f71a..1ea3f6045 100644
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -17,3 +17,4 @@ tyro
 Pillow
 tensorboard
 GPUtil
+tyro

From 141ebe6ad2552277523822c28404666fd9653686 Mon Sep 17 00:00:00 2001
From: maturk
Date: Tue, 23 Jul 2024 16:22:39 +0300
Subject: [PATCH 3/6] update

---
 examples/benchmark.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/benchmark.py b/examples/benchmark.py
index 87b4fa853..ccb5d65fb 100644
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -18,7 +18,7 @@ class BenchmarkConfig:
     # path to data
     data_dir: str = "data/360_v2"
     # scenes to run
-    scenes: set[str] = (
+    scenes: set = (
         "bicycle",
         "bonsai",
         "counter",
@@ -28,7 +28,7 @@ class BenchmarkConfig:
         "room",
     )
     # downscale factors
-    factors: set[str] = (4, 2, 2, 4, 4, 2, 2)
+    factors: set = (4, 2, 2, 4, 4, 2, 2)
     # exclude gpus
     excluded_gpus: set = field(default_factory=set)
     # result directory
@@ -40,7 +40,7 @@ class BenchmarkConfig:
 
 
 # Configurations to run
-baseline_config = BenchmarkConfig(model_configs={"--max_steps": 1})
+baseline_config = BenchmarkConfig(model_configs={"--max_steps": 30000})
 baseline_config_absgrad = BenchmarkConfig(
     result_dir="results/absgrad",
     model_configs={"--absgrad": True, "--grow_grad2d": 0.0006},

From ac748c5bb487885ae208453b7eb17a6a949dba69 Mon Sep 17 00:00:00 2001
From: maturk
Date: Wed, 24 Jul 2024 01:07:33 +0300
Subject: [PATCH 4/6] cleanup benchmark.py

---
 examples/benchmark.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/examples/benchmark.py b/examples/benchmark.py
index ccb5d65fb..b73efae1d 100644
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -11,7 +11,7 @@
 
 @dataclass
 class BenchmarkConfig:
-    """Baseline config"""
+    """Benchmark config"""
 
     # trainer to run
     trainer: str = "simple_trainer.py"
@@ -40,29 +40,29 @@ class BenchmarkConfig:
 
 
 # Configurations to run
-baseline_config = BenchmarkConfig(model_configs={"--max_steps": 30000})
-baseline_config_absgrad = BenchmarkConfig(
+baseline_config = BenchmarkConfig()
+absgrad_config = BenchmarkConfig(
     result_dir="results/absgrad",
     model_configs={"--absgrad": True, "--grow_grad2d": 0.0006},
 )
-baseline_config_antialiased = BenchmarkConfig(
+antialiased_config = BenchmarkConfig(
     result_dir="results/antialiased", model_configs={"--antialiased": True}
 )
 mcmc_config = BenchmarkConfig(
     trainer="simple_trainer_mcmc.py",
     result_dir="results/mcmc",
-    model_configs={"--max_steps": 30000},
 )
 
 configs_to_run = [
-    mcmc_config,
-    # baseline_config,
-    # baseline_config_absgrad,
-    # baseline_config_antialiased,
+    baseline_config,
+    # mcmc_config,
+    # absgrad_config,
+    # antialiased_config,
 ]
 
 
 def train_scene(gpu, scene, factor, config):
+    """Train a single scene with config on current gpu"""
     # additional user set model configs
     model_config_args = " ".join(f"{k} {v}" for k, v in config.model_configs.items())
 
@@ -85,10 +85,10 @@ def train_scene(gpu, scene, factor, config):
 
 
 def worker(gpu, scene, factor, config):
+    """This worker function starts a job and returns when it's done."""
     print(f"Starting {config.trainer} job on GPU {gpu} with scene {scene}\n")
     train_scene(gpu, scene, factor, config)
     print(f"Finished {config.trainer} job on GPU {gpu} with scene {scene}\n")
-    # This worker function starts a job and returns when it's done.
 
 
 def dispatch_jobs(jobs, executor, config):
@@ -100,7 +100,6 @@ def dispatch_jobs(jobs, executor, config):
         all_available_gpus = set(
             GPUtil.getAvailable(order="first", limit=10, maxMemory=0.1, maxLoad=0.1)
         )
-        # all_available_gpus = set([0,1,2,3])
         available_gpus = list(all_available_gpus - reserved_gpus - config.excluded_gpus)
 
         # Launch new jobs on available GPUs
@@ -111,7 +110,6 @@ def dispatch_jobs(jobs, executor, config):
                 worker, gpu, *job, config
             )  # Unpacking job as arguments to worker
             future_to_job[future] = (gpu, job)
-
             reserved_gpus.add(gpu)  # Reserve this GPU until the job starts processing
 
         # Check for completed jobs and remove them from the list of running jobs.
@@ -132,11 +130,13 @@ def dispatch_jobs(jobs, executor, config):
 
 
 def main():
-    """Launch batch_configs in serial"""
+    """Launch batch_configs in serial but process each config in parallel (multi gpu)"""
+
     for config in configs_to_run:
-        # num jobs = num scenes to run for batch_config
+        # num jobs = num scenes to run for current config
         jobs = list(zip(config.scenes, config.factors))
-        print(jobs)
+
+        # Run multiple gpu train scripts
         # Using ThreadPoolExecutor to manage the thread pool
         with ThreadPoolExecutor(max_workers=8) as executor:
             dispatch_jobs(jobs, executor, config)

From b81a58850f09d9d8202e3960749aa2befa138834 Mon Sep 17 00:00:00 2001
From: maturk
Date: Wed, 24 Jul 2024 13:00:15 +0300
Subject: [PATCH 5/6] fix absgrad antialaiased

---
 examples/benchmark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/benchmark.py b/examples/benchmark.py
index b73efae1d..075b3de3d 100644
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -43,10 +43,10 @@ class BenchmarkConfig:
 baseline_config = BenchmarkConfig()
 absgrad_config = BenchmarkConfig(
     result_dir="results/absgrad",
-    model_configs={"--absgrad": True, "--grow_grad2d": 0.0006},
+    model_configs={"--absgrad": "", "--grow_grad2d": 0.0006},
 )
 antialiased_config = BenchmarkConfig(
-    result_dir="results/antialiased", model_configs={"--antialiased": True}
+    result_dir="results/antialiased", model_configs={"--antialiased": ""}
 )
 mcmc_config = BenchmarkConfig(
     trainer="simple_trainer_mcmc.py",

From 9e3322501f8dacc10c3fe6a24ba564212c44665f Mon Sep 17 00:00:00 2001
From: maturk
Date: Thu, 8 Aug 2024 21:15:56 +0300
Subject: [PATCH 6/6] benchmark script

---
 examples/benchmark.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/examples/benchmark.py b/examples/benchmark.py
index 075b3de3d..967746a32 100644
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -11,7 +11,7 @@
 
 @dataclass
 class BenchmarkConfig:
-    """Benchmark config"""
+    """Baseline benchmark config"""
 
     # trainer to run
     trainer: str = "simple_trainer.py"
@@ -39,7 +39,7 @@ class BenchmarkConfig:
     model_configs: dict = field(default_factory=dict)
 
 
-# Configurations to run
+# Configurations of different GSPLAT options
 baseline_config = BenchmarkConfig()
 absgrad_config = BenchmarkConfig(
     result_dir="results/absgrad",
@@ -53,11 +53,12 @@ class BenchmarkConfig:
     result_dir="results/mcmc",
 )
 
+# Configs to run
 configs_to_run = [
     baseline_config,
-    # mcmc_config,
-    # absgrad_config,
-    # antialiased_config,
+    mcmc_config,
+    absgrad_config,
+    antialiased_config,
 ]
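
For readers skimming the series, the scheduling pattern that dispatch_jobs() introduces in PATCH 1 and tidies up in PATCH 4 reduces to the short sketch below. This is an illustration only and is not part of the patches: the run_job() stand-in, the hard-coded GPU ids, and the job list are invented for the example, whereas the real script discovers GPUs with GPUtil.getAvailable() and launches simple_trainer.py through os.system().

# Minimal, standalone sketch of the dispatch pattern: jobs wait in a queue,
# each free GPU takes one job, and the GPU returns to the pool when the
# corresponding future completes. GPU ids and run_job() are made up here.
import time
from concurrent.futures import ThreadPoolExecutor


def run_job(gpu, scene, factor):
    # Placeholder for train_scene(): pretend to train, then report the scene.
    print(f"GPU {gpu}: training {scene} (factor {factor})")
    time.sleep(1)
    return scene


jobs = [("bicycle", 4), ("bonsai", 2), ("counter", 2)]
free_gpus = [0, 1]  # assumed GPU ids for the sketch
running = {}  # future -> gpu

with ThreadPoolExecutor(max_workers=len(free_gpus)) as pool:
    while jobs or running:
        # Hand out jobs while both a GPU and a job are available.
        while free_gpus and jobs:
            gpu, job = free_gpus.pop(0), jobs.pop(0)
            running[pool.submit(run_job, gpu, *job)] = gpu
        # Reclaim GPUs from finished jobs.
        for fut in [f for f in running if f.done()]:
            free_gpus.append(running.pop(fut))
            print(f"finished {fut.result()}")
        time.sleep(0.1)

The same poll-and-release loop is what lets a single invocation of benchmark.py keep every visible GPU busy until all (scene, factor) pairs for the selected configs have been trained.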