diff --git a/.github/workflows/core_code_checks.yml b/.github/workflows/core_code_checks.yml index 0f1fcf0caf..556432bfc2 100644 --- a/.github/workflows/core_code_checks.yml +++ b/.github/workflows/core_code_checks.yml @@ -33,9 +33,9 @@ jobs: run: | python ./nerfstudio/scripts/docs/add_nb_tags.py --check - name: Run Ruff Linter - run: ruff check docs/ nerfstudio/ tests/ + run: ruff check docs/ nerfstudio/ tests/ --output-format=github - name: Run Ruff Formatter - run: ruff format docs/ nerfstudio/ tests/ --check + run: ruff format docs/ nerfstudio/ tests/ --diff - name: Run Pyright run: | pyright diff --git a/nerfstudio/cameras/camera_utils.py b/nerfstudio/cameras/camera_utils.py index 7c98ef080c..9c1ee02200 100644 --- a/nerfstudio/cameras/camera_utils.py +++ b/nerfstudio/cameras/camera_utils.py @@ -720,7 +720,7 @@ def fisheye624_unproject_helper(uv, params, max_iters: int = 5): function so this solves an optimization problem using Newton's method to get the inverse. Inputs: - uv: BxNx3 tensor of 2D pixels to be projected + uv: BxNx2 tensor of 2D pixels to be unprojected params: Bx16 tensor of Fisheye624 parameters formatted like this: [f_u f_v c_u c_v {k_0 ... k_5} {p_0 p_1} {s_0 s_1 s_2 s_3}] or Bx15 tensor of Fisheye624 parameters formatted like this: diff --git a/nerfstudio/cameras/cameras.py b/nerfstudio/cameras/cameras.py index 4202c8c273..e390360b5e 100644 --- a/nerfstudio/cameras/cameras.py +++ b/nerfstudio/cameras/cameras.py @@ -864,7 +864,7 @@ def _compute_rays_for_vr180( assert distortion_params is not None masked_coords = pcoord_stack[coord_mask, :] - # The fisheye unprojection does not rely on planar/pinhold unprojection, thus the method needs + # The fisheye unprojection does not rely on planar/pinhole unprojection, thus the method needs # to access the focal length and principle points directly. 
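# Aside (illustration only, not part of the patch): per the corrected docstring above,
# fisheye624_unproject_helper takes a BxNx2 tensor of pixel coordinates plus a Bx16 parameter
# tensor laid out as [f_u f_v c_u c_v k_0..k_5 p_0 p_1 s_0..s_3]. The numbers below are
# placeholders, not a real calibration.
import torch
from nerfstudio.cameras.camera_utils import fisheye624_unproject_helper

uv = torch.tensor([[[320.0, 240.0], [100.0, 50.0]]])        # shape (B=1, N=2, 2)
params = torch.zeros((1, 16))                                # Bx16 Fisheye624 parameters
params[0, :4] = torch.tensor([300.0, 300.0, 320.0, 240.0])   # f_u, f_v, c_u, c_v
rays = fisheye624_unproject_helper(uv, params, max_iters=5)  # BxNx3 rays solved via Newton's method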
camera_params = torch.cat( [ diff --git a/nerfstudio/configs/method_configs.py b/nerfstudio/configs/method_configs.py index 55ba44eb41..f60450163b 100644 --- a/nerfstudio/configs/method_configs.py +++ b/nerfstudio/configs/method_configs.py @@ -31,7 +31,6 @@ from nerfstudio.data.datamanagers.parallel_datamanager import ParallelDataManagerConfig from nerfstudio.data.datamanagers.random_cameras_datamanager import RandomCamerasDataManagerConfig from nerfstudio.data.dataparsers.blender_dataparser import BlenderDataParserConfig -from nerfstudio.data.dataparsers.colmap_dataparser import ColmapDataParserConfig from nerfstudio.data.dataparsers.dnerf_dataparser import DNeRFDataParserConfig from nerfstudio.data.dataparsers.instant_ngp_dataparser import InstantNGPDataParserConfig from nerfstudio.data.dataparsers.nerfstudio_dataparser import NerfstudioDataParserConfig @@ -600,7 +599,7 @@ gradient_accumulation_steps={"camera_opt": 100}, pipeline=VanillaPipelineConfig( datamanager=FullImageDatamanagerConfig( - dataparser=ColmapDataParserConfig(load_3D_points=True), + dataparser=NerfstudioDataParserConfig(load_3D_points=True), ), model=GaussianSplattingModelConfig(), ), diff --git a/nerfstudio/data/datamanagers/base_datamanager.py b/nerfstudio/data/datamanagers/base_datamanager.py index 210ce5757d..570c94ad4c 100644 --- a/nerfstudio/data/datamanagers/base_datamanager.py +++ b/nerfstudio/data/datamanagers/base_datamanager.py @@ -471,8 +471,8 @@ def _get_pixel_sampler(self, dataset: TDataset, num_rays_per_batch: int) -> Pixe CONSOLE.print("[bold yellow]Warning: Some cameras are equirectangular, but using default pixel sampler.") fisheye_crop_radius = None - if dataset.cameras.metadata is not None and "fisheye_crop_radius" in dataset.cameras.metadata: - fisheye_crop_radius = dataset.cameras.metadata["fisheye_crop_radius"] + if dataset.cameras.metadata is not None: + fisheye_crop_radius = dataset.cameras.metadata.get("fisheye_crop_radius") return self.config.pixel_sampler.setup( is_equirectangular=is_equirectangular, diff --git a/nerfstudio/data/datamanagers/full_images_datamanager.py b/nerfstudio/data/datamanagers/full_images_datamanager.py index a08a3bbf9f..d0fe74cef2 100644 --- a/nerfstudio/data/datamanagers/full_images_datamanager.py +++ b/nerfstudio/data/datamanagers/full_images_datamanager.py @@ -34,6 +34,7 @@ from torch.nn import Parameter from tqdm import tqdm +from nerfstudio.cameras.camera_utils import fisheye624_project, fisheye624_unproject_helper from nerfstudio.cameras.cameras import Cameras, CameraType from nerfstudio.configs.dataparser_configs import AnnotatedDataParserUnion from nerfstudio.data.datamanagers.base_datamanager import DataManager, DataManagerConfig, TDataset @@ -139,83 +140,11 @@ def cache_images(self, cache_images_option): continue distortion_params = camera.distortion_params.numpy() image = data["image"].numpy() - if camera.camera_type.item() == CameraType.PERSPECTIVE.value: - distortion_params = np.array( - [ - distortion_params[0], - distortion_params[1], - distortion_params[4], - distortion_params[5], - distortion_params[2], - distortion_params[3], - 0, - 0, - ] - ) - if np.any(distortion_params): - newK, roi = cv2.getOptimalNewCameraMatrix(K, distortion_params, (image.shape[1], image.shape[0]), 0) - image = cv2.undistort(image, K, distortion_params, None, newK) # type: ignore - else: - newK = K - roi = 0, 0, image.shape[1], image.shape[0] - # crop the image and update the intrinsics accordingly - x, y, w, h = roi - image = image[y : y + h, x : x + w] - if "depth_image" in 
data: - data["depth_image"] = data["depth_image"][y : y + h, x : x + w] - # update the width, height - self.train_dataset.cameras.width[i] = w - self.train_dataset.cameras.height[i] = h - if "mask" in data: - mask = data["mask"].numpy() - mask = mask.astype(np.uint8) * 255 - if np.any(distortion_params): - mask = cv2.undistort(mask, K, distortion_params, None, newK) # type: ignore - mask = mask[y : y + h, x : x + w] - data["mask"] = torch.from_numpy(mask).bool() - K = newK - - elif camera.camera_type.item() == CameraType.FISHEYE.value: - distortion_params = np.array( - [ - distortion_params[0], - distortion_params[1], - distortion_params[2], - distortion_params[3], - ] - ) - newK = cv2.fisheye.estimateNewCameraMatrixForUndistortRectify( - K, - distortion_params, - (image.shape[1], image.shape[0]), - np.eye(3), - balance=0, - ) - map1, map2 = cv2.fisheye.initUndistortRectifyMap( - K, - distortion_params, - np.eye(3), - newK, - (image.shape[1], image.shape[0]), - cv2.CV_32FC1, - ) - # and then remap: - image = cv2.remap( - image, - map1, - map2, - interpolation=cv2.INTER_LINEAR, - borderMode=cv2.BORDER_CONSTANT, - ) - if "mask" in data: - mask = data["mask"].numpy() - mask = mask.astype(np.uint8) * 255 - mask = cv2.fisheye.undistortImage(mask, K, distortion_params, None, newK) - data["mask"] = torch.from_numpy(mask).bool() - K = newK - else: - raise NotImplementedError("Only perspective and fisheye cameras are supported") + + K, image, mask = _undistort_image(camera, distortion_params, data, image, K) data["image"] = torch.from_numpy(image) + if mask is not None: + data["mask"] = mask cached_train.append(data) @@ -223,6 +152,8 @@ def cache_images(self, cache_images_option): self.train_dataset.cameras.fy[i] = float(K[1, 1]) self.train_dataset.cameras.cx[i] = float(K[0, 2]) self.train_dataset.cameras.cy[i] = float(K[1, 2]) + self.train_dataset.cameras.width[i] = image.shape[1] + self.train_dataset.cameras.height[i] = image.shape[0] CONSOLE.log("Caching / undistorting eval images") for i in tqdm(range(len(self.eval_dataset)), leave=False): @@ -235,81 +166,11 @@ def cache_images(self, cache_images_option): continue distortion_params = camera.distortion_params.numpy() image = data["image"].numpy() - if camera.camera_type.item() == CameraType.PERSPECTIVE.value: - distortion_params = np.array( - [ - distortion_params[0], - distortion_params[1], - distortion_params[4], - distortion_params[5], - distortion_params[2], - distortion_params[3], - 0, - 0, - ] - ) - if np.any(distortion_params): - newK, roi = cv2.getOptimalNewCameraMatrix(K, distortion_params, (image.shape[1], image.shape[0]), 0) - image = cv2.undistort(image, K, distortion_params, None, newK) # type: ignore - else: - newK = K - roi = 0, 0, image.shape[1], image.shape[0] - # crop the image and update the intrinsics accordingly - x, y, w, h = roi - image = image[y : y + h, x : x + w] - # update the width, height - self.eval_dataset.cameras.width[i] = w - self.eval_dataset.cameras.height[i] = h - if "mask" in data: - mask = data["mask"].numpy() - mask = mask.astype(np.uint8) * 255 - if np.any(distortion_params): - mask = cv2.undistort(mask, K, distortion_params, None, newK) # type: ignore - mask = mask[y : y + h, x : x + w] - data["mask"] = torch.from_numpy(mask).bool() - K = newK - - elif camera.camera_type.item() == CameraType.FISHEYE.value: - distortion_params = np.array( - [ - distortion_params[0], - distortion_params[1], - distortion_params[2], - distortion_params[3], - ] - ) - newK = 
cv2.fisheye.estimateNewCameraMatrixForUndistortRectify( - K, - distortion_params, - (image.shape[1], image.shape[0]), - np.eye(3), - balance=0, - ) - map1, map2 = cv2.fisheye.initUndistortRectifyMap( - K, - distortion_params, - np.eye(3), - newK, - (image.shape[1], image.shape[0]), - cv2.CV_32FC1, - ) - # and then remap: - image = cv2.remap( - image, - map1, - map2, - interpolation=cv2.INTER_LINEAR, - borderMode=cv2.BORDER_CONSTANT, - ) - if "mask" in data: - mask = data["mask"].numpy() - mask = mask.astype(np.uint8) * 255 - mask = cv2.fisheye.undistortImage(mask, K, distortion_params, None, newK) - data["mask"] = torch.from_numpy(mask).bool() - K = newK - else: - raise NotImplementedError("Only perspective and fisheye cameras are supported") + + K, image, mask = _undistort_image(camera, distortion_params, data, image, K) data["image"] = torch.from_numpy(image) + if mask is not None: + data["mask"] = mask cached_eval.append(data) @@ -317,6 +178,8 @@ def cache_images(self, cache_images_option): self.eval_dataset.cameras.fy[i] = float(K[1, 1]) self.eval_dataset.cameras.cx[i] = float(K[0, 2]) self.eval_dataset.cameras.cy[i] = float(K[1, 2]) + self.eval_dataset.cameras.width[i] = image.shape[1] + self.eval_dataset.cameras.height[i] = image.shape[0] if cache_images_option == "gpu": for cache in cached_train: @@ -461,3 +324,156 @@ def next_eval_image(self, step: int) -> Tuple[Cameras, Dict]: assert len(self.eval_dataset.cameras.shape) == 1, "Assumes single batch dimension" camera = self.eval_dataset.cameras[image_idx : image_idx + 1].to(self.device) return camera, data + + +def _undistort_image( + camera: Cameras, distortion_params: np.ndarray, data: dict, image: np.ndarray, K: np.ndarray +) -> Tuple[np.ndarray, np.ndarray, Optional[torch.Tensor]]: + mask = None + if camera.camera_type.item() == CameraType.PERSPECTIVE.value: + distortion_params = np.array( + [ + distortion_params[0], + distortion_params[1], + distortion_params[4], + distortion_params[5], + distortion_params[2], + distortion_params[3], + 0, + 0, + ] + ) + if np.any(distortion_params): + newK, roi = cv2.getOptimalNewCameraMatrix(K, distortion_params, (image.shape[1], image.shape[0]), 0) + image = cv2.undistort(image, K, distortion_params, None, newK) # type: ignore + else: + newK = K + roi = 0, 0, image.shape[1], image.shape[0] + # crop the image and update the intrinsics accordingly + x, y, w, h = roi + image = image[y : y + h, x : x + w] + if "depth_image" in data: + data["depth_image"] = data["depth_image"][y : y + h, x : x + w] + if "mask" in data: + mask = data["mask"].numpy() + mask = mask.astype(np.uint8) * 255 + if np.any(distortion_params): + mask = cv2.undistort(mask, K, distortion_params, None, newK) # type: ignore + mask = mask[y : y + h, x : x + w] + mask = torch.from_numpy(mask).bool() + K = newK + + elif camera.camera_type.item() == CameraType.FISHEYE.value: + distortion_params = np.array( + [distortion_params[0], distortion_params[1], distortion_params[2], distortion_params[3]] + ) + newK = cv2.fisheye.estimateNewCameraMatrixForUndistortRectify( + K, distortion_params, (image.shape[1], image.shape[0]), np.eye(3), balance=0 + ) + map1, map2 = cv2.fisheye.initUndistortRectifyMap( + K, distortion_params, np.eye(3), newK, (image.shape[1], image.shape[0]), cv2.CV_32FC1 + ) + # and then remap: + image = cv2.remap(image, map1, map2, interpolation=cv2.INTER_LINEAR) + if "mask" in data: + mask = data["mask"].numpy() + mask = mask.astype(np.uint8) * 255 + mask = cv2.fisheye.undistortImage(mask, K, distortion_params, None, 
newK) + mask = torch.from_numpy(mask).bool() + K = newK + elif camera.camera_type.item() == CameraType.FISHEYE624.value: + fisheye624_params = torch.cat( + [camera.fx, camera.fy, camera.cx, camera.cy, torch.from_numpy(distortion_params)], dim=0 + ) + assert fisheye624_params.shape == (16,) + assert ( + "mask" not in data + and camera.metadata is not None + and "fisheye_crop_radius" in camera.metadata + and isinstance(camera.metadata["fisheye_crop_radius"], float) + ) + fisheye_crop_radius = camera.metadata["fisheye_crop_radius"] + + # Approximate the FOV of the unmasked region of the camera. + upper, lower, left, right = fisheye624_unproject_helper( + torch.tensor( + [ + [camera.cx, camera.cy - fisheye_crop_radius], + [camera.cx, camera.cy + fisheye_crop_radius], + [camera.cx - fisheye_crop_radius, camera.cy], + [camera.cx + fisheye_crop_radius, camera.cy], + ], + dtype=torch.float32, + )[None], + params=fisheye624_params[None], + ).squeeze(dim=0) + fov_radians = torch.max( + torch.acos(torch.sum(upper * lower / torch.linalg.norm(upper) / torch.linalg.norm(lower))), + torch.acos(torch.sum(left * right / torch.linalg.norm(left) / torch.linalg.norm(right))), + ) + + # Heuristics to determine parameters of an undistorted image. + undist_h = int(fisheye_crop_radius * 2) + undist_w = int(fisheye_crop_radius * 2) + undistort_focal = undist_h / (2 * torch.tan(fov_radians / 2.0)) + undist_K = torch.eye(3) + undist_K[0, 0] = undistort_focal # fx + undist_K[1, 1] = undistort_focal # fy + undist_K[0, 2] = (undist_w - 1) / 2.0 # cx; for a 1x1 image, center should be at (0, 0). + undist_K[1, 2] = (undist_h - 1) / 2.0 # cy + + # Undistorted 2D coordinates -> rays -> reproject to distorted UV coordinates. + undist_uv_homog = torch.stack( + [ + *torch.meshgrid( + torch.arange(undist_w, dtype=torch.float32), + torch.arange(undist_h, dtype=torch.float32), + ), + torch.ones((undist_w, undist_h), dtype=torch.float32), + ], + dim=-1, + ) + assert undist_uv_homog.shape == (undist_w, undist_h, 3) + dist_uv = ( + fisheye624_project( + xyz=( + torch.einsum( + "ij,bj->bi", + torch.linalg.inv(undist_K), + undist_uv_homog.reshape((undist_w * undist_h, 3)), + )[None] + ), + params=fisheye624_params[None, :], + ) + .reshape((undist_w, undist_h, 2)) + .numpy() + ) + map1 = dist_uv[..., 1] + map2 = dist_uv[..., 0] + + # Use correspondence to undistort image. + image = cv2.remap(image, map1, map2, interpolation=cv2.INTER_LINEAR) + + # Compute undistorted mask as well. + dist_h = camera.height.item() + dist_w = camera.width.item() + mask = np.mgrid[:dist_h, :dist_w] + mask[0, ...] -= dist_h // 2 + mask[1, ...] 
-= dist_w // 2 + mask = np.linalg.norm(mask, axis=0) < fisheye_crop_radius + mask = torch.from_numpy( + cv2.remap( + mask.astype(np.uint8) * 255, + map1, + map2, + interpolation=cv2.INTER_LINEAR, + borderMode=cv2.BORDER_CONSTANT, + borderValue=0, + ) + / 255.0 + ).bool() + K = undist_K.numpy() + else: + raise NotImplementedError("Only perspective and fisheye cameras are supported") + + return K, image, mask diff --git a/nerfstudio/data/datamanagers/parallel_datamanager.py b/nerfstudio/data/datamanagers/parallel_datamanager.py index 9f36807a61..bd66d01db8 100644 --- a/nerfstudio/data/datamanagers/parallel_datamanager.py +++ b/nerfstudio/data/datamanagers/parallel_datamanager.py @@ -198,8 +198,15 @@ def _get_pixel_sampler(self, dataset: TDataset, num_rays_per_batch: int) -> Pixe is_equirectangular = (dataset.cameras.camera_type == CameraType.EQUIRECTANGULAR.value).all() if is_equirectangular.any(): CONSOLE.print("[bold yellow]Warning: Some cameras are equirectangular, but using default pixel sampler.") + + fisheye_crop_radius = None + if dataset.cameras.metadata is not None: + fisheye_crop_radius = dataset.cameras.metadata.get("fisheye_crop_radius") + return self.config.pixel_sampler.setup( - is_equirectangular=is_equirectangular, num_rays_per_batch=num_rays_per_batch + is_equirectangular=is_equirectangular, + num_rays_per_batch=num_rays_per_batch, + fisheye_crop_radius=fisheye_crop_radius, ) def setup_train(self): diff --git a/nerfstudio/data/dataparsers/colmap_dataparser.py b/nerfstudio/data/dataparsers/colmap_dataparser.py index a5a9685cf6..5f5917e703 100644 --- a/nerfstudio/data/dataparsers/colmap_dataparser.py +++ b/nerfstudio/data/dataparsers/colmap_dataparser.py @@ -67,8 +67,8 @@ class ColmapDataParserConfig(DataParserConfig): assume_colmap_world_coordinate_convention: bool = True """Colmap optimized world often have y direction of the first camera pointing towards down direction, while nerfstudio world set z direction to be up direction for viewer. Therefore, we usually need to apply an extra - transform when orientation_method=none. This parameter has no effects if orientation_method is set other than none. - When this parameter is set to False, no extra transform is applied when reading data from colmap. + transform when orientation_method=none. This parameter has no effects if orientation_method is set other than none. + When this parameter is set to False, no extra transform is applied when reading data from colmap. """ eval_mode: Literal["fraction", "filename", "interval", "all"] = "interval" """ @@ -93,8 +93,9 @@ class ColmapDataParserConfig(DataParserConfig): """Path to depth maps directory. If not set, depths are not loaded.""" colmap_path: Path = Path("colmap/sparse/0") """Path to the colmap reconstruction directory relative to the data path.""" - load_3D_points: bool = False - """Whether to load the 3D points from the colmap reconstruction.""" + load_3D_points: bool = True + """Whether to load the 3D points from the colmap reconstruction. This is helpful for Gaussian splatting and + generally unused otherwise, but it's typically harmless so we default to True.""" max_2D_matches_per_3D_point: int = 0 """Maximum number of 2D matches per 3D point. If set to -1, all 2D matches are loaded. 
If set to 0, no 2D matches are loaded.""" diff --git a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py index 2d007d8e9a..892ed0684b 100644 --- a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py +++ b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py @@ -73,6 +73,8 @@ class NerfstudioDataParserConfig(DataParserConfig): """The interval between frames to use for eval. Only used when eval_mode is eval-interval.""" depth_unit_scale_factor: float = 1e-3 """Scales the depth values to meters. Default value is 0.001 for a millimeter to meter conversion.""" + load_3D_points: bool = False + """Whether to load the 3D points from the colmap reconstruction.""" @dataclass @@ -288,7 +290,12 @@ def _generate_dataparser_outputs(self, split="train"): else: distortion_params = torch.stack(distort, dim=0)[idx_tensor] - metadata = {"fisheye_crop_radius": fisheye_crop_radius} if fisheye_crop_radius is not None else None + # Only add fisheye crop radius parameter if the images are actually fisheye, to allow the same config to be used + # for both fisheye and non-fisheye datasets. + metadata = {} + if (camera_type in [CameraType.FISHEYE, CameraType.FISHEYE624]) and (fisheye_crop_radius is not None): + metadata["fisheye_crop_radius"] = fisheye_crop_radius + cameras = Cameras( fx=fx, fy=fy, @@ -305,20 +312,94 @@ def _generate_dataparser_outputs(self, split="train"): assert self.downscale_factor is not None cameras.rescale_output_resolution(scaling_factor=1.0 / self.downscale_factor) + # The naming is somewhat confusing, but: + # - transform_matrix contains the transformation to dataparser output coordinates from saved coordinates. + # - dataparser_transform_matrix contains the transformation to dataparser output coordinates from original data coordinates. + # - applied_transform contains the transformation to saved coordinates from original data coordinates. + applied_transform = None + colmap_path = self.config.data / "colmap/sparse/0" if "applied_transform" in meta: applied_transform = torch.tensor(meta["applied_transform"], dtype=transform_matrix.dtype) - transform_matrix = transform_matrix @ torch.cat( + elif colmap_path.exists(): + # For converting from colmap, this was the effective value of applied_transform that was being + # used before we added the applied_transform field to the output dataformat. 
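# Aside (illustration only, not part of the patch): how the two transforms compose once an
# applied_transform is known. transform_matrix is the 3x4 orientation/centering transform computed
# earlier; promoting the 3x4 applied_transform to 4x4 and right-multiplying gives the full
# original-data -> dataparser-output transform, matching the composition further below.
import torch
legacy_applied = torch.tensor([[0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [0.0, 0.0, -1.0, 0.0]])
example_tm = torch.eye(4)[:3]  # stand-in for transform_matrix
composed = example_tm @ torch.cat([legacy_applied, torch.tensor([[0.0, 0.0, 0.0, 1.0]])], dim=0)  # 3x4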
+ meta["applied_transform"] = [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, -1, 0]] + applied_transform = torch.tensor(meta["applied_transform"], dtype=transform_matrix.dtype) + + if applied_transform is not None: + dataparser_transform_matrix = transform_matrix @ torch.cat( [applied_transform, torch.tensor([[0, 0, 0, 1]], dtype=transform_matrix.dtype)], 0 ) + else: + dataparser_transform_matrix = transform_matrix + if "applied_scale" in meta: applied_scale = float(meta["applied_scale"]) scale_factor *= applied_scale - # Load 3D points + # reinitialize metadata for dataparser_outputs metadata = {} - if "ply_file_path" in meta: - ply_file_path = data_dir / meta["ply_file_path"] - metadata.update(self._load_3D_points(ply_file_path, transform_matrix, scale_factor)) + + # _generate_dataparser_outputs might be called more than once so we check if we already loaded the point cloud + try: + self.prompted_user + except AttributeError: + self.prompted_user = False + + # Load 3D points + if self.config.load_3D_points: + if "ply_file_path" in meta: + ply_file_path = data_dir / meta["ply_file_path"] + + elif colmap_path.exists(): + from rich.prompt import Confirm + + # check if user wants to make a point cloud from colmap points + if not self.prompted_user: + self.create_pc = Confirm.ask( + "load_3D_points is true, but the dataset was processed with an outdated ns-process-data that didn't convert colmap points to .ply! Update the colmap dataset automatically?" + ) + + if self.create_pc: + import json + + from nerfstudio.process_data.colmap_utils import create_ply_from_colmap + + with open(self.config.data / "transforms.json") as f: + transforms = json.load(f) + + # Update dataset if missing the applied_transform field. + if "applied_transform" not in transforms: + transforms["applied_transform"] = meta["applied_transform"] + + ply_filename = "sparse_pc.ply" + create_ply_from_colmap( + filename=ply_filename, + recon_dir=colmap_path, + output_dir=self.config.data, + applied_transform=applied_transform, + ) + ply_file_path = data_dir / ply_filename + transforms["ply_file_path"] = ply_filename + + # This was the applied_transform value + + with open(self.config.data / "transforms.json", "w", encoding="utf-8") as f: + json.dump(transforms, f, indent=4) + else: + ply_file_path = None + else: + if not self.prompted_user: + CONSOLE.print( + "[bold yellow]Warning: load_3D_points set to true but no point cloud found. gaussian-splatting models will use random point cloud initialization." 
+ ) + ply_file_path = None + + if ply_file_path: + sparse_points = self._load_3D_points(ply_file_path, transform_matrix, scale_factor) + if sparse_points is not None: + metadata.update(sparse_points) + self.prompted_user = True dataparser_outputs = DataparserOutputs( image_filenames=image_filenames, @@ -326,7 +407,7 @@ def _generate_dataparser_outputs(self, split="train"): scene_box=scene_box, mask_filenames=mask_filenames if len(mask_filenames) > 0 else None, dataparser_scale=scale_factor, - dataparser_transform=transform_matrix, + dataparser_transform=dataparser_transform_matrix, metadata={ "depth_filenames": depth_filenames if len(depth_filenames) > 0 else None, "depth_unit_scale_factor": self.config.depth_unit_scale_factor, @@ -336,10 +417,24 @@ def _generate_dataparser_outputs(self, split="train"): return dataparser_outputs def _load_3D_points(self, ply_file_path: Path, transform_matrix: torch.Tensor, scale_factor: float): + """Loads point clouds positions and colors from .ply + + Args: + ply_file_path: Path to .ply file + transform_matrix: Matrix to transform world coordinates + scale_factor: How much to scale the camera origins by. + + Returns: + A dictionary of points: points3D_xyz and colors: points3D_rgb + """ import open3d as o3d # Importing open3d is slow, so we only do it if we need it. pcd = o3d.io.read_point_cloud(str(ply_file_path)) + # if no points found don't read in an initial point cloud + if len(pcd.points) == 0: + return None + points3D = torch.from_numpy(np.asarray(pcd.points, dtype=np.float32)) points3D = ( torch.cat( diff --git a/nerfstudio/models/gaussian_splatting.py b/nerfstudio/models/gaussian_splatting.py index ae3b544225..215a93fc18 100644 --- a/nerfstudio/models/gaussian_splatting.py +++ b/nerfstudio/models/gaussian_splatting.py @@ -162,16 +162,18 @@ class GaussianSplattingModel(Model): config: GaussianSplattingModelConfig - def __init__(self, *args, **kwargs): - if "seed_points" in kwargs: - self.seed_pts = kwargs["seed_points"] - else: - self.seed_pts = None + def __init__( + self, + *args, + seed_points: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs, + ): + self.seed_points = seed_points super().__init__(*args, **kwargs) def populate_modules(self): - if self.seed_pts is not None and not self.config.random_init: - self.means = torch.nn.Parameter(self.seed_pts[0]) # (Location, Color) + if self.seed_points is not None and not self.config.random_init: + self.means = torch.nn.Parameter(self.seed_points[0]) # (Location, Color) else: self.means = torch.nn.Parameter((torch.rand((500000, 3)) - 0.5) * 10) self.xys_grad_norm = None @@ -184,14 +186,19 @@ def populate_modules(self): self.quats = torch.nn.Parameter(random_quat_tensor(self.num_points)) dim_sh = num_sh_bases(self.config.sh_degree) - if self.seed_pts is not None and not self.config.random_init: - shs = torch.zeros((self.seed_pts[1].shape[0], dim_sh, 3)).float().cuda() + if ( + self.seed_points is not None + and not self.config.random_init + # We can have colors without points. 
+ and self.seed_points[1].shape[0] > 0 + ): + shs = torch.zeros((self.seed_points[1].shape[0], dim_sh, 3)).float().cuda() if self.config.sh_degree > 0: - shs[:, 0, :3] = RGB2SH(self.seed_pts[1] / 255) + shs[:, 0, :3] = RGB2SH(self.seed_points[1] / 255) shs[:, 1:, 3:] = 0.0 else: CONSOLE.log("use color only optimization with sigmoid activation") - shs[:, 0, :3] = torch.logit(self.seed_pts[1] / 255, eps=1e-10) + shs[:, 0, :3] = torch.logit(self.seed_points[1] / 255, eps=1e-10) self.features_dc = torch.nn.Parameter(shs[:, 0, :]) self.features_rest = torch.nn.Parameter(shs[:, 1:, :]) else: @@ -808,11 +815,18 @@ def get_loss_dict(self, outputs, batch, metrics_dict=None) -> Dict[str, torch.Te metrics_dict: dictionary of metrics, some of which we can use for loss """ gt_img = self.get_gt_img(batch["image"]) - Ll1 = torch.abs(gt_img - outputs["rgb"]).mean() - simloss = 1 - self.ssim( - gt_img.permute(2, 0, 1)[None, ...], - outputs["rgb"].permute(2, 0, 1)[None, ...], - ) + pred_img = outputs["rgb"] + + # Set masked part of both ground-truth and rendered image to black. + # This is a little bit sketchy for the SSIM loss. + if "mask" in batch: + assert batch["mask"].shape == gt_img.shape[:2] == pred_img.shape[:2] + mask = batch["mask"][..., None].to(self.device) + gt_img = gt_img * mask + pred_img = pred_img * mask + + Ll1 = torch.abs(gt_img - pred_img).mean() + simloss = 1 - self.ssim(gt_img.permute(2, 0, 1)[None, ...], pred_img.permute(2, 0, 1)[None, ...]) if self.config.use_scale_regularization and self.step % 10 == 0: scale_exp = torch.exp(self.scales) scale_reg = ( diff --git a/nerfstudio/process_data/colmap_utils.py b/nerfstudio/process_data/colmap_utils.py index 348f2b31a6..2f2ac3021a 100644 --- a/nerfstudio/process_data/colmap_utils.py +++ b/nerfstudio/process_data/colmap_utils.py @@ -18,7 +18,7 @@ import json from pathlib import Path -from typing import Any, Dict, Literal, Optional +from typing import Any, Dict, Literal, Optional, Union import appdirs import cv2 @@ -34,6 +34,7 @@ read_cameras_binary, read_images_binary, read_points3D_binary, + read_points3D_text, ) from nerfstudio.process_data.process_data_utils import CameraModel from nerfstudio.utils import colormaps @@ -391,6 +392,7 @@ def colmap_to_json( camera_mask_path: Optional[Path] = None, image_id_to_depth_path: Optional[Dict[int, Path]] = None, image_rename_map: Optional[Dict[str, str]] = None, + ply_filename="sparse_pc.ply", keep_original_world_coordinate: bool = False, ) -> int: """Converts COLMAP's cameras.bin and images.bin to a JSON file. @@ -459,12 +461,23 @@ def colmap_to_json( out = parse_colmap_camera_params(cam_id_to_camera[1]) out["frames"] = frames + applied_transform = None if not keep_original_world_coordinate: applied_transform = np.eye(4)[:3, :] applied_transform = applied_transform[np.array([0, 2, 1]), :] applied_transform[2, :] *= -1 out["applied_transform"] = applied_transform.tolist() + # create ply from colmap + assert ply_filename.endswith(".ply"), f"ply_filename: {ply_filename} does not end with '.ply'" + create_ply_from_colmap( + ply_filename, + recon_dir, + output_dir, + torch.from_numpy(applied_transform).float() if applied_transform is not None else None, + ) + out["ply_file_path"] = ply_filename + with open(output_dir / "transforms.json", "w", encoding="utf-8") as f: json.dump(out, f, indent=4) @@ -643,3 +656,49 @@ def get_matching_summary(num_initial_frames: int, num_matched_frames: int) -> st result += " or large exposure changes." 
return result return f"[bold green]COLMAP found poses for {num_matched_frames / num_initial_frames * 100:.2f}% of the images." + + +def create_ply_from_colmap( + filename: str, recon_dir: Path, output_dir: Path, applied_transform: Union[torch.Tensor, None] +) -> None: + """Writes a ply file from colmap. + + Args: + filename: file name for .ply + recon_dir: Directory to grab colmap points + output_dir: Directory to output .ply + """ + if (recon_dir / "points3D.bin").exists(): + colmap_points = read_points3D_binary(recon_dir / "points3D.bin") + elif (recon_dir / "points3D.txt").exists(): + colmap_points = read_points3D_text(recon_dir / "points3D.txt") + else: + raise ValueError(f"Could not find points3D.txt or points3D.bin in {recon_dir}") + + # Load point Positions + points3D = torch.from_numpy(np.array([p.xyz for p in colmap_points.values()], dtype=np.float32)) + if applied_transform is not None: + assert applied_transform.shape == (3, 4) + points3D = torch.einsum("ij,bj->bi", applied_transform[:3, :3], points3D) + applied_transform[:3, 3] + + # Load point colours + points3D_rgb = torch.from_numpy(np.array([p.rgb for p in colmap_points.values()], dtype=np.uint8)) + + # write ply + with open(output_dir / filename, "w") as f: + # Header + f.write("ply\n") + f.write("format ascii 1.0\n") + f.write(f"element vertex {len(points3D)}\n") + f.write("property float x\n") + f.write("property float y\n") + f.write("property float z\n") + f.write("property uint8 red\n") + f.write("property uint8 green\n") + f.write("property uint8 blue\n") + f.write("end_header\n") + + for coord, color in zip(points3D, points3D_rgb): + x, y, z = coord + r, g, b = color + f.write(f"{x:8f} {y:8f} {z:8f} {r} {g} {b}\n") diff --git a/nerfstudio/process_data/process_data_utils.py b/nerfstudio/process_data/process_data_utils.py index 768786996b..40381b2173 100644 --- a/nerfstudio/process_data/process_data_utils.py +++ b/nerfstudio/process_data/process_data_utils.py @@ -58,16 +58,18 @@ class CameraModel(Enum): } -def list_images(data: Path) -> List[Path]: +def list_images(data: Path, recursive: bool = False) -> List[Path]: """Lists all supported images in a directory Args: data: Path to the directory of images. + recursive: Whether to search check nested folders in `data`. 
Returns: Paths to images contained in the directory """ allowed_exts = [".jpg", ".jpeg", ".png", ".tif", ".tiff"] + ALLOWED_RAW_EXTS - image_paths = sorted([p for p in data.glob("[!.]*") if p.suffix.lower() in allowed_exts]) + glob_str = "**/[!.]*" if recursive else "[!.]*" + image_paths = sorted([p for p in data.glob(glob_str) if p.suffix.lower() in allowed_exts]) return image_paths diff --git a/nerfstudio/scripts/datasets/process_project_aria.py b/nerfstudio/scripts/datasets/process_project_aria.py index 10f26653fd..23d304cf6a 100644 --- a/nerfstudio/scripts/datasets/process_project_aria.py +++ b/nerfstudio/scripts/datasets/process_project_aria.py @@ -17,15 +17,17 @@ import threading from dataclasses import dataclass from pathlib import Path -from typing import Dict, List +from typing import Any, Dict, List, cast import numpy as np +import open3d as o3d import tyro from PIL import Image try: from projectaria_tools.core import mps from projectaria_tools.core.data_provider import VrsDataProvider, create_vrs_data_provider + from projectaria_tools.core.mps.utils import filter_points_from_confidence from projectaria_tools.core.sophus import SE3 except ImportError: print("projectaria_tools import failed, please install with pip3 install projectaria-tools'[all]'") @@ -219,6 +221,21 @@ def main(self) -> None: "fisheye_crop_radius": rgb_valid_radius, } + # save global point cloud, which is useful for Gaussian Splatting. + points_path = self.mps_data_dir / "global_points.csv.gz" + if points_path.exists(): + print("Found global points, saving to PLY...") + points_data = mps.read_global_point_cloud(str(points_path)) # type: ignore + points_data = filter_points_from_confidence(points_data) + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(np.array([cast(Any, it).position_world for it in points_data])) + ply_file_path = self.output_dir / "global_points.ply" + o3d.io.write_point_cloud(str(ply_file_path), pcd) + + nerfstudio_frames["ply_file_path"] = "global_points.ply" + else: + print("No global points found!") + # write the json out to disk as transforms.json print("Writing transforms.json") transform_file = self.output_dir / "transforms.json" diff --git a/nerfstudio/scripts/downloads/download_data.py b/nerfstudio/scripts/downloads/download_data.py index ec4ce9b158..515ee45a62 100644 --- a/nerfstudio/scripts/downloads/download_data.py +++ b/nerfstudio/scripts/downloads/download_data.py @@ -29,26 +29,13 @@ import tyro from typing_extensions import Annotated -from nerfstudio.configs.base_config import PrintableConfig from nerfstudio.process_data import process_data_utils +from nerfstudio.scripts.downloads.eyeful_tower import EyefulTowerDownload +from nerfstudio.scripts.downloads.utils import DatasetDownload from nerfstudio.utils import install_checks from nerfstudio.utils.scripts import run_command -@dataclass -class DatasetDownload(PrintableConfig): - """Download a dataset""" - - capture_name = None - - save_dir: Path = Path("data/") - """The directory to save the dataset to""" - - def download(self, save_dir: Path) -> None: - """Download the dataset""" - raise NotImplementedError - - @dataclass class BlenderDownload(DatasetDownload): """Download the blender dataset.""" @@ -555,6 +542,7 @@ def download(self, save_dir: Path) -> None: Annotated[SDFstudioDemoDownload, tyro.conf.subcommand(name="sdfstudio")], Annotated[NeRFOSRDownload, tyro.conf.subcommand(name="nerfosr")], Annotated[Mill19Download, tyro.conf.subcommand(name="mill19")], + Annotated[EyefulTowerDownload, 
tyro.conf.subcommand(name="eyefultower")], ] @@ -562,15 +550,7 @@ def main( dataset: DatasetDownload, ): """Script to download existing datasets. - We currently support the following datasets: - - nerfstudio: Growing collection of real-world scenes. Use the `capture_name` argument to specify - which capture to download. - - blender: Blender synthetic scenes realeased with NeRF. - - sitcoms3d: Friends TV show scenes. - - record3d: Record3d dataset. - - dnerf: D-NeRF dataset. - - phototourism: PhotoTourism dataset. Use the `capture_name` argument to specify which capture to download. - - mill19: Mill 19 dataset. Use the `capture_name` argument to specify which capture to download. + We currently support the datasets listed above in the Commands. Args: dataset: The dataset to download (from). diff --git a/nerfstudio/scripts/downloads/eyeful_tower.py b/nerfstudio/scripts/downloads/eyeful_tower.py new file mode 100644 index 0000000000..23fe080e68 --- /dev/null +++ b/nerfstudio/scripts/downloads/eyeful_tower.py @@ -0,0 +1,422 @@ +# Copyright 2022 the Regents of the University of California, Nerfstudio Team and contributors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import collections +import copy +import json +import xml.etree.ElementTree as ET +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Tuple + +import awscli.clidriver +import numpy as np +import tyro + +from nerfstudio.scripts.downloads.utils import DatasetDownload +from nerfstudio.utils.rich_utils import CONSOLE + +eyefultower_downloads = [ + "all", + "apartment", + "kitchen", + "office1a", + "office1b", + "office2", + "office_view1", + "office_view2", + "riverview", + "seating_area", + "table", + "workshop", +] + +# Crop radii empirically chosen to try to avoid hitting the rig base or go out of bounds +eyefultower_fisheye_radii = { + "office1a": 0.43, + "office2": 0.45, + "seating_area": 0.375, # could be .45 except for camera 2 + "table": 0.45, + "workshop": 0.45, +} + + +@dataclass +class EyefulTowerResolutionMetadata: + folder_name: str + width: int + height: int + extension: str + + +eyefultower_resolutions = { + "all": None, + "jpeg_1k": EyefulTowerResolutionMetadata("images-jpeg-1k", 684, 1024, "jpg"), + "jpeg_2k": EyefulTowerResolutionMetadata("images-jpeg-2k", 1368, 2048, "jpg"), + "jpeg_4k": EyefulTowerResolutionMetadata("images-jpeg-4k", 2736, 4096, "jpg"), + "jpeg_8k": EyefulTowerResolutionMetadata("images-jpeg", 5784, 8660, "jpg"), + "exr_1k": EyefulTowerResolutionMetadata("images-1k", 684, 1024, "exr"), + "exr_2k": EyefulTowerResolutionMetadata("images-2k", 1368, 2048, "exr"), +} + +if TYPE_CHECKING: + EyefulTowerCaptureName = str + EyefulTowerResolution = str +else: + EyefulTowerCaptureName = tyro.extras.literal_type_from_choices(eyefultower_downloads) + EyefulTowerResolution = tyro.extras.literal_type_from_choices(eyefultower_resolutions.keys()) + + +@dataclass +class EyefulTowerDownload(DatasetDownload): + """Download the EyefulTower dataset. 
+ + Use the --help flag with the `eyefultower` subcommand to see all available datasets. + Find more information about the dataset at https://github.com/facebookresearch/EyefulTower. + """ + + capture_name: Tuple[EyefulTowerCaptureName, ...] = () + resolution_name: Tuple[EyefulTowerResolution, ...] = () + + @staticmethod + def scale_metashape_transform(xml_tree: ET.ElementTree, target_width: int, target_height: int) -> ET.ElementTree: + """Rescales parameters in metashape's cameras.xml format to match target width/height. + + The EyefulTower dataset provides images which have already been rescaled to smaller sizes from the original ~8K + resolution. However, the cameras.xml file provided, which contains the camera intrinsics in metashape's format, + only contains valid parameters for the original resolution. This function generates a new set of parameters + corresponding to a smaller resolution dataset by scaling the original values from cameras.xml. Non-uniform + scaling (different in X and Y) can be performed due to slight rounding differences. + + Args: + xml_tree: XML tree loaded from Metashape's cameras.xml file + target_width: Width of output images + target_height: Height of output images + + Returns: + Updated XML tree with scaled intrinsics and width/height parameters + """ + transformed = copy.deepcopy(xml_tree) + + root = transformed.getroot() + assert len(root) == 1 + chunk = root[0] + sensors = chunk.find("sensors") + assert sensors is not None + + for sensor in sensors: + resolution = sensor.find("resolution") + assert resolution is not None, "Resolution not found in EyefulTower camera.xml" + original_width = int(resolution.get("width")) # type: ignore + original_height = int(resolution.get("height")) # type: ignore + + if original_width > original_height: + target_width, target_height = max(target_width, target_height), min(target_width, target_height) + else: + target_height, target_width = max(target_width, target_height), min(target_width, target_height) + + resolution.set("width", str(target_width)) + resolution.set("height", str(target_height)) + + calib = sensor.find("calibration") + assert calib is not None, "Calibration not found in EyefulTower sensor" + + calib_resolution = calib.find("resolution") + assert calib_resolution is not None + calib_resolution.set("width", str(target_width)) + calib_resolution.set("height", str(target_height)) + + # Compute each scale individually and average for better rounding + x_scale = target_width / original_width + y_scale = target_height / original_height + scale = (x_scale + y_scale) / 2.0 + + f = calib.find("f") + assert f is not None and f.text is not None, "f not found in calib" + f.text = str(float(f.text) * scale) + + cx = calib.find("cx") + assert cx is not None and cx.text is not None, "cx not found in calib" + cx.text = str(float(cx.text) * x_scale) + + cy = calib.find("cy") + assert cy is not None and cy.text is not None, "cy not found in calib" + cy.text = str(float(cy.text) * y_scale) + + # TODO: Maybe update pixel_width / pixel_height / focal_length / layer_index? 
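# Aside (illustration only, not part of the patch): the effect of the scaling above when going
# from the original-resolution EyefulTower images (5784 x 8660, per eyefultower_resolutions)
# down to the 2K set (1368 x 2048). The focal length and cx offset below are hypothetical values,
# not a real calibration.
original_width, original_height = 5784, 8660
target_width, target_height = 1368, 2048
x_scale = target_width / original_width     # ~0.2365
y_scale = target_height / original_height   # ~0.2365
scale = (x_scale + y_scale) / 2.0
f_2k = 5500.0 * scale                        # f is scaled by the averaged factor, ~1300 px
cx_2k = 12.0 * x_scale                       # cx/cy center offsets are scaled per axis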
+ + return transformed + + @staticmethod + def convert_cameras_to_nerfstudio_transforms( + capture_name: str, cameras: dict, splits: dict, target_width: int, target_height: int, extension: str + ) -> dict: + """Converts EyefulTower cameras.json format to Nerfstudio's transforms.json format + + The EyefulTower dataset provides a cameras.json file containing geometric calibration information for the + original resolution ~8K images, similar to the cameras.xml file from Metashape. The main advantage is that data + is provided for each individual image, rather than being structured hierarchically with rig constraints (as in + the Metashape cameras.xml). + + This function takes the cameras.json file and converts it to the transforms.json Nerfstudio expects, with the + necessary scaling of intrinsics parameters applied. This function also handles the EyefulTower splits.json file, + describing the breakdown of training and validation images, and adds the appropriate fields to transforms.json. + This function works for both fisheye (V1) and pinhole (V2) cameras. Scene-specific fisheye mask radii are added + to the returned dictionary if needed. + + Args: + capture_name: Which specific EyefulTower capture is being converted + cameras: Data loaded from EyefulTower cameras.json + splits: Data loaded from EyefulTower splits.json + target_width: Width of output images + target_height: Height of output images + extension: Extension of output images + + Returns: + Dict in the Nerfstudio transforms.json format, with scaled camera parameters, splits, and optional metadata. + """ + output = {} + + distortion_models = [c["distortionModel"] for c in cameras["KRT"]] + distortion_model = list(set(distortion_models)) + assert len(distortion_model) == 1 + distortion_model = distortion_model[0] + if distortion_model == "RadialAndTangential": + output["camera_model"] = "OPENCV" + elif distortion_model == "Fisheye": + output["camera_model"] = "OPENCV_FISHEYE" + output["fisheye_crop_radius"] = eyefultower_fisheye_radii[capture_name] + else: + raise NotImplementedError(f"Camera model {distortion_model} not implemented") + + split_sets = {k: set(v) for k, v in splits.items()} + + frames = [] + split_filenames = collections.defaultdict(list) + for camera in cameras["KRT"]: + frame = {} + # TODO EXR + frame["file_path"] = camera["cameraId"] + f".{extension}" + for split in split_sets: + if camera["cameraId"] in split_sets[split]: + split_filenames[split].append(frame["file_path"]) + + original_width = camera["width"] + original_height = camera["height"] + if original_width > original_height: + target_width, target_height = max(target_width, target_height), min(target_width, target_height) + else: + target_height, target_width = max(target_width, target_height), min(target_width, target_height) + x_scale = target_width / original_width + y_scale = target_height / original_height + + frame["w"] = target_width + frame["h"] = target_height + K = np.array(camera["K"]).T # Data stored as column-major + frame["fl_x"] = K[0][0] * x_scale + frame["fl_y"] = K[1][1] * y_scale + frame["cx"] = K[0][2] * x_scale + frame["cy"] = K[1][2] * y_scale + + if distortion_model == "RadialAndTangential": + # pinhole: [k1, k2, p1, p2, k3] + frame["k1"] = camera["distortion"][0] + frame["k2"] = camera["distortion"][1] + frame["k3"] = camera["distortion"][4] + frame["k4"] = 0.0 + frame["p1"] = camera["distortion"][2] + frame["p2"] = camera["distortion"][3] + elif distortion_model == "Fisheye": + # fisheye: [k1, k2, k3, _, _, _, p1, p2] + 
frame["k1"] = camera["distortion"][0] + frame["k2"] = camera["distortion"][1] + frame["k3"] = camera["distortion"][2] + frame["p1"] = camera["distortion"][6] + frame["p2"] = camera["distortion"][7] + else: + raise NotImplementedError("This shouldn't happen") + + T = np.array(camera["T"]).T # Data stored as column-major + T = np.linalg.inv(T) + T = T[[2, 0, 1, 3], :] + T[:, 1:3] *= -1 + frame["transform_matrix"] = T.tolist() + + frames.append(frame) + + frames = sorted(frames, key=lambda f: f["file_path"]) + + output["frames"] = frames + output["train_filenames"] = split_filenames["train"] + output["val_filenames"] = split_filenames["test"] + return output + + @staticmethod + def subsample_nerfstudio_transforms(transforms: dict, n: int): + """Uniformly samples n frames from a Nerfstudio transforms.json dict. + + Args: + transforms: Dictionary in Nerfstudio transforms.json format + n: Number of frames to uniformly subsample + + Returns: + New transforms.json dict with n frames. All other parameters are copied. + """ + target = min(len(transforms["frames"]), n) + indices = np.round(np.linspace(0, len(transforms["frames"]) - 1, target)).astype(int) + + frames = [] + for i in indices: + frames.append(transforms["frames"][i]) + + output = copy.deepcopy(transforms) + output["frames"] = frames + + # Remove the unused files from the splits + filenames = {f["file_path"] for f in frames} + for key in ["train_filenames", "val_filenames"]: + output[key] = sorted(list(set(transforms[key]) & filenames)) + + return output + + def download(self, save_dir: Path) -> None: + """Entrypoint to download the EyefulTower dataset. + + * Fetches the specified dataset(s) at the specified resolution(s) from the EyefulTower AWS S3 bucket. Redundant + data is not downloaded, so this function can safely (and performantly) be called multiple times with + increasing scope of datasets and resolutions. + * Generates updated Metashape cameras.xml for lower resolution downloads. + * Generates Nerfstudio transform.json for each resolution. Additionally generates transforms_300.json and + transforms_half.json containing subsets (300 frames, half the frames) of the full set to help with iteration. + + Args: + save_dir: Directory to save dataset. Output will be in save_dir/eyefultower/ + """ + if len(self.capture_name) == 0: + self.capture_name = ("riverview",) + CONSOLE.print( + f"No capture specified, using {self.capture_name} by default.", + "Add `--help` to this command to see all available captures.", + ) + + if len(self.resolution_name) == 0: + self.resolution_name = ("jpeg_2k",) + CONSOLE.print( + f"No resolution specified, using {self.resolution_name} by default.", + "Add `--help` to this command to see all available resolutions.", + ) + + captures = set() + for capture in self.capture_name: + if capture == "all": + captures.update([c for c in eyefultower_downloads if c != "all"]) + else: + captures.add(capture) + captures = sorted(captures) + if len(captures) == 0: + CONSOLE.print("[bold yellow]WARNING: No EyefulTower captures specified. Nothing will be downloaded.") + + resolutions = set() + for resolution in self.resolution_name: + if resolution == "all": + resolutions.update([r for r in eyefultower_resolutions.keys() if r != "all"]) + else: + resolutions.add(resolution) + resolutions = sorted(resolutions) + if len(resolutions) == 0: + CONSOLE.print("[bold yellow]WARNING: No EyefulTower resolutions specified. 
Nothing will be downloaded.") + + driver = awscli.clidriver.create_clidriver() + + for i, capture in enumerate(captures): + base_url = f"s3://fb-baas-f32eacb9-8abb-11eb-b2b8-4857dd089e15/EyefulTower/{capture}/" + output_path = save_dir / "eyefultower" / capture + includes = [] + for resolution in resolutions: + includes.extend(["--include", f"{eyefultower_resolutions[resolution].folder_name}/*"]) + command = ( + ["s3", "sync", "--no-sign-request", "--only-show-errors", "--exclude", "images*/*"] + + includes + + [base_url, str(output_path)] + ) + CONSOLE.print(f"[EyefulTower Capture {i+1: >2d}/{len(captures)}]: '{capture}'") + print( + f"\tDownloading resolutions {resolutions}", + f"to '{output_path.resolve()}' with command `aws {' '.join(command)}` ...", + end=" ", + flush=True, + ) + driver.main(command) + print("done!") + + # After downloading, we'll insert an appropriate cameras.xml file into each directory. It's quick enough + # that we can just redo it every time this is called, regardless of whether new data is downloaded. + xml_input_path = output_path / "cameras.xml" + if not xml_input_path.exists: + CONSOLE.print( + "\t[bold yellow]WARNING: cameras.xml not found. Scaled cameras.xml will not be generated." + ) + else: + tree = ET.parse(output_path / "cameras.xml") + + for resolution in resolutions: + metadata = eyefultower_resolutions[resolution] + xml_output_path = output_path / metadata.folder_name / "cameras.xml" + print( + f"\tGenerating cameras.xml for '{resolution}' to '{xml_output_path.resolve()}' ... ", + end=" ", + flush=True, + ) + scaled_tree = self.scale_metashape_transform(tree, metadata.width, metadata.height) + scaled_tree.write(xml_output_path) + print("done!") + + json_input_path = output_path / "cameras.json" + splits_input_path = output_path / "splits.json" + if not json_input_path.exists: + CONSOLE.print("\t[bold yellow]WARNING: cameras.json not found. transforms.json will not be generated.") + elif not splits_input_path.exists: + CONSOLE.print("\t[bold yellow]WARNING: splits.json not found. transforms.json will not be generated.") + else: + with open(json_input_path, "r") as f: + cameras = json.load(f) + + with open(splits_input_path, "r") as f: + splits = json.load(f) + + for resolution in resolutions: + metadata = eyefultower_resolutions[resolution] + json_output_path = output_path / metadata.folder_name / "transforms.json" + print( + f"\tGenerating transforms.json for '{resolution}' to '{json_output_path.resolve()}' ... ", + end=" ", + flush=True, + ) + transforms = self.convert_cameras_to_nerfstudio_transforms( + capture, cameras, splits, metadata.width, metadata.height, metadata.extension + ) + + with open(json_output_path, "w", encoding="utf8") as f: + json.dump(transforms, f, indent=4) + + for count, name in [ + (300, "transforms_300.json"), + (int(len(cameras["KRT"]) // 2), "transforms_half.json"), + ]: + subsampled = self.subsample_nerfstudio_transforms(transforms, count) + with open(json_output_path.with_name(name), "w", encoding="utf8") as f: + json.dump(subsampled, f, indent=4) + + print("done!") diff --git a/nerfstudio/scripts/downloads/utils.py b/nerfstudio/scripts/downloads/utils.py new file mode 100644 index 0000000000..72054edb03 --- /dev/null +++ b/nerfstudio/scripts/downloads/utils.py @@ -0,0 +1,32 @@ +# Copyright 2022 the Regents of the University of California, Nerfstudio Team and contributors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from pathlib import Path + +from nerfstudio.configs.base_config import PrintableConfig + + +@dataclass +class DatasetDownload(PrintableConfig): + """Download a dataset""" + + capture_name = None + + save_dir: Path = Path("data/") + """The directory to save the dataset to""" + + def download(self, save_dir: Path) -> None: + """Download the dataset""" + raise NotImplementedError diff --git a/nerfstudio/scripts/github/run_actions.py b/nerfstudio/scripts/github/run_actions.py index 5a966338c1..c758d23b66 100644 --- a/nerfstudio/scripts/github/run_actions.py +++ b/nerfstudio/scripts/github/run_actions.py @@ -56,11 +56,14 @@ def run_github_actions_file(filename: str, continue_on_fail: bool = False): for step in steps: if "name" in step and step["name"] in LOCAL_TESTS: - compressed = step["run"].replace("\n", ";").replace("\\", "") - if "ruff check" in compressed: - curr_command = f"{compressed} --fix" - else: - curr_command = compressed.replace("--check", "") + curr_command = step["run"].replace("\n", ";").replace("\\", "") + if curr_command.startswith("ruff"): + if "ruff check" in curr_command: + curr_command = f"{curr_command} --fix" + + curr_command = curr_command.replace(" --check", "") + curr_command = curr_command.replace(" --diff", "") + curr_command = curr_command.replace(" --output-format=github", "") CONSOLE.line() CONSOLE.rule(f"[bold green]Running: {curr_command}") diff --git a/nerfstudio/utils/tensor_dataclass.py b/nerfstudio/utils/tensor_dataclass.py index a2b8d1dadb..293d978d7e 100644 --- a/nerfstudio/utils/tensor_dataclass.py +++ b/nerfstudio/utils/tensor_dataclass.py @@ -141,6 +141,9 @@ def _broadcast_dict_fields(self, dict_: Dict, batch_shape) -> Dict: new_dict[k] = v.broadcast_to(batch_shape) elif isinstance(v, Dict): new_dict[k] = self._broadcast_dict_fields(v, batch_shape) + else: + # Don't broadcast the remaining fields + new_dict[k] = v return new_dict def __getitem__(self: TensorDataclassT, indices) -> TensorDataclassT: diff --git a/nerfstudio/viewer/viewer.py b/nerfstudio/viewer/viewer.py index c7a881b80d..579dbb5cc4 100644 --- a/nerfstudio/viewer/viewer.py +++ b/nerfstudio/viewer/viewer.py @@ -32,6 +32,7 @@ from nerfstudio.configs import base_config as cfg from nerfstudio.data.datasets.base_dataset import InputDataset from nerfstudio.models.base_model import Model +from nerfstudio.models.gaussian_splatting import GaussianSplattingModel from nerfstudio.pipelines.base_pipeline import Pipeline from nerfstudio.utils.decorators import check_main_thread, decorate_all from nerfstudio.utils.writer import GLOBAL_BUFFER, EventName @@ -247,6 +248,17 @@ def nested_folder_install(folder_labels: List[str], prev_labels: List[str], elem for c in self.viewer_controls: c._setup(self) + # Diagnostics for Gaussian Splatting: where the points are at the start of training. + # This is hidden by default, it can be shown from the Viser UI's scene tree table. 
+ if isinstance(pipeline.model, GaussianSplattingModel): + self.viser_server.add_point_cloud( + "/gaussian_splatting_initial_points", + points=pipeline.model.means.numpy(force=True) * VISER_NERFSTUDIO_SCALE_RATIO, + colors=(255, 0, 0), + point_size=0.01, + point_shape="circle", + visible=False, # Hidden by default. + ) self.ready = True def toggle_pause_button(self) -> None: diff --git a/pyproject.toml b/pyproject.toml index bb68bd4eb6..c35ea04788 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ classifiers = [ dependencies = [ "appdirs>=1.4", "av>=9.2.0", + "awscli>=1.31.10", "comet_ml>=3.33.8", "cryptography>=38", "tyro>=0.6.6", diff --git a/tests/process_data/test_process_images.py b/tests/process_data/test_process_images.py index eed454b11f..8482676c53 100644 --- a/tests/process_data/test_process_images.py +++ b/tests/process_data/test_process_images.py @@ -12,9 +12,11 @@ from nerfstudio.data.utils.colmap_parsing_utils import ( Camera, Image as ColmapImage, + Point3D, qvec2rotmat, write_cameras_binary, write_images_binary, + write_points3D_binary, ) from nerfstudio.process_data.images_to_nerfstudio_dataset import ImagesToNerfstudioDataset @@ -50,6 +52,19 @@ def test_process_images_skip_colmap(tmp_path: Path): {1: Camera(1, "OPENCV", width, height, [110, 110, 50, 75, 0, 0, 0, 0, 0, 0])}, sparse_path / "cameras.bin", ) + write_points3D_binary( + { + 1: Point3D( + id=1, + xyz=np.array([0, 0, 0]), + rgb=np.array([0, 0, 0]), + error=np.array([0]), + image_ids=np.array([1]), + point2D_idxs=np.array([0]), + ), + }, + sparse_path / "points3D.bin", + ) frames = {} num_frames = 10 qvecs = random_quaternion(num_frames) diff --git a/tests/utils/test_tensor_dataclass.py b/tests/utils/test_tensor_dataclass.py index 8971c83f88..10e21adc05 100644 --- a/tests/utils/test_tensor_dataclass.py +++ b/tests/utils/test_tensor_dataclass.py @@ -179,6 +179,12 @@ def test_iter(): assert batch.b.shape == (4, 5) +def test_non_tensor(): + """Test iterating over tensor dataclass""" + # We shouldn't throw away non-dataclass values. + assert DummyTensorDataclass(a=torch.ones((3, 10)), b={"k": 2}, c=None).b == {"k": 2} # type: ignore + + if __name__ == "__main__": test_init() test_broadcasting()
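For reference, a minimal sketch of the behavior that the tensor_dataclass change and test_non_tensor exercise: non-tensor values nested in dictionary fields, such as a float fisheye_crop_radius kept in Cameras.metadata, now survive broadcasting instead of being silently dropped. This is a standalone reimplementation for illustration, not an import of the patched class.

import torch

def broadcast_dict_fields(dict_, batch_shape):
    """Mirrors TensorDataclass._broadcast_dict_fields after the patch."""
    new_dict = {}
    for k, v in dict_.items():
        if isinstance(v, torch.Tensor):
            new_dict[k] = v.broadcast_to(batch_shape)
        elif isinstance(v, dict):
            new_dict[k] = broadcast_dict_fields(v, batch_shape)
        else:
            new_dict[k] = v  # previously dropped; now kept as-is
    return new_dict

meta = {"fisheye_crop_radius": 0.45, "nested": {"times": torch.zeros(1)}}
out = broadcast_dict_fields(meta, (8,))
assert out["fisheye_crop_radius"] == 0.45    # float metadata preserved
assert out["nested"]["times"].shape == (8,)  # tensors still broadcast as before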