diff --git a/environment.yaml b/environment.yaml index 62524cbb..b31a3509 100644 --- a/environment.yaml +++ b/environment.yaml @@ -10,10 +10,8 @@ dependencies: - numpy>=2.0 - pillow - pytorch>=2.1 - - rasterio - scikit-learn - tensorboard - - torchaudio - torchvision - tqdm - tifffile diff --git a/pangaea/datasets/biomassters.py b/pangaea/datasets/biomassters.py index 8e98b785..bcf90ba4 100644 --- a/pangaea/datasets/biomassters.py +++ b/pangaea/datasets/biomassters.py @@ -2,8 +2,7 @@ import torch import pandas as pd import pathlib -import rasterio -from tifffile import imread +import tifffile from os.path import join as opj from pangaea.datasets.utils import read_tif @@ -23,7 +22,7 @@ def read_imgs(multi_temporal, temp , fname, data_dir, img_size): s1_filepath = data_dir.joinpath(s1_fname) if s1_filepath.exists(): - img_s1 = imread(s1_filepath) + img_s1 = tifffile.imread(s1_filepath) m = img_s1 == -9999 img_s1 = img_s1.astype('float32') img_s1 = np.where(m, 0, img_s1) @@ -32,7 +31,7 @@ def read_imgs(multi_temporal, temp , fname, data_dir, img_size): s2_filepath = data_dir.joinpath(s2_fname) if s2_filepath.exists(): - img_s2 = imread(s2_filepath) + img_s2 = tifffile.imread(s2_filepath) img_s2 = img_s2.astype('float32') else: img_s2 = np.zeros((img_size, img_size) + (11,), dtype='float32') @@ -155,8 +154,7 @@ def __getitem__(self, index): fname = str(chip_id)+'_agbm.tif' imgs_s1, imgs_s2, mask = read_imgs(self.multi_temporal, self.temp, fname, self.dir_features, self.img_size) - with rasterio.open(self.dir_labels.joinpath(fname)) as lbl: - target = lbl.read(1) + target = tifffile.imread(self.dir_labels.joinpath(fname), key=0) target = np.nan_to_num(target) imgs_s1 = torch.from_numpy(imgs_s1).float() diff --git a/pangaea/datasets/fivebillionpixels.py b/pangaea/datasets/fivebillionpixels.py index 4cae19ef..0f656fd1 100644 --- a/pangaea/datasets/fivebillionpixels.py +++ b/pangaea/datasets/fivebillionpixels.py @@ -2,7 +2,6 @@ import time import torch import numpy as np 
-import rasterio import random from glob import glob diff --git a/pangaea/datasets/hlsburnscars.py b/pangaea/datasets/hlsburnscars.py index c2a2e0b9..6b254beb 100644 --- a/pangaea/datasets/hlsburnscars.py +++ b/pangaea/datasets/hlsburnscars.py @@ -2,7 +2,6 @@ import time import torch import numpy as np -# import rasterio import tifffile as tiff from typing import Sequence, Dict, Any, Union, Literal, Tuple from sklearn.model_selection import train_test_split diff --git a/pangaea/datasets/mados.py b/pangaea/datasets/mados.py index bcc6f620..e7c8f3fe 100644 --- a/pangaea/datasets/mados.py +++ b/pangaea/datasets/mados.py @@ -6,16 +6,11 @@ import zipfile from glob import glob -import rasterio +import cv2 +import tifffile import numpy as np -import warnings - -warnings.filterwarnings("ignore", category=rasterio.errors.NotGeoreferencedWarning) - import torch -import torchvision.transforms.functional as TF -import torchvision.transforms as T from pangaea.datasets.utils import DownloadProgressBar from pangaea.datasets.base import GeoFMDataset @@ -112,27 +107,32 @@ def __init__( self.download_url = download_url self.auto_download = auto_download - self.ROIs_split = np.genfromtxt(os.path.join(self.root_path, 'splits', f'{split}_X.txt'), dtype='str') + self.ROIs_split = np.genfromtxt( + os.path.join(self.root_path, "splits", f"{split}_X.txt"), dtype="str" + ) self.image_list = [] self.target_list = [] - self.tiles = sorted(glob(os.path.join(self.root_path, '*'))) + self.tiles = sorted(glob(os.path.join(self.root_path, "*"))) for tile in self.tiles: - splits = [f.split('_cl_')[-1] for f in glob(os.path.join(tile, '10', '*_cl_*'))] + splits = [ + f.split("_cl_")[-1] for f in glob(os.path.join(tile, "10", "*_cl_*")) + ] for crop in splits: - crop_name = os.path.basename(tile) + '_' + crop.split('.tif')[0] + crop_name = os.path.basename(tile) + "_" + crop.split(".tif")[0] if crop_name in self.ROIs_split: - all_bands = glob(os.path.join(tile, '*', '*L2R_rhorc*_' + crop)) + 
all_bands = glob(os.path.join(tile, "*", "*L2R_rhorc*_" + crop)) all_bands = sorted(all_bands, key=self.get_band) - # all_bands = np.array(all_bands) self.image_list.append(all_bands) - cl_path = os.path.join(tile, '10', os.path.basename(tile) + '_L2R_cl_' + crop) + cl_path = os.path.join( + tile, "10", os.path.basename(tile) + "_L2R_cl_" + crop + ) self.target_list.append(cl_path) def __len__(self): @@ -143,42 +143,36 @@ def getnames(self): def __getitem__(self, index): - all_bands = self.image_list[index] + band_paths = self.image_list[index] current_image = [] - for c, band in enumerate(all_bands): - upscale_factor = int(os.path.basename(os.path.dirname(band))) // 10 - with rasterio.open(band, mode='r') as src: - this_band = src.read(1, - out_shape=(int(src.height * upscale_factor), int(src.width * upscale_factor)), - resampling=rasterio.enums.Resampling.nearest - ) - this_band = torch.from_numpy(this_band) - #this_band[torch.isnan(this_band)] = self.data_mean['optical'][c] - current_image.append(this_band) - - image = torch.stack(current_image) - invalid_mask = torch.isnan(image) - image[invalid_mask] = 0 + for path in band_paths: + upscale_factor = int(os.path.basename(os.path.dirname(path))) // 10 + band = tifffile.imread(path) + band = cv2.resize(band, dsize=None, fx=upscale_factor, fy=upscale_factor, interpolation=cv2.INTER_NEAREST_EXACT) + band_tensor = torch.from_numpy(band).unsqueeze(0) + current_image.append(band_tensor) - with rasterio.open(self.target_list[index], mode='r') as src: - target = src.read(1) + image = torch.cat(current_image) + invalid_mask = torch.isnan(image) + image[invalid_mask] = 0 + target = tifffile.imread(self.target_list[index]) target = torch.from_numpy(target.astype(np.int64)) target = target - 1 output = { - 'image': { - 'optical': image, + "image": { + "optical": image, }, - 'target': target, - 'metadata': {} + "target": target, + "metadata": {}, } return output @staticmethod def get_band(path): - return 
int(path.split('_')[-2]) + return int(path.split("_")[-2]) @staticmethod def download(self, silent=False): @@ -199,15 +193,17 @@ def download(self, silent=False): try: urllib.request.urlretrieve(url, output_path / temp_file_name, pbar) except urllib.error.HTTPError as e: - print('Error while downloading dataset: The server couldn\'t fulfill the request.') - print('Error code: ', e.code) + print( + "Error while downloading dataset: The server couldn't fulfill the request." + ) + print("Error code: ", e.code) return except urllib.error.URLError as e: - print('Error while downloading dataset: Failed to reach a server.') - print('Reason: ', e.reason) + print("Error while downloading dataset: Failed to reach a server.") + print("Reason: ", e.reason) return - with zipfile.ZipFile(output_path / temp_file_name, 'r') as zip_ref: + with zipfile.ZipFile(output_path / temp_file_name, "r") as zip_ref: print(f"Extracting to {output_path} ...") # Remove top-level dir in ZIP file for nicer data dir structure members = [] @@ -219,4 +215,4 @@ def download(self, silent=False): zip_ref.extractall(output_path, members) print("done.") - (output_path / temp_file_name).unlink() \ No newline at end of file + (output_path / temp_file_name).unlink() diff --git a/pangaea/datasets/pastis.py b/pangaea/datasets/pastis.py index 9abc3ac2..cf5ca64f 100644 --- a/pangaea/datasets/pastis.py +++ b/pangaea/datasets/pastis.py @@ -10,7 +10,7 @@ import geopandas as gpd import numpy as np import pandas as pd -import rasterio +import tifffile import torch from einops import rearrange @@ -203,17 +203,15 @@ def __getitem__(self, i: int) -> dict[str, torch.Tensor | dict[str, torch.Tensor for modality in self.modalities: if modality == "aerial": - with rasterio.open( - os.path.join( + path = os.path.join( self.path, "DATA_SPOT/PASTIS_SPOT6_RVB_1M00_2019/SPOT6_RVB_1M00_2019_" + str(name) + ".tif", - ) - ) as f: - output["aerial"] = split_image( - torch.FloatTensor(f.read()), self.nb_split, part - ) + ) + 
output["aerial"] = split_image( + torch.FloatTensor(tifffile.imread(path).transpose(2,0,1)), self.nb_split, part + ) elif modality == "s1-median": modality_name = "s1a" images = split_image( diff --git a/pangaea/datasets/sen1floods11.py b/pangaea/datasets/sen1floods11.py index 76680c7d..937fc688 100644 --- a/pangaea/datasets/sen1floods11.py +++ b/pangaea/datasets/sen1floods11.py @@ -4,7 +4,7 @@ import geopandas import numpy as np import pandas as pd -import rasterio +import tifffile import torch from pangaea.datasets.utils import download_bucket_concurrently @@ -138,17 +138,13 @@ def _get_date(self, index): return date_np def __getitem__(self, index): - with rasterio.open(self.s2_image_list[index]) as src: - s2_image = src.read() + s2_image = tifffile.imread(self.s2_image_list[index]) - with rasterio.open(self.s1_image_list[index]) as src: - s1_image = src.read() - # Convert the missing values (clouds etc.) - s1_image = np.nan_to_num(s1_image) - - with rasterio.open(self.target_list[index]) as src: - target = src.read(1) + s1_image = tifffile.imread(self.s1_image_list[index]) + # Convert the missing values (clouds etc.) 
+ s1_image = np.nan_to_num(s1_image) + target = tifffile.imread(self.target_list[index], key=0) timestamp = self._get_date(index) s2_image = torch.from_numpy(s2_image).float() diff --git a/pangaea/datasets/spacenet7.py b/pangaea/datasets/spacenet7.py index 144eb4b3..1a9cf574 100644 --- a/pangaea/datasets/spacenet7.py +++ b/pangaea/datasets/spacenet7.py @@ -13,7 +13,8 @@ import json from glob import glob -import rasterio +import cv2 +import tifffile import numpy as np import torch @@ -214,17 +215,17 @@ def __len__(self) -> int: def load_planet_mosaic(self, aoi_id: str, year: int, month: int) -> np.ndarray: folder = self.root_path / 'train' / aoi_id / 'images_masked' file = folder / f'global_monthly_{year}_{month:02d}_mosaic_{aoi_id}.tif' - with rasterio.open(str(file), mode='r') as src: - img = src.read(out_shape=(self.sn7_img_size, self.sn7_img_size), resampling=rasterio.enums.Resampling.nearest) - # 4th band (last oen) is alpha band - img = img[:-1] + img = tifffile.imread(file) + img = cv2.resize(img, dsize=(self.sn7_img_size, self.sn7_img_size), interpolation=cv2.INTER_NEAREST_EXACT) + # 4th band (last one) is alpha band + img = img.transpose(2, 0, 1)[:-1] return img.astype(np.float32) def load_building_label(self, aoi_id: str, year: int, month: int) -> np.ndarray: folder = self.root_path / 'train' / aoi_id / 'labels_raster' file = folder / f'global_monthly_{year}_{month:02d}_mosaic_{aoi_id}_Buildings.tif' - with rasterio.open(str(file), mode='r') as src: - label = src.read(out_shape=(self.sn7_img_size, self.sn7_img_size), resampling=rasterio.enums.Resampling.nearest) + label = tifffile.imread(file) + label = cv2.resize(label, dsize=(self.sn7_img_size, self.sn7_img_size), interpolation=cv2.INTER_NEAREST_EXACT) label = (label > 0).squeeze() return label.astype(np.int64) diff --git a/pangaea/datasets/utae_dynamicen.py b/pangaea/datasets/utae_dynamicen.py index 7705e331..0092cf10 100644 --- a/pangaea/datasets/utae_dynamicen.py +++ 
b/pangaea/datasets/utae_dynamicen.py @@ -1,6 +1,6 @@ import os import numpy as np -import rasterio +import tifffile import torch # from torch.utils.data import Dataset # from torchvision import transforms @@ -154,19 +154,19 @@ def load_data(self, index): cur_images, cur_dates = [], [] if self.mode == 'daily': for i in range(1, self.all_days[index][0]+1): - img = rasterio.open(os.path.join(self.root_path, self.all_days[index][i][0][1:])) - red = img.read(3) - green = img.read(2) - blue = img.read(1) - nir = img.read(4) + with tifffile.TiffFile.open(os.path.join(self.root_path, self.all_days[index][i][0][1:])) as img: + red = img.pages[2].asarray() + green = img.pages[1].asarray() + blue = img.pages[0].asarray() + nir = img.pages[3].asarray() image = np.dstack((red, green, blue, nir)) cur_images.append(np.expand_dims(np.asarray(image, dtype=np.float32), axis=0)) # np.array already\ cur_dates.append(self.all_days[index][i][1]) image_stack = np.concatenate(cur_images, axis=0) dates = torch.from_numpy(np.array(cur_dates, dtype=np.int32)) - label = rasterio.open(os.path.join(self.root_path, self.labels[index][1:])) - label = label.read() + label = tifffile.imread(os.path.join(self.root_path, self.labels[index][1:])) + label = label.transpose(2, 0, 1) mask = np.zeros((label.shape[1], label.shape[2]), dtype=np.int32) for i in range(self.num_classes + 1): @@ -180,17 +180,17 @@ def load_data(self, index): else: for i in range(len(self.dates)): # read .tif - img = rasterio.open(os.path.join(self.root_path, self.planet_day[index][i][1:])) - red = img.read(3) - green = img.read(2) - blue = img.read(1) - nir = img.read(4) + with tifffile.TiffFile.open(os.path.join(self.root_path, self.planet_day[index][i][1:])) as img: + red = img.pages[2].asarray() + green = img.pages[1].asarray() + blue = img.pages[0].asarray() + nir = img.pages[3].asarray() image = np.dstack((red, green, blue, nir)) cur_images.append(np.expand_dims(np.asarray(image, dtype=np.float32), axis=0)) # np.array 
already\ image_stack = np.concatenate(cur_images, axis=0) dates = torch.from_numpy(np.array(self.planet_day[index][len(self.dates):], dtype=np.int32)) - label = rasterio.open(os.path.join(self.root_path, self.labels[index][1:])) - label = label.read() + label = tifffile.imread(os.path.join(self.root_path, self.labels[index][1:])) + label = label.transpose(2, 0, 1) mask = np.zeros((label.shape[1], label.shape[2]), dtype=np.int32) for i in range(self.num_classes + 1): diff --git a/pangaea/datasets/utils.py b/pangaea/datasets/utils.py index 9b28cd88..680263b9 100644 --- a/pangaea/datasets/utils.py +++ b/pangaea/datasets/utils.py @@ -1,6 +1,6 @@ import os import tqdm -import rasterio +import tifffile import pathlib import concurrent.futures from google.cloud.storage import Client @@ -83,14 +83,5 @@ def download_blob_file_pair(blob_file_pair): def read_tif(file: pathlib.Path): - with rasterio.open(file) as dataset: - arr = dataset.read() # (bands X height X width) - return arr.transpose((1, 2, 0)) - - -def read_tif_with_metadata(file: pathlib.Path): - with rasterio.open(file) as dataset: - arr = dataset.read() # (bands X height X width) - transform = dataset.transform - crs = dataset.crs - return arr.transpose((1, 2, 0)), transform, crs \ No newline at end of file + arr = tifffile.imread(file) + return arr.transpose(2, 0, 1) diff --git a/pangaea/utils/compute_norm_std.py b/pangaea/utils/compute_norm_std.py index d16d94a1..b5a5c1ea 100644 --- a/pangaea/utils/compute_norm_std.py +++ b/pangaea/utils/compute_norm_std.py @@ -1,7 +1,7 @@ import glob import numpy as np import os -import rasterio +import tifffile def compute_norm_std(split_file_path, data_root_path): @@ -26,9 +26,9 @@ def compute_norm_std(split_file_path, data_root_path): data_list = [] for img in path: - with rasterio.open(img) as src: - data = src.read() - data = np.nan_to_num(data) + data = tifffile.imread(img) + data = data.transpose(2, 0, 1) + data = np.nan_to_num(data) data = data.reshape((2, -1)) 
data_list.append(data) diff --git a/requirements.txt b/requirements.txt index 015715f5..58fb9a25 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ torch>=2.1.0 torchvision geopandas -rasterio pillow scikit-learn tensorboard