Update geo-inference parameters #579
Changes from 10 commits
@@ -0,0 +1,8 @@
tests
.github
.git
.pytest_cache
.vscode
__pycache__
*.md
docs/*
@@ -1,46 +1,38 @@
FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

ARG CONDA_PYTHON_VERSION=3
ARG CONDA_DIR=/opt/conda
ARG USERNAME=gdl_user
ARG USERID=1000
ARG GIT_TAG=develop

ENV PATH=$CONDA_DIR/bin:$PATH
# RNCAN certificate; uncomment (with right .cer name) if you are building behind a FW
#COPY NRCan-RootCA.cer /usr/local/share/ca-certificates/cert.crt
#RUN chmod 644 /usr/local/share/ca-certificates/cert.crt && update-ca-certificates
# COPY NRCan-RootCA.cer /usr/local/share/ca-certificates/cert.crt
# RUN chmod 644 /usr/local/share/ca-certificates/cert.crt && update-ca-certificates

RUN apt-get update \
    && apt-get install -y --no-install-recommends git wget unzip bzip2 build-essential sudo \
    && apt-key del 7fa2af80 \
    && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \
    && sudo dpkg -i cuda-keyring_1.0-1_all.deb \
    && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004-keyring.gpg \
    && sudo mv cuda-ubuntu2004-keyring.gpg /usr/share/keyrings/cuda-archive-keyring.gpg \
    && rm -f cuda-keyring_1.0-1_all.deb && rm -f /etc/apt/sources.list.d/cuda.list

# Install Mamba directly
ENV PATH $CONDA_DIR/bin:$PATH
RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh -O /tmp/mamba.sh && \
    /bin/bash /tmp/mamba.sh -b -p $CONDA_DIR && \
    rm -rf /tmp/* && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

ENV LD_LIBRARY_PATH $CONDA_DIR/lib:$LD_LIBRARY_PATH

# Create the user
RUN useradd --create-home -s /bin/bash --no-user-group -u $USERID $USERNAME && \
    chown $USERNAME $CONDA_DIR -R && \
    adduser $USERNAME sudo && \
    echo "$USERNAME ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

    && wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh -O /tmp/mamba.sh \
    && /bin/bash /tmp/mamba.sh -b -p $CONDA_DIR \
    && rm -rf /tmp/* \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && useradd --create-home -s /bin/bash --no-user-group -u $USERID $USERNAME \
    && chown $USERNAME $CONDA_DIR -R \
    && adduser $USERNAME sudo \
    && echo "$USERNAME ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

ENV LD_LIBRARY_PATH=$CONDA_DIR/lib:$LD_LIBRARY_PATH
USER $USERNAME
WORKDIR /home/$USERNAME/

RUN cd /home/$USERNAME && git clone --depth 1 "https://github.com/NRCan/geo-deep-learning.git" --branch $GIT_TAG
RUN conda config --set ssl_verify no
RUN mamba env create -f /home/$USERNAME/geo-deep-learning/environment.yml

ENV PATH $CONDA_DIR/envs/geo_deep_env/bin:$PATH
RUN echo "source activate geo_deep_env" > ~/.bashrc
WORKDIR /usr/app

COPY environment.yml /usr/app
RUN cd /home/$USERNAME && \
    conda config --set ssl_verify no && \
    mamba env create -f /usr/app/environment.yml && \
    mamba clean --all \
    && pip uninstall -y pip

COPY . /usr/app/geo-deep-learning
ENV PATH=$CONDA_DIR/envs/geo_ml_env/bin:$PATH
RUN echo "source activate geo_ml_env" > ~/.bashrc
Review comment: Thanks for keeping the doc alive.
@@ -1,16 +1,27 @@
name: geo_deep_env
name: geo_ml_env
channels:
  - pytorch
  - nvidia
  - conda-forge
dependencies:
  - python==3.11.5
  - coverage>=6.3.1
  - geopandas>=0.14.4
  - hydra-core>=1.2.0
  - pip
  - gdal
  - pystac>=0.3.0
  - pynvml>=11.0
  - pystac>=1.10.1
  - pytest>=7.1
  - python>=3.11
  - pytorch>=2.3
Review comment: Why do we need pytorch and pytorch-cuda at the same time? pytorch-cuda can work on CPU if needed. Won't having two different versions cause conflicts?
Reply: It is the recommended way to install PyTorch in a conda environment: https://pytorch.org/ (see the sketch after this file's diff).
  - pytorch-cuda>=12.1
  - rich>=11.1
  - ruamel_yaml>=0.15
  - scikit-image>=0.18
  - torchgeo>=0.5.2
  - torchvision>=0.13
  - pip:
    - geo-inference>=2.0.7
    - git+https://github.com/NRCan/geo-inference.git
    - hydra-colorlog>=1.1.0
    - hydra-optuna-sweeper>=1.1.0
    - ttach>=0.0.3
    - mlflow>=1.2  # causes env solving to hang if not with pip
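Regarding the pytorch / pytorch-cuda question above: only one PyTorch build ends up in the solved environment (pytorch-cuda constrains the CUDA variant rather than installing a second copy), and it simply reports that no CUDA device is available on CPU-only machines. A minimal sanity check, assuming the geo_ml_env environment defined above has been created and activated:

import torch

# Report which PyTorch build conda solved and whether a CUDA device is usable.
print(f"PyTorch build: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")  # False on CPU-only hosts; the env still imports and runs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Selected device: {device}")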
@@ -1,12 +1,14 @@
import os
import csv
from math import sqrt
from tqdm import tqdm
from pathlib import Path
from numbers import Number
from tempfile import mkstemp
from omegaconf import DictConfig
from typing import Dict, Sequence, Union
from dataset.stacitem import SingleBandItemEO
import rasterio
from shutil import move
Review comment: Can we start organizing module loading according to PEP 8? That is, standard-library imports in the first block, external packages in the second, and locally written modules in the third (a grouping sketch is included after this file's diff).
Reply: Changed it in this script.
from utils.aoiutils import aois_from_csv
@@ -24,24 +26,6 @@ def stac_input_to_temp_csv(input_stac_item: Union[str, Path]) -> Path:
    csv.writer(fh).writerow([str(input_stac_item), None, "inference", Path(input_stac_item).stem])
    return Path(stac_temp_csv)

def calc_inference_chunk_size(gpu_devices_dict: dict, max_pix_per_mb_gpu: int = 200, default: int = 512) -> int:
    """
    Calculate maximum chunk_size that could fit on GPU during inference based on thumb rule with hardcoded
    "pixels per MB of GPU RAM" as threshold. Threshold based on inference with a large model (Deeplabv3_resnet101)
    :param gpu_devices_dict: dictionary containing info on GPU devices as returned by lst_device_ids (utils.py)
    :param max_pix_per_mb_gpu: Maximum number of pixels that can fit on each MB of GPU (better to underestimate)
    :return: returns a downgraded evaluation batch size if the original batch size is considered too high
    """
    if not gpu_devices_dict:
        return default
    # get max ram for smallest gpu
    smallest_gpu_ram = min(gpu_info['max_ram'] for _, gpu_info in gpu_devices_dict.items())
    # rule of thumb to determine max chunk size based on approximate max pixels a gpu can handle during inference
    max_chunk_size = sqrt(max_pix_per_mb_gpu * smallest_gpu_ram)
    max_chunk_size_rd = int(max_chunk_size - (max_chunk_size % 256))  # round to the closest multiple of 256
    logging.info(f'Data will be split into chunks of {max_chunk_size_rd} if chunk_size is not specified.')
    return max_chunk_size_rd
def main(params:Union[DictConfig, Dict]):
@@ -51,9 +35,10 @@ def main(params:Union[DictConfig, Dict]):
                             params['inference'],
                             to_path=True,
                             validate_path_exists=True,
                             wildcard='*.pt')
    mask_to_vector = get_key_def('mask_to_vector', params['inference'], default=False, expected_type=bool)
                             wildcard='*pt')

    prep_data_only = get_key_def('prep_data_only', params['inference'], default=False, expected_type=bool)

    # Set the device
    num_devices = get_key_def('gpu', params['inference'], default=0, expected_type=(int, bool))
    if num_devices > 1:
@@ -64,25 +49,23 @@
        raise ValueError(f'\nMax used ram parameter should be a percentage. Got {max_used_ram}.')
    max_used_perc = get_key_def('max_used_perc', params['inference'], default=25, expected_type=int)
    gpu_devices_dict = get_device_ids(num_devices, max_used_ram_perc=max_used_ram, max_used_perc=max_used_perc)
    max_pix_per_mb_gpu = get_key_def('max_pix_per_mb_gpu', params['inference'], default=25, expected_type=int)
    auto_chunk_size = calc_inference_chunk_size(gpu_devices_dict=gpu_devices_dict,
                                                max_pix_per_mb_gpu=max_pix_per_mb_gpu, default=512)

    chunk_size = get_key_def('chunk_size', params['inference'], default=auto_chunk_size, expected_type=int)
    batch_size = get_key_def('batch_size', params['inference'], default=8, expected_type=int)
    patch_size = get_key_def('patch_size', params['inference'], default=1024, expected_type=int)
    workers = get_key_def('workers', params['inference'], default=0, expected_type=int)
    prediction_threshold = get_key_def('prediction_threshold', params['inference'], default=0.3, expected_type=float)
    device = set_device(gpu_devices_dict=gpu_devices_dict)

    # Dataset params
    bands_requested = get_key_def('bands', params['dataset'], default=[1, 2, 3], expected_type=Sequence)
    classes_dict = get_key_def('classes_dict', params['dataset'], expected_type=DictConfig)
    download_data = get_key_def('download_data', params['inference'], default=False, expected_type=bool)
    data_dir = get_key_def('raw_data_dir', params['dataset'], default="data", to_path=True, validate_path_exists=True)
    clahe_clip_limit = get_key_def('clahe_clip_limit', params['tiling'], expected_type=Number, default=0)
    raw_data_csv = get_key_def('raw_data_csv', params['inference'], expected_type=str, to_path=True,
                               validate_path_exists=True)
    input_stac_item = get_key_def('input_stac_item', params['inference'], expected_type=str, to_path=True,
                                  validate_path_exists=True)
    vectorize = get_key_def('ras2vec', params['inference'], expected_type=bool, default=False)

    if raw_data_csv and input_stac_item:
        raise ValueError(f"Input imagery should be either a csv of stac item. Got inputs from both \"raw_data_csv\" "
@@ -109,22 +92,37 @@
                         data_dir=data_dir,
                         equalize_clahe_clip_limit=clahe_clip_limit,
                         )

    if prep_data_only:
        logging.info(f"[prep_data_only mode] Data preparation for inference is complete. Exiting...")
        exit()

    # Create the inference object
    device_str = "gpu" if device.type == 'cuda' else "cpu"
    gpu_index = device.index if device.type == 'cuda' else 0

    geo_inference = GeoInference(model=str(model_path),
                                 work_dir=str(working_folder),
                                 batch_size=batch_size,
                                 mask_to_vec=mask_to_vector,
                                 mask_to_vec=vectorize,
                                 device=device_str,
                                 gpu_id=gpu_index,
                                 prediction_threshold=prediction_threshold,
                                 )

    # LOOP THROUGH LIST OF INPUT IMAGES
    for aoi in tqdm(list_aois, desc='Inferring from images', position=0, leave=True):
        logging.info(f'\nReading image: {aoi.aoi_id}')
        raster = aoi.raster
        geo_inference(raster, tiff_name=aoi.aoi_id, patch_size=chunk_size)

        input_path = str(aoi.raster.name)
        mask_name = geo_inference(input_path, patch_size=patch_size, workers=workers)
        mask_path = working_folder / mask_name

        # update metadata info and rename mask tif.
        if classes_dict is not None:
            meta_data_dict = {"checkpoint": str(model_path),
                              "classes_dict": classes_dict}
            with rasterio.open(mask_path, 'r+') as raster:
                raster.update_tags(**meta_data_dict)
        output_path = get_key_def('output_path', params['inference'], expected_type=str, to_path=True,
                                  default=mask_path)
        move(mask_path, output_path)
        logging.info(f"finished inferring image: {aoi.aoi_id} ")
Review comment: Don't we need this CUDA keyring stuff anymore?
Reply: Nope. Tested with a local Docker image, the CI test image, and on HPC. All works fine without these.