Update geo-inference parameters #579

Merged: 16 commits, Oct 11, 2024
8 changes: 8 additions & 0 deletions .dockerignore
@@ -0,0 +1,8 @@
tests
.github
.git
.pytest_cache
.vscode
__pycache__
*.md
docs/*
62 changes: 27 additions & 35 deletions Dockerfile
@@ -1,46 +1,38 @@
FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04

ARG CONDA_PYTHON_VERSION=3
ARG CONDA_DIR=/opt/conda
ARG USERNAME=gdl_user
ARG USERID=1000
ARG GIT_TAG=develop

ENV PATH=$CONDA_DIR/bin:$PATH
# NRCan certificate; uncomment (with the right .cer name) if you are building behind a firewall
#COPY NRCan-RootCA.cer /usr/local/share/ca-certificates/cert.crt
#RUN chmod 644 /usr/local/share/ca-certificates/cert.crt && update-ca-certificates
# COPY NRCan-RootCA.cer /usr/local/share/ca-certificates/cert.crt
# RUN chmod 644 /usr/local/share/ca-certificates/cert.crt && update-ca-certificates

RUN apt-get update \
&& apt-get install -y --no-install-recommends git wget unzip bzip2 build-essential sudo \
&& apt-key del 7fa2af80 \
&& wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \
&& sudo dpkg -i cuda-keyring_1.0-1_all.deb \
Collaborator:
Don't we need this cuda keyring stuff anymore?

Collaborator (Author):
Nope. Tested with a local Docker image, the CI test image, and on HPC. Everything works fine without these.
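For the record, a minimal sketch of the kind of local check described above, assuming the image is built from the repo root and the NVIDIA container toolkit is available (the tag is illustrative):

# Build the image, then confirm torch can see the GPU inside the container.
docker build -t gdl:test .
docker run --rm --gpus all gdl:test python -c "import torch; print(torch.cuda.is_available())"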

&& wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004-keyring.gpg \
&& sudo mv cuda-ubuntu2004-keyring.gpg /usr/share/keyrings/cuda-archive-keyring.gpg \
&& rm -f cuda-keyring_1.0-1_all.deb && rm -f /etc/apt/sources.list.d/cuda.list

# Install Mamba directly
ENV PATH $CONDA_DIR/bin:$PATH
RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh -O /tmp/mamba.sh && \
/bin/bash /tmp/mamba.sh -b -p $CONDA_DIR && \
rm -rf /tmp/* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

ENV LD_LIBRARY_PATH $CONDA_DIR/lib:$LD_LIBRARY_PATH

# Create the user
RUN useradd --create-home -s /bin/bash --no-user-group -u $USERID $USERNAME && \
chown $USERNAME $CONDA_DIR -R && \
adduser $USERNAME sudo && \
echo "$USERNAME ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

&& wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh -O /tmp/mamba.sh \
&& /bin/bash /tmp/mamba.sh -b -p $CONDA_DIR \
&& rm -rf /tmp/* \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& useradd --create-home -s /bin/bash --no-user-group -u $USERID $USERNAME \
&& chown $USERNAME $CONDA_DIR -R \
&& adduser $USERNAME sudo \
&& echo "$USERNAME ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

ENV LD_LIBRARY_PATH=$CONDA_DIR/lib:$LD_LIBRARY_PATH
USER $USERNAME
WORKDIR /home/$USERNAME/

RUN cd /home/$USERNAME && git clone --depth 1 "https://github.com/NRCan/geo-deep-learning.git" --branch $GIT_TAG
RUN conda config --set ssl_verify no
RUN mamba env create -f /home/$USERNAME/geo-deep-learning/environment.yml

ENV PATH $CONDA_DIR/envs/geo_deep_env/bin:$PATH
RUN echo "source activate geo_deep_env" > ~/.bashrc
WORKDIR /usr/app

COPY environment.yml /usr/app
RUN cd /home/$USERNAME && \
conda config --set ssl_verify no && \
mamba env create -f /usr/app/environment.yml && \
mamba clean --all \
&& pip uninstall -y pip

COPY . /usr/app/geo-deep-learning
ENV PATH=$CONDA_DIR/envs/geo_ml_env/bin:$PATH
RUN echo "source activate geo_ml_env" > ~/.bashrc
17 changes: 7 additions & 10 deletions config/inference/default_binary.yaml
@@ -5,21 +5,18 @@ inference:
input_stac_item: # alternatively, use a path or url to stac item directly
model_path: ${general.save_weights_dir}/
output_path:
checkpoint_dir: # (string, optional): directory in which to save the object if url
batch_size: 8
chunk_size: # if empty, will be calculated automatically from max_pix_per_mb_gpu
# Maximum number of pixels per MB of GPU RAM to allow. E.g. if the GPU has 1000 MB of RAM and this parameter is set to
# 10, chunk_size will be set to sqrt(1000 * 10) = 100.
max_pix_per_mb_gpu: 25
patch_size: 1024
workers: 0
prep_data_only: False
override_model_params: False
save_heatmap: True # saves a heatmap to {output_dir}/{output_name}_heatmap.tif
heatmap_threshold: 0.3

flip: False
rotate: True
num_classes: 2

# GPU parameters
gpu: ${training.num_gpus}
max_used_perc: ${training.max_used_perc} # If GPU's usage exceeds this percentage, it will be ignored
max_used_ram: ${training.max_used_ram} # If RAM usage of detected GPU exceeds this percentage, it will be ignored

# Post-processing
mask_to_vector: False # if True, a polygonized version of the inference (.gpkg) will be created with rasterio tools
ras2vec: False # if True, a polygonized version of the inference (.gpkg) will be created with rasterio tools
15 changes: 6 additions & 9 deletions config/inference/default_multiclass.yaml
@@ -5,21 +5,18 @@ inference:
input_stac_item: # alternatively, use a path or url to stac item directly
model_path: ${general.save_weights_dir}/
output_path:
checkpoint_dir: # (string, optional): directory in which to save the object if url
batch_size: 8
chunk_size: # if empty, will be calculated automatically from max_pix_per_mb_gpu
# Maximum number of pixels per MB of GPU RAM to allow. E.g. if the GPU has 1000 MB of RAM and this parameter is set to
# 10, chunk_size will be set to sqrt(1000 * 10) = 100.
max_pix_per_mb_gpu: 25
patch_size: 1024
workers: 0
prep_data_only: False
override_model_params: False
save_heatmap: True # saves a heatmap to {output_dir}/{output_name}_heatmap.tif
heatmap_threshold: 0.3
flip: False
rotate: True
num_classes: 5

# GPU parameters
gpu: ${training.num_gpus}
max_used_perc: ${training.max_used_perc} # If GPU's usage exceeds this percentage, it will be ignored
max_used_ram: ${training.max_used_ram} # If RAM usage of detected GPU exceeds this percentage, it will be ignored

# Post-processing
mask_to_vector: False # if True, a polygonized version of the inference (.gpkg) will be created with rasterio tools
ras2vec: False # if True, a polygonized version of the inference (.gpkg) will be created with rasterio tools
31 changes: 16 additions & 15 deletions docs/source/mode.rst
@@ -181,40 +181,41 @@ will be found in :ref:`configurationdefaultparam` under ``inference`` and this c
.. literalinclude:: ../../../config/inference/default_binary.yaml
:language: yaml

- ``raw_data_csv`` (str)
Path to the images csv.
- ``root_dir`` (str)
Directory where outputs and downloads will be written by default,
if ``checkpoint_dir`` or ``output_path`` are omitted.
if ``output_path`` is omitted.
- ``raw_data_csv`` (str)
Points to a csv containing paths to imagery for inference. If a ground truth is present in the 2nd column,
it will be ignored.
- ``input_stac_item`` (str)
A path or url to :ref:`stac item <datasetstacitem>` directly.
See stac item example for `Spacenet test data <https://datacube-stage.services.geo.ca/api/collections/spacenet-samples/items/SpaceNet_AOI_2_Las_Vegas-056155973080_01_P001-WV03>`_,
also contained in `test data <https://github.com/NRCan/geo-deep-learning/tree/develop/tests/data/spacenet.zip>`_.
- ``state_dict_path`` (str)
- ``model_path`` (str)
Path to checkpoint containing trained weights for a given neural network architecture.
- ``output_path`` (str, optional)
Complete path, including parent directories and full name with extension, where the output inference should
be saved. Defaults to ``root_dir/{aoi.aoi_id}_pred.tif`` (see :ref:`AOI documentation <dataset>`). The
``output_path`` parameter should only be used if a single inference is being performed. Otherwise, it
is recommended to set the ``root_dir`` and use the default output name.
- ``checkpoint_dir`` (str)
Directory in which to save the checkpoint file if the path is a url.
- ``chunk_size`` (int)
Size of chunk (in pixels) to use for inference iterations over input imagery. The chunk will
be square; therefore, set at ``512``, it will generate 512 x 512 patches.
- ``max_pix_per_mb_gpu`` (int)
If chunk_size is omitted, this defines a "*maximum number of pixels per MB of GPU RAM*" that should be
considered. E.g. if the GPU has 1000 MB of RAM and this parameter is set to 10, chunk_size will be set to
``sqrt(1000 * 10) = 100``. By default it is set to 25. This feature is based on a rule of thumb and
assumes some prior empirical testing. WIP.
- ``patch_size`` (int)
Size of patch (in pixels) to use for inference iterations over input imagery. The input patch will
be square; therefore, if set at ``512``, it will generate 512 x 512 patches.
- ``workers`` (int)
Number of workers used by the geo-inference library. Default is ``0``, i.e. the number of cores
available on the host, minus 1.
- ``prep_data_only`` (bool)
If True, the inference script will exit after preparation of input data.
If the checkpoint path is a url, the checkpoint will be downloaded; if imagery points to urls, it will be
downloaded; and if the input model expects imagery with :ref:`histogram equalization <datatiling>`, this
enhancement is applied and the equalized images are saved to disk.
- ``heatmap_threshold`` (float)
Prediction probability threshold (fraction of 1) to use. Default is ``0.3``.
- ``flip`` (bool)
If True, perform horizontal and vertical flips during inference.
- ``rotate`` (bool)
If True, perform a 90-degree rotation at inference.

- ``gpu`` (int)
Number of GPUs to use at inference.
- ``max_used_perc`` (int)
@@ -224,7 +225,7 @@ will be found in :ref:`configurationdefaultparam` under ``inference`` and this c
- ``max_used_ram`` (int)
If RAM usage of detected GPU exceeds this percentage, it will be ignored.
- ``ras2vec`` (bool)
If True, a polygonized version of the inference ``.gpkg`` will be created with rasterio tools.
If True, a polygonized version of the inference ``.geojson`` will be created with rasterio tools.

.. note::

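As an aside on the parameters documented above: they map directly onto hydra overrides. A minimal sketch, assuming the usual GDL.py entry point (the model path and values are illustrative placeholders):

# Hypothetical invocation exercising the renamed inference parameters.
python GDL.py mode=inference inference.model_path=weights/model.pt inference.patch_size=1024 inference.workers=0 inference.rotate=True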
21 changes: 16 additions & 5 deletions environment.yml
@@ -1,16 +1,27 @@
name: geo_deep_env
name: geo_ml_env
channels:
- pytorch
- nvidia
- conda-forge
dependencies:
- python==3.11.5
- coverage>=6.3.1
- geopandas>=0.14.4
- hydra-core>=1.2.0
- pip
- gdal
- pystac>=0.3.0
- pynvml>=11.0
- pystac>=1.10.1
- pytest>=7.1
- python>=3.11
- pytorch>=2.3
- pytorch-cuda>=12.1
Collaborator:
Why do we need pytorch and pytorch-cuda at the same time? pytorch-cuda can work on CPU if needed. Won't having two different versions cause conflicts?

Collaborator (Author):
It is the recommended way to install PyTorch in a conda env: https://pytorch.org/
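For reference, the selector there currently generates something along these lines for a CUDA 12 conda setup (versions are illustrative):

# pytorch is the actual package; pytorch-cuda is a metapackage that pins
# CUDA-enabled builds rather than a second copy of pytorch, so the two don't conflict.
conda install pytorch torchvision pytorch-cuda=12.1 -c pytorch -c nvidia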

- rich>=11.1
- ruamel_yaml>=0.15
- scikit-image>=0.18
- torchgeo>=0.5.2
- torchvision>=0.13
- pip:
- geo-inference>=2.0.7
- git+https://github.com/NRCan/geo-inference.git
- hydra-colorlog>=1.1.0
- hydra-optuna-sweeper>=1.1.0
- ttach>=0.0.3
- mlflow>=1.2 # causes env solving to hang if not with pip
75 changes: 40 additions & 35 deletions inference_segmentation.py
@@ -1,15 +1,16 @@
import csv
from math import sqrt
import rasterio

Collaborator:
It should look like this:

from shutil import move
from pathlib import Path
from numbers import Number
from tempfile import mkstemp
from typing import Dict, Sequence, Union

from tqdm import tqdm
import csv
import rasterio
from omegaconf import DictConfig

from utils.aoiutils import aois_from_csv
from dataset.stacitem import SingleBandItemEO
from utils.logger import get_logger, set_tracker
from geo_inference.geo_inference import GeoInference
from utils.utils import get_device_ids, get_key_def, set_device

but I think that PEP8 refactoring can be done later in a separate PR.
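If that follow-up PR happens, a mechanical pass with an import sorter would get most of the way there; a sketch (the tool choice is a suggestion, not part of this PR):

# isort groups imports into stdlib, third-party, and local blocks per PEP8.
pip install isort
isort inference_segmentation.py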

from tqdm import tqdm
from shutil import move
from pathlib import Path
from numbers import Number
from tempfile import mkstemp
from omegaconf import DictConfig
from typing import Dict, Sequence, Union
from dataset.stacitem import SingleBandItemEO


from utils.aoiutils import aois_from_csv
from dataset.stacitem import SingleBandItemEO
from utils.logger import get_logger, set_tracker
from geo_inference.geo_inference import GeoInference
from utils.utils import get_device_ids, get_key_def, set_device
@@ -24,24 +25,6 @@ def stac_input_to_temp_csv(input_stac_item: Union[str, Path]) -> Path:
csv.writer(fh).writerow([str(input_stac_item), None, "inference", Path(input_stac_item).stem])
return Path(stac_temp_csv)

def calc_inference_chunk_size(gpu_devices_dict: dict, max_pix_per_mb_gpu: int = 200, default: int = 512) -> int:
"""
Calculate maximum chunk_size that could fit on GPU during inference based on thumb rule with hardcoded
"pixels per MB of GPU RAM" as threshold. Threshold based on inference with a large model (Deeplabv3_resnet101)
:param gpu_devices_dict: dictionary containing info on GPU devices as returned by lst_device_ids (utils.py)
:param max_pix_per_mb_gpu: Maximum number of pixels that can fit on each MB of GPU (better to underestimate)
:return: returns a downgraded evaluation batch size if the original batch size is considered too high
"""
if not gpu_devices_dict:
return default
# get max ram for smallest gpu
smallest_gpu_ram = min(gpu_info['max_ram'] for _, gpu_info in gpu_devices_dict.items())
# rule of thumb to determine max chunk size based on approximate max pixels a gpu can handle during inference
max_chunk_size = sqrt(max_pix_per_mb_gpu * smallest_gpu_ram)
max_chunk_size_rd = int(max_chunk_size - (max_chunk_size % 256)) # round to the closest multiple of 256
logging.info(f'Data will be split into chunks of {max_chunk_size_rd} if chunk_size is not specified.')
return max_chunk_size_rd


def main(params:Union[DictConfig, Dict]):

@@ -51,9 +34,10 @@ def main(params:Union[DictConfig, Dict]):
params['inference'],
to_path=True,
validate_path_exists=True,
wildcard='*.pt')
mask_to_vector = get_key_def('mask_to_vector', params['inference'], default=False, expected_type=bool)
wildcard='*pt')

prep_data_only = get_key_def('prep_data_only', params['inference'], default=False, expected_type=bool)

# Set the device
num_devices = get_key_def('gpu', params['inference'], default=0, expected_type=(int, bool))
if num_devices > 1:
@@ -64,25 +48,27 @@ def main(params:Union[DictConfig, Dict]):
raise ValueError(f'\nMax used ram parameter should be a percentage. Got {max_used_ram}.')
max_used_perc = get_key_def('max_used_perc', params['inference'], default=25, expected_type=int)
gpu_devices_dict = get_device_ids(num_devices, max_used_ram_perc=max_used_ram, max_used_perc=max_used_perc)
max_pix_per_mb_gpu = get_key_def('max_pix_per_mb_gpu', params['inference'], default=25, expected_type=int)
auto_chunk_size = calc_inference_chunk_size(gpu_devices_dict=gpu_devices_dict,
max_pix_per_mb_gpu=max_pix_per_mb_gpu, default=512)


chunk_size = get_key_def('chunk_size', params['inference'], default=auto_chunk_size, expected_type=int)
batch_size = get_key_def('batch_size', params['inference'], default=8, expected_type=int)
patch_size = get_key_def('patch_size', params['inference'], default=1024, expected_type=int)
workers = get_key_def('workers', params['inference'], default=0, expected_type=int)
prediction_threshold = get_key_def('prediction_threshold', params['inference'], default=0.3, expected_type=float)
device = set_device(gpu_devices_dict=gpu_devices_dict)


# Dataset params
bands_requested = get_key_def('bands', params['dataset'], default=[1, 2, 3], expected_type=Sequence)
classes_dict = get_key_def('classes_dict', params['dataset'], expected_type=DictConfig)
download_data = get_key_def('download_data', params['inference'], default=False, expected_type=bool)
data_dir = get_key_def('raw_data_dir', params['dataset'], default="data", to_path=True, validate_path_exists=True)
clahe_clip_limit = get_key_def('clahe_clip_limit', params['tiling'], expected_type=Number, default=0)
raw_data_csv = get_key_def('raw_data_csv', params['inference'], expected_type=str, to_path=True,
validate_path_exists=True)
input_stac_item = get_key_def('input_stac_item', params['inference'], expected_type=str, to_path=True,
validate_path_exists=True)
num_classes = get_key_def('num_classes', params['inference'], expected_type=int, default=5)
vectorize = get_key_def('ras2vec', params['inference'], expected_type=bool, default=False)
transform_flip = get_key_def('flip', params['inference'], expected_type=bool, default=False)
transform_rotate = get_key_def('rotate', params['inference'], expected_type=bool, default=False)
transforms = True if transform_flip or transform_rotate else False

if raw_data_csv and input_stac_item:
raise ValueError(f"Input imagery should be either a csv of stac item. Got inputs from both \"raw_data_csv\" "
@@ -109,22 +95,41 @@ def main(params:Union[DictConfig, Dict]):
data_dir=data_dir,
equalize_clahe_clip_limit=clahe_clip_limit,
)

if prep_data_only:
logging.info(f"[prep_data_only mode] Data preparation for inference is complete. Exiting...")
exit()

# Create the inference object
device_str = "gpu" if device.type == 'cuda' else "cpu"
gpu_index = device.index if device.type == 'cuda' else 0

geo_inference = GeoInference(model=str(model_path),
work_dir=str(working_folder),
batch_size=batch_size,
mask_to_vec=mask_to_vector,
mask_to_vec=vectorize,
device=device_str,
gpu_id=gpu_index,
num_classes=num_classes,
prediction_threshold=prediction_threshold,
transformers=transforms,
transformer_flip=transform_flip,
transformer_rotate=transform_rotate,
)

# LOOP THROUGH LIST OF INPUT IMAGES
for aoi in tqdm(list_aois, desc='Inferring from images', position=0, leave=True):
logging.info(f'\nReading image: {aoi.aoi_id}')
raster = aoi.raster
geo_inference(raster, tiff_name=aoi.aoi_id, patch_size=chunk_size)

input_path = str(aoi.raster.name)
mask_name = geo_inference(input_path, patch_size=patch_size, workers=workers)
mask_path = working_folder / mask_name

# update metadata info and rename mask tif.
if classes_dict is not None:
meta_data_dict = {"checkpoint": str(model_path),
"classes_dict": classes_dict}
with rasterio.open(mask_path, 'r+') as raster:
raster.update_tags(**meta_data_dict)
output_path = get_key_def('output_path', params['inference'], expected_type=str, to_path=True,
default=mask_path)
move(mask_path, output_path)
logging.info(f"finished inferring image: {aoi.aoi_id} ")