From bff355bcc4f0e82ca48bd5043cd99bbe60792650 Mon Sep 17 00:00:00 2001
From: Xuehai Pan
Date: Sun, 17 Dec 2023 19:13:19 +0800
Subject: [PATCH] fix(callbacks/lightning): populate callback for `lightning` (#114)

---
 .pre-commit-config.yaml               |   6 +-
 CHANGELOG.md                          |   1 +
 README.md                             |   6 +-
 docs/requirements.txt                 |   1 +
 docs/source/callbacks.rst             |   8 ++
 docs/source/spelling_wordlist.txt     |   1 +
 nvitop/callbacks/keras.py             |   7 +-
 nvitop/callbacks/lightning.py         | 170 +++++++++++++++++++++++++-
 nvitop/callbacks/pytorch_lightning.py |  20 ++-
 9 files changed, 203 insertions(+), 17 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 17a08d79..48e6d91b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -25,16 +25,16 @@ repos:
       - id: debug-statements
       - id: double-quote-string-fixer
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.7
+    rev: v0.1.8
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/PyCQA/isort
-    rev: 5.13.0
+    rev: 5.13.2
    hooks:
      - id: isort
   - repo: https://github.com/psf/black
-    rev: 23.11.0
+    rev: 23.12.0
     hooks:
       - id: black
   - repo: https://github.com/asottile/pyupgrade
diff --git a/CHANGELOG.md b/CHANGELOG.md
index dfcca99b..fd0758ec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Add separate implementation for `GpuStatsLogger` callback for `lightning` by [@XuehaiPan](https://github.com/XuehaiPan) in [#114](https://github.com/XuehaiPan/nvitop/pull/114).
 - Remove metrics if process is gone in `nvitop-exporter` by [@XuehaiPan](https://github.com/XuehaiPan) in [#107](https://github.com/XuehaiPan/nvitop/pull/107).
 
 ### Changed
diff --git a/README.md b/README.md
index 1622f2c0..251d2987 100644
--- a/README.md
+++ b/README.md
@@ -577,11 +577,11 @@ model.fit(.., callbacks=[gpu_stats, tb_callback])
 
 **NOTE:** Users should assign a `keras.callbacks.TensorBoard` callback or a `keras.callbacks.CSVLogger` callback to the model. And the `GpuStatsLogger` callback should be placed before the `keras.callbacks.TensorBoard` / `keras.callbacks.CSVLogger` callback.
 
-#### Callback for [PyTorch Lightning](https://pytorchlightning.ai)
+#### Callback for [PyTorch Lightning](https://lightning.ai)
 
 ```python
-from pytorch_lightning import Trainer
-from nvitop.callbacks.pytorch_lightning import GpuStatsLogger
+from lightning.pytorch import Trainer
+from nvitop.callbacks.lightning import GpuStatsLogger
 gpu_stats = GpuStatsLogger()
 trainer = Trainer(gpus=[..], logger=True, callbacks=[gpu_stats])
 ```
diff --git a/docs/requirements.txt b/docs/requirements.txt
index f0061ab7..8a1cee6e 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -4,5 +4,6 @@
 sphinx-autobuild
 sphinx-copybutton
 sphinx-rtd-theme
+lightning >= 2.0.0, < 3.0.0a0
 pytorch-lightning >= 1.5.0, < 2.0.0a0
 tensorflow-cpu >= 2.0.0, < 2.12.0a0
diff --git a/docs/source/callbacks.rst b/docs/source/callbacks.rst
index b0dae221..a55fc336 100644
--- a/docs/source/callbacks.rst
+++ b/docs/source/callbacks.rst
@@ -12,6 +12,14 @@ nvitop.callbacks.keras module
     :undoc-members:
     :show-inheritance:
 
+nvitop.callbacks.lightning module
+---------------------------------
+
+.. automodule:: nvitop.callbacks.lightning
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 nvitop.callbacks.pytorch\_lightning module
 ------------------------------------------
 
diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt
index e87a36d4..af248a46 100644
--- a/docs/source/spelling_wordlist.txt
+++ b/docs/source/spelling_wordlist.txt
@@ -153,3 +153,4 @@ ThroughputInfo
 pytorch
 api
 utils
+GpuStatsLogger
diff --git a/nvitop/callbacks/keras.py b/nvitop/callbacks/keras.py
index 3ac0855c..f4e750f3 100644
--- a/nvitop/callbacks/keras.py
+++ b/nvitop/callbacks/keras.py
@@ -23,15 +23,14 @@
 import re
 import time
 
-from tensorflow.python.keras.callbacks import (  # pylint: disable=import-error,no-name-in-module
-    Callback,
-)
+# pylint: disable-next=import-error,no-name-in-module
+from tensorflow.python.keras.callbacks import Callback
 
 from nvitop.api import libnvml
 from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats
 
 
-# Ported version of .pytorch_lightning.GpuStatsLogger for Keras
+# Ported version of nvitop.callbacks.lightning.GpuStatsLogger for Keras
 class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
     """Automatically log GPU stats during training stage. :class:`GpuStatsLogger` is a callback and
     in order to use it you need to assign a TensorBoard callback or a CSVLogger callback to the
     model.
diff --git a/nvitop/callbacks/lightning.py b/nvitop/callbacks/lightning.py
index 6c9dbc58..b2bf7f36 100644
--- a/nvitop/callbacks/lightning.py
+++ b/nvitop/callbacks/lightning.py
@@ -15,7 +15,171 @@
 # limitations under the License.
 # ==============================================================================
 
-# pylint: disable=missing-module-docstring
+# pylint: disable=missing-module-docstring,missing-function-docstring
+# pylint: disable=unused-argument,attribute-defined-outside-init
 
-# pylint: disable-next=unused-import
-from nvitop.callbacks.pytorch_lightning import GpuStatsLogger  # noqa: F401
+from __future__ import annotations
+
+import time
+from typing import Any
+
+import lightning.pytorch as pl  # pylint: disable=import-error
+from lightning.pytorch.callbacks import Callback  # pylint: disable=import-error
+from lightning.pytorch.utilities import rank_zero_only  # pylint: disable=import-error
+from lightning.pytorch.utilities.exceptions import (  # pylint: disable=import-error
+    MisconfigurationException,
+)
+
+from nvitop.api import libnvml
+from nvitop.callbacks.utils import get_devices_by_logical_ids, get_gpu_stats
+
+
+# Modified from pytorch_lightning.callbacks.GPUStatsMonitor
+class GpuStatsLogger(Callback):  # pylint: disable=too-many-instance-attributes
+    """Automatically log GPU stats during the training stage. :class:`GpuStatsLogger` is a
+    callback; to use it, you need to assign a logger to the ``Trainer``.
+
+    Args:
+        memory_utilization (bool):
+            Set to :data:`True` to log the used and free memory, and the percentage of memory
+            utilization at the start and end of each step. Default: :data:`True`.
+        gpu_utilization (bool):
+            Set to :data:`True` to log the percentage of GPU utilization at the start and end of
+            each step. Default: :data:`True`.
+        intra_step_time (bool):
+            Set to :data:`True` to log the time of each step. Default: :data:`False`.
+        inter_step_time (bool):
+            Set to :data:`True` to log the time between the end of one step and the start of the
+            next step. Default: :data:`False`.
+        fan_speed (bool):
+            Set to :data:`True` to log the percentage of fan speed. Default: :data:`False`.
+        temperature (bool):
+            Set to :data:`True` to log the GPU temperature in degrees Celsius.
+            Default: :data:`False`.
+
+    Raises:
+        MisconfigurationException:
+            If the NVIDIA driver is not installed, the training is not running on GPUs, or the
+            ``Trainer`` has no logger.
+
+    Examples:
+        >>> from lightning.pytorch import Trainer
+        >>> from nvitop.callbacks.lightning import GpuStatsLogger
+        >>> gpu_stats = GpuStatsLogger()
+        >>> trainer = Trainer(devices=[..], accelerator='gpu', logger=True, callbacks=[gpu_stats])
+
+    GPU stats are mainly based on NVML queries. The description of the queries is as follows:
+
+    - **fan.speed** - The fan speed value is the percent of maximum speed that the device's fan is
+      currently intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the
+      intended fan speed. If the fan is physically blocked and unable to spin, this output will not
+      match the actual fan speed. Many parts do not report fan speeds because they rely on cooling
+      via fans in the surrounding enclosure.
+    - **memory.used** - Total memory allocated by active contexts, in MiBs.
+    - **memory.free** - Total free memory, in MiBs.
+    - **utilization.gpu** - Percent of time over the past sample period during which one or more
+      kernels was executing on the GPU. The sample period may be between 1 second and 1/6 second
+      depending on the product.
+    - **utilization.memory** - Percent of time over the past sample period during which global
+      (device) memory was being read or written. The sample period may be between 1 second and 1/6
+      second depending on the product.
+    - **temperature** - Core GPU temperature, in degrees C.
+    """
+
+    def __init__(  # pylint: disable=too-many-arguments
+        self,
+        memory_utilization: bool = True,
+        gpu_utilization: bool = True,
+        intra_step_time: bool = False,
+        inter_step_time: bool = False,
+        fan_speed: bool = False,
+        temperature: bool = False,
+    ) -> None:
+        super().__init__()
+
+        try:
+            libnvml.nvmlInit()
+        except libnvml.NVMLError as ex:
+            raise MisconfigurationException(
+                'Cannot use GpuStatsLogger callback because NVIDIA driver is not installed.',
+            ) from ex
+
+        self._memory_utilization = memory_utilization
+        self._gpu_utilization = gpu_utilization
+        self._intra_step_time = intra_step_time
+        self._inter_step_time = inter_step_time
+        self._fan_speed = fan_speed
+        self._temperature = temperature
+
+    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
+        if not trainer.logger:
+            raise MisconfigurationException(
+                'Cannot use GpuStatsLogger callback with Trainer that has no logger.',
+            )
+
+        if trainer.strategy.root_device.type != 'cuda':
+            raise MisconfigurationException(
+                f'You are using GpuStatsLogger but are not running on GPU. '
+                f'The root device type is {trainer.strategy.root_device.type}.',
+            )
+
+        device_ids = trainer.device_ids
+
+        try:
+            self._devices = get_devices_by_logical_ids(device_ids, unique=True)
+        except (libnvml.NVMLError, RuntimeError) as ex:
+            raise ValueError(
+                f'Cannot use GpuStatsLogger callback because devices unavailable. '
+                f'Received: `gpus={device_ids}`',
+            ) from ex
+
+    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
+        self._snap_intra_step_time = None
+        self._snap_inter_step_time = None
+
+    @rank_zero_only
+    def on_train_batch_start(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        **kwargs: Any,
+    ) -> None:
+        if self._intra_step_time:
+            self._snap_intra_step_time = time.monotonic()
+
+        logs = self._get_gpu_stats()
+
+        if self._inter_step_time and self._snap_inter_step_time:
+            # First log at beginning of second step
+            logs['batch_time/inter_step (ms)'] = 1000.0 * (
+                time.monotonic() - self._snap_inter_step_time
+            )
+
+        trainer.logger.log_metrics(logs, step=trainer.global_step)
+
+    @rank_zero_only
+    def on_train_batch_end(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        **kwargs: Any,
+    ) -> None:
+        if self._inter_step_time:
+            self._snap_inter_step_time = time.monotonic()
+
+        logs = self._get_gpu_stats()
+
+        if self._intra_step_time and self._snap_intra_step_time:
+            logs['batch_time/intra_step (ms)'] = 1000.0 * (
+                time.monotonic() - self._snap_intra_step_time
+            )
+
+        trainer.logger.log_metrics(logs, step=trainer.global_step)
+
+    def _get_gpu_stats(self) -> dict[str, float]:
+        """Get the GPU stats from NVML queries."""
+        return get_gpu_stats(
+            devices=self._devices,
+            memory_utilization=self._memory_utilization,
+            gpu_utilization=self._gpu_utilization,
+            fan_speed=self._fan_speed,
+            temperature=self._temperature,
+        )
diff --git a/nvitop/callbacks/pytorch_lightning.py b/nvitop/callbacks/pytorch_lightning.py
index 8af3f564..3e64d849 100644
--- a/nvitop/callbacks/pytorch_lightning.py
+++ b/nvitop/callbacks/pytorch_lightning.py
@@ -21,7 +21,9 @@
 from __future__ import annotations
 
 import time
+from typing import Any
 
+import pytorch_lightning as pl  # pylint: disable=import-error
 from pytorch_lightning.callbacks import Callback  # pylint: disable=import-error
 from pytorch_lightning.utilities import rank_zero_only  # pylint: disable=import-error
 from pytorch_lightning.utilities.exceptions import (  # pylint: disable=import-error
     MisconfigurationException,
 )
@@ -107,7 +109,7 @@ def __init__(  # pylint: disable=too-many-arguments
         self._fan_speed = fan_speed
         self._temperature = temperature
 
-    def on_train_start(self, trainer, pl_module) -> None:
+    def on_train_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
         if not trainer.logger:
             raise MisconfigurationException(
                 'Cannot use GpuStatsLogger callback with Trainer that has no logger.',
@@ -132,12 +134,17 @@ def on_train_start(self, trainer, pl_module) -> None:
                 f'Received: `gpus={device_ids}`',
             ) from ex
 
-    def on_train_epoch_start(self, trainer, pl_module) -> None:
+    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
         self._snap_intra_step_time = None
         self._snap_inter_step_time = None
 
     @rank_zero_only
-    def on_train_batch_start(self, trainer, **kwargs) -> None:  # pylint: disable=arguments-differ
+    def on_train_batch_start(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        **kwargs: Any,
+    ) -> None:
         if self._intra_step_time:
             self._snap_intra_step_time = time.monotonic()
 
@@ -152,7 +159,12 @@ def on_train_batch_start(self, trainer, **kwargs) -> None:  # pylint: disable=ar
         trainer.logger.log_metrics(logs, step=trainer.global_step)
 
     @rank_zero_only
-    def on_train_batch_end(self, trainer, **kwargs) -> None:  # pylint: disable=arguments-differ
+    def on_train_batch_end(  # pylint: disable=arguments-differ
+        self,
+        trainer: pl.Trainer,
+        pl_module: pl.LightningModule,
+        **kwargs: Any,
+    ) -> None:
         if self._inter_step_time:
             self._snap_inter_step_time = time.monotonic()
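
Below is a minimal end-to-end sketch of how the `GpuStatsLogger` added by this patch plugs into a `lightning >= 2.0` training run. The toy `LightningModule`, the random dataset, the `CSVLogger` choice, and the single-GPU `Trainer` settings are illustrative assumptions, not part of the patch; only the `nvitop.callbacks.lightning.GpuStatsLogger` import, its constructor flags, and the `batch_time/*` metric names come from the code above.

```python
# Minimal usage sketch (not part of the patch): exercising the new
# nvitop.callbacks.lightning.GpuStatsLogger with lightning>=2.0.
# The toy model, dataset, and logger below are assumptions for illustration.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

from lightning.pytorch import LightningModule, Trainer
from lightning.pytorch.loggers import CSVLogger

from nvitop.callbacks.lightning import GpuStatsLogger


class ToyModel(LightningModule):
    def __init__(self) -> None:
        super().__init__()
        self.layer = nn.Linear(32, 1)

    def training_step(self, batch, batch_idx):
        (x,) = batch
        return self.layer(x).square().mean()  # dummy loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


if __name__ == '__main__':
    dataset = TensorDataset(torch.randn(256, 32))
    loader = DataLoader(dataset, batch_size=32)

    # Memory and GPU utilization are logged by default;
    # the step timings are opt-in.
    gpu_stats = GpuStatsLogger(intra_step_time=True, inter_step_time=True)
    trainer = Trainer(
        accelerator='gpu',  # the callback raises MisconfigurationException on non-CUDA runs
        devices=1,
        max_epochs=1,
        logger=CSVLogger('logs'),  # any logger works; metrics go through logger.log_metrics
        callbacks=[gpu_stats],
    )
    trainer.fit(ToyModel(), loader)
```

With the timing flags enabled, each step additionally records `batch_time/intra_step (ms)` and `batch_time/inter_step (ms)` alongside the per-device memory and utilization metrics; per the constructor above, a machine without a working NVIDIA driver fails already at `GpuStatsLogger()` construction.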