From b8ab8df4cba1d767695bd1bd2bc9c62821cd9fc5 Mon Sep 17 00:00:00 2001
From: Michael Haslgruebler
Date: Thu, 18 Apr 2024 16:06:05 +0200
Subject: [PATCH] added energy tracking capabilities for NVIDIA Orin

---
 codecarbon/core/gpu.py          |   4 +
 codecarbon/core/tegrametrics.py | 151 ++++++++++++++++++++++++++++++++
 codecarbon/emissions_tracker.py |  46 +++++++++-
 codecarbon/external/hardware.py |  84 ++++++++++++++++++
 4 files changed, 282 insertions(+), 3 deletions(-)
 create mode 100644 codecarbon/core/tegrametrics.py

diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py
index 70a81cabc..3193c312f 100644
--- a/codecarbon/core/gpu.py
+++ b/codecarbon/core/gpu.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass, field
+import platform
 
 import pynvml
 
 from codecarbon.core.units import Energy, Power, Time
@@ -269,6 +270,9 @@ def get_delta(self, last_duration: Time):
 def is_gpu_details_available():
     """Returns True if the GPU details are available."""
     try:
+        # Tegra (Jetson/Orin) boards are tracked through tegrastats, not through NVML.
+        if platform.system() == "Linux" and "tegra" in platform.release():
+            return False
         pynvml.nvmlInit()
         return True
 
diff --git a/codecarbon/core/tegrametrics.py b/codecarbon/core/tegrametrics.py
new file mode 100644
index 000000000..d9262f6d4
--- /dev/null
+++ b/codecarbon/core/tegrametrics.py
@@ -0,0 +1,151 @@
+"""Read CPU and GPU power from ``tegrastats`` on NVIDIA Tegra (Jetson/Orin) boards."""
+
+import os
+import platform
+import queue
+import re
+import shutil
+import subprocess as sp
+import sys
+from threading import Event, Thread
+from typing import Dict
+
+import numpy as np
+
+from codecarbon.core.util import detect_cpu_model
+from codecarbon.external.logger import logger
+
+# tegrastats reports each power rail as "<RAIL> <current>mW/<average>mW".
+WATT_RE = re.compile(r'\b(\w+) ([0-9.]+)(\w?)W?\/([0-9.]+)(\w?)W?\b')
+GPU_WATT_RE = re.compile(r'\b(VDD_GPU_SOC) ([0-9.]+)(\w?)W?\/([0-9.]+)(\w?)W?\b')
+CPU_WATT_RE = re.compile(r'\b(VDD_CPU_CV) ([0-9.]+)(\w?)W?\/([0-9.]+)(\w?)W?\b')
+
+
+def is_tegrametrics_available():
+    """Return True on a Tegra Linux kernel where NVML can be initialised."""
+    try:
+        if platform.system() == "Linux" and "tegra" in platform.release():
+            import pynvml
+
+            pynvml.nvmlInit()
+            return True
+        return False
+    except Exception as e:
+        logger.debug(
+            "Not using Tegrametrics, an exception occurred while instantiating pynvml"
+            + f" Tegrametrics : {e}",
+        )
+        return False
+
+
+class NvidiaTegrametrics:
+    """Background reader that samples CPU/GPU power from the tegrastats tool."""
+
+    def __init__(
+        self,
+        n_points=10,
+        interval=100,
+    ):
+        """
+        Args:
+            n_points: number of most recent samples kept for averaging.
+            interval: tegrastats sampling interval, in milliseconds.
+        """
+        self._interval = interval
+        self._n_point = n_points
+
+        self._running = Event()
+        self.path = "/usr/bin/tegrastats"
+        self._error = None
+        self._thread = None
+
+        self.cpu_queue = queue.Queue()
+        self.gpu_queue = queue.Queue()
+        self._gpu_power_list = []
+        self._cpu_power_list = []
+
+    @staticmethod
+    def _to_watts(value, unit):
+        """Convert a tegrastats reading to watts ("m" marks a milliwatt value)."""
+        watts = float(value)
+        return watts / 1000.0 if unit == "m" else watts
+
+    def _decode(self, text):
+        """Parse one tegrastats line and queue its CPU and GPU power in watts."""
+        gpu_match = GPU_WATT_RE.search(text)
+        cpu_match = CPU_WATT_RE.search(text)
+        if gpu_match is None or cpu_match is None:
+            # Blank or partial line: skip it instead of crashing the reader thread.
+            return
+        self.cpu_queue.put(self._to_watts(cpu_match.group(2), cpu_match.group(3)))
+        self.gpu_queue.put(self._to_watts(gpu_match.group(2), gpu_match.group(3)))
+
+    def _thread_read_tegrastats(self):
+        """Run tegrastats and decode its output until stop() is called or it exits."""
+        pts = sp.Popen([self.path, '--interval', str(self._interval)], stdout=sp.PIPE)
+        try:
+            while self._running.is_set():
+                # readline() blocks for one sample; b"" means tegrastats has exited.
+                line = pts.stdout.readline()
+                if not line:
+                    break
+                self._decode(line.decode("utf-8"))
+        except (AttributeError, OSError):
+            pass
+        except Exception:
+            # Keep the error so that stop() can re-raise it in the caller's thread.
+            self._error = sys.exc_info()
+            logger.error("tegrastats reader thread failed", exc_info=True)
+        finally:
+            # Kill and reap the child process.
+            try:
+                pts.kill()
+                pts.wait()
+            except OSError:
+                pass
+
+    def get_details(self, **kwargs) -> Dict:
+        """Return the mean power (W) and energy (W*s) of the retained samples.
+
+        NOTE(review): the guard around the drain loop was garbled in the received
+        patch; a rolling window of the last ``n_points`` samples is assumed here.
+        """
+        details = dict()
+        # Samples are queued in CPU/GPU pairs, so drain both queues together.
+        while not self.gpu_queue.empty() and not self.cpu_queue.empty():
+            self._gpu_power_list.append(self.gpu_queue.get_nowait())
+            self._cpu_power_list.append(self.cpu_queue.get_nowait())
+        if self._n_point > 0:
+            del self._gpu_power_list[:-self._n_point]
+            del self._cpu_power_list[:-self._n_point]
+        if self._gpu_power_list and self._cpu_power_list:
+            seconds = float(self._interval) / 1000.0
+            details["CPU Power"] = np.mean(self._cpu_power_list)
+            details["CPU Energy Delta"] = np.sum(self._cpu_power_list) * seconds
+            details["GPU Power"] = np.mean(self._gpu_power_list)
+            details["GPU Energy Delta"] = np.sum(self._gpu_power_list) * seconds
+        return details
+
+    def start(self):
+        """Start the reader thread; return False if it is already running."""
+        if self._thread is not None:
+            return False
+        logger.info("starting tegrastats thread with %s ms" % self._interval)
+        self._running.set()
+        self._thread = Thread(target=self._thread_read_tegrastats, args=())
+        self._thread.start()
+        return True
+
+    def stop(self, timeout=None):
+        """Stop the reader thread, then re-raise any error it recorded."""
+        # Stop the thread first so an error does not leave it (and tegrastats) behind.
+        self._running.clear()
+        if self._thread is not None:
+            logger.info("stopping tegrastats thread")
+            self._thread.join(timeout)
+            self._thread = None
+        if self._error:
+            _, ex_value, tb = self._error
+            self._error = None
+            raise ex_value.with_traceback(tb)
+        return True
diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py
index 62e7968a4..9db297041 100644
--- a/codecarbon/emissions_tracker.py
+++ b/codecarbon/emissions_tracker.py
@@ -14,13 +14,13 @@
 from typing import Any, Callable, Dict, List, Optional, Union
 
 from codecarbon._version import __version__
-from codecarbon.core import cpu, gpu, powermetrics
+from codecarbon.core import cpu, gpu, powermetrics, tegrametrics
 from codecarbon.core.config import get_hierarchical_config, parse_gpu_ids
 from codecarbon.core.emissions import Emissions
 from codecarbon.core.units import Energy, Power, Time
 from codecarbon.core.util import count_cpus, suppress
 from codecarbon.external.geography import CloudMetadata, GeoMetadata
-from codecarbon.external.hardware import CPU, GPU, RAM, AppleSiliconChip
+from codecarbon.external.hardware import CPU, GPU, RAM, AppleSiliconChip, NvidiaTegraChip
 from codecarbon.external.logger import logger, set_logger_format, set_logger_level
 from codecarbon.external.scheduler import PeriodicScheduler
 from codecarbon.external.task import Task
@@ -307,7 +307,26 @@ def __init__(
             logger.info("No GPU found.")
 
         logger.info("[setup] CPU Tracking...")
-        if cpu.is_powergadget_available() and self._default_cpu_power is None:
+        if tegrametrics.is_tegrametrics_available() and self._default_cpu_power is None:
+            logger.info("Tracking Nvidia Tegra CPU and GPU via TegraMetrics")
+            hardware_cpu = NvidiaTegraChip.from_utils(
+                self._output_dir, chip_part="CPU"
+            )
+
+            logger.info("Hardware CPU"+str(hardware_cpu))
+            self._hardware.append(hardware_cpu)
+            self._conf["cpu_model"] = hardware_cpu.get_model()
+            hardware_gpu = NvidiaTegraChip.from_utils(
+                self._output_dir, chip_part="GPU", interface=hardware_cpu._interface
+            )
+            self._hardware.append(hardware_gpu)
+            import pynvml
+            pynvml.nvmlDeviceGetCount()
+            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+            self._conf["gpu_model"] = pynvml.nvmlDeviceGetName(handle)
+            self._conf["gpu_count"] = 1
+
+        elif cpu.is_powergadget_available() and self._default_cpu_power is None:
             logger.info("Tracking Intel CPU via Power Gadget")
             hardware = CPU.from_utils(self._output_dir, "intel_power_gadget")
             self._hardware.append(hardware)
@@ -566,6 +585,12 @@ def stop(self) -> Optional[float]:
             experiment_name=self._experiment_name,
         )
 
+        for hardware in self._hardware:
+            # Stop background readers (e.g. the tegrastats thread), if any.
+            interface = getattr(hardware, "_interface", None)
+            stop_interface = getattr(interface, "stop", None)
+            if callable(stop_interface):
+                stop_interface()
         self.final_emissions_data = emissions_data
         self.final_emissions = emissions_data.emissions
         return emissions_data.emissions
@@ -722,6 +747,21 @@ def _do_measurements(self) -> None:
                         f"Energy consumed for all GPUs : {self._total_gpu_energy.kWh:.6f} kWh"
                         + f". Total GPU Power : {self._gpu_power.W} W"
                     )
+            elif isinstance(hardware, NvidiaTegraChip):
+                if hardware.chip_part == "CPU":
+                    self._total_cpu_energy += energy
+                    self._cpu_power = power
+                    logger.info(
+                        f"Energy consumed for all CPUs : {self._total_cpu_energy.kWh:.6f} kWh"
+                        + f". Total CPU Power : {self._cpu_power.W} W"
+                    )
+                elif hardware.chip_part == "GPU":
+                    self._total_gpu_energy += energy
+                    self._gpu_power = power
+                    logger.info(
+                        f"Energy consumed for all GPUs : {self._total_gpu_energy.kWh:.6f} kWh"
+                        + f". Total GPU Power : {self._gpu_power.W} W"
+                    )
             else:
                 logger.error(f"Unknown hardware type: {hardware} ({type(hardware)})")
             h_time = time.time() - h_time
diff --git a/codecarbon/external/hardware.py b/codecarbon/external/hardware.py
index 7579446db..18e6f0c4e 100644
--- a/codecarbon/external/hardware.py
+++ b/codecarbon/external/hardware.py
@@ -12,7 +12,8 @@
 
 from codecarbon.core.cpu import IntelPowerGadget, IntelRAPL
 from codecarbon.core.gpu import AllGPUDevices
 from codecarbon.core.powermetrics import ApplePowermetrics
+from codecarbon.core.tegrametrics import NvidiaTegrametrics
 from codecarbon.core.units import Energy, Power, Time
 from codecarbon.core.util import SLURM_JOB_ID, detect_cpu_model
 from codecarbon.external.logger import logger
@@ -397,6 +398,89 @@ def total_power(self) -> Power:
         return ram_power
 
 
+@dataclass
+class NvidiaTegraChip(BaseHardware):
+    """CPU or GPU part of an NVIDIA Tegra chip, measured through tegrastats."""
+
+    def __init__(
+        self,
+        output_dir: str,
+        model: str,
+        chip_part: str = "CPU",
+        interface=None,
+    ):
+        self._output_dir = output_dir
+        self._model = model
+        self._interface = interface
+        self.chip_part = chip_part
+
+    def __repr__(self) -> str:
+        return f"NvidiaTegraChip ({self._model} > {self.chip_part})"
+
+    def _get_power(self) -> Power:
+        """
+        Get the power of this chip part (CPU or GPU).
+        :return: power, built from the watts reported by the interface
+        """
+        all_details: Dict = self._interface.get_details()
+
+        power = 0
+        for metric, value in all_details.items():
+            if re.match(rf"^{self.chip_part} Power", metric):
+                power += value
+                logger.debug(f"_get_power_from_cpus - MATCH {metric} : {value}")
+            else:
+                logger.debug(f"_get_power_from_cpus - DONT MATCH {metric} : {value}")
+        return Power.from_watts(power)
+
+    def _get_energy(self, delay: Time) -> Energy:
+        """
+        Get the energy delta of this chip part (CPU or GPU).
+        Args:
+            delay (Time): unused; the interface samples at its own interval
+        :return: energy built from the interface's "Energy Delta" value
+        """
+        # get_details() takes keyword arguments only; its keys have no "_<n>" suffix.
+        all_details: Dict = self._interface.get_details()
+
+        energy = 0
+        for metric, value in all_details.items():
+            if re.match(rf"^{self.chip_part} Energy Delta", metric):
+                energy += value
+        # NOTE(review): value is in W*s; confirm the unit Energy.from_energy expects.
+        return Energy.from_energy(energy)
+
+    def total_power(self) -> Power:
+        return self._get_power()
+
+    def start(self):
+        self._interface.start()
+
+    def get_model(self):
+        return self._model
+
+    @classmethod
+    def from_utils(
+        cls,
+        output_dir: str,
+        model: Optional[str] = None,
+        chip_part: str = "CPU",
+        interface=None,
+    ) -> "NvidiaTegraChip":
+        """Build a chip wrapper, detecting the model and creating a reader if not given."""
+        if model is None:
+            model = detect_cpu_model()
+            if model is None:
+                logger.warning("Could not read NvidiaTegraChip model.")
+        if interface is None:
+            # A default built in the signature would be created at import time, even on
+            # non-Tegra hosts, and silently shared by every chip.
+            interface = NvidiaTegrametrics()
+        return cls(
+            output_dir=output_dir, model=model, chip_part=chip_part, interface=interface
+        )
+
+
 @dataclass
 class AppleSiliconChip(BaseHardware):
     def __init__(