Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added energy-tracking capabilities for NVIDIA Tegra devices (Orin) #571

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion codecarbon/core/gpu.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from dataclasses import dataclass, field

import os
import platform
import pynvml

from codecarbon.core.units import Energy, Power, Time
Expand Down Expand Up @@ -269,6 +270,9 @@ def get_delta(self, last_duration: Time):
def is_gpu_details_available():
"""Returns True if the GPU details are available."""
try:
if platform.system()=="Linux":
if "tegra" in platform.release():
return False
pynvml.nvmlInit()
return True

Expand Down
128 changes: 128 additions & 0 deletions codecarbon/core/tegrametrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import os
import re
import shutil
import sys
from typing import Dict
import os
import platform
import numpy as np
import queue

import subprocess as sp
# Threading
from threading import Thread, Event

from codecarbon.core.util import detect_cpu_model
from codecarbon.external.logger import logger
import re
# Patterns for "<rail> <value><unit>W/<value><unit>W" style fields.
# Capture groups, in order: rail name, first number, the optional single word
# character that follows it, second number, the optional single word character
# that follows it. NvidiaTegrametrics._decode unpacks them as
# (name, current, current unit, average, average unit) and divides the current
# value by 1000 when its unit group is "m".
# Note that "(\w?)" is tried before "W?", so for a field written as "5W" the
# unit group captures "W" itself rather than an empty string.
# e.g. presumably "VDD_GPU_SOC 1234mW/1234mW" -- TODO confirm against real
# tegrastats output.
# Generic form: matches any rail name (not referenced by the code shown here).
WATT_RE = re.compile(r'\b(\w+) ([0-9.]+)(\w?)W?\/([0-9.]+)(\w?)W?\b')
# Same pattern pinned to the VDD_GPU_SOC rail; _decode uses it for GPU power.
GPU_WATT_RE = re.compile(r'\b(VDD_GPU_SOC) ([0-9.]+)(\w?)W?\/([0-9.]+)(\w?)W?\b')
# Same pattern pinned to the VDD_CPU_CV rail; _decode uses it for CPU power.
CPU_WATT_RE = re.compile(r'\b(VDD_CPU_CV) ([0-9.]+)(\w?)W?\/([0-9.]+)(\w?)W?\b')

def is_tegrametrics_available():
    """Return True when Tegra power tracking through tegrastats can be used.

    All of the following must hold:

    * the host runs Linux and the kernel release string contains "tegra";
    * the tegrastats binary launched by ``NvidiaTegrametrics`` exists and is
      executable;
    * ``pynvml`` can be imported and initialised.

    Returns:
        bool: True if every check passes, False otherwise. The function never
        raises; failures are reported at debug level.
    """
    # Cheap platform checks first so non-Tegra hosts never touch pynvml.
    if platform.system() != "Linux" or "tegra" not in platform.release():
        return False

    # NvidiaTegrametrics launches exactly this path. Without the check, a Tegra
    # kernel with no tegrastats installed would be reported as available and
    # the sampler would then fail when it tries to spawn the process.
    tegrastats_path = "/usr/bin/tegrastats"
    if not (
        os.path.isfile(tegrastats_path) and os.access(tegrastats_path, os.X_OK)
    ):
        logger.debug(
            f"Not using Tegrametrics, {tegrastats_path} is missing or not executable"
        )
        return False

    try:
        import pynvml

        pynvml.nvmlInit()
    except Exception as e:
        logger.debug(
            "Not using Tegrametrics, an exception occurred while instantiating pynvml"
            + f" Tegrametrics : {e}",
        )
        return False
    return True

class NvidiaTegrametrics:
    """Sample CPU and GPU rail power from tegrastats on a background thread.

    ``start`` spawns a reader thread that runs
    ``<path> --interval <interval>`` and, for every output line that contains
    both the CPU and the GPU rail, pushes one power sample in watts onto
    ``cpu_queue`` and ``gpu_queue``. ``get_details`` turns the queued samples
    into mean power and energy figures.
    """

    def __init__(
        self,
        n_points=10,
        interval=100,
    ):
        """
        Args:
            n_points: minimum number of new samples ``get_details`` requires
                before it computes fresh values.
            interval: sampling period passed to tegrastats, in milliseconds.
        """
        self._interval = interval
        self._n_point = n_points

        self._running = Event()
        self.path = "/usr/bin/tegrastats"
        # sys.exc_info() triple captured by the reader thread, re-raised by stop().
        self._error = None
        self._thread = None

        self.cpu_queue = queue.Queue()
        self.gpu_queue = queue.Queue()
        # Samples of the most recently consumed batch (replaced on every batch,
        # so they no longer grow for the lifetime of the process).
        self._gpu_power_list = []
        self._cpu_power_list = []
        # Result of the last successful get_details() computation.
        self._last_details = {}

    @staticmethod
    def _to_watts(value, unit):
        """Convert a captured number and unit group to watts ("m" means milli)."""
        watts = float(value)
        return watts / 1000.0 if unit == "m" else watts

    def _decode(self, text):
        """Parse one tegrastats line and enqueue its CPU and GPU power samples.

        Lines that do not contain both rails (blank lines, partial output) or
        whose numbers cannot be parsed are skipped instead of raising, so a
        single odd line cannot terminate the reader thread.

        Returns:
            bool: True if a CPU/GPU sample pair was enqueued, False otherwise.
        """
        gpu_match = GPU_WATT_RE.search(text)
        cpu_match = CPU_WATT_RE.search(text)
        if gpu_match is None or cpu_match is None:
            return False

        _, gpu_cur, gpu_unit_cur, _, _ = gpu_match.groups()
        _, cpu_cur, cpu_unit_cur, _, _ = cpu_match.groups()
        try:
            cpu_watts = self._to_watts(cpu_cur, cpu_unit_cur)
            gpu_watts = self._to_watts(gpu_cur, gpu_unit_cur)
        except ValueError:
            # "[0-9.]+" also accepts text such as "1.2.3" that float() rejects.
            return False

        # CPU first, then GPU: get_details() sizes its batch from gpu_queue and
        # relies on cpu_queue holding at least as many samples.
        self.cpu_queue.put(cpu_watts)
        self.gpu_queue.put(gpu_watts)
        return True

    def _thread_read_tegrastats(self):
        """Reader-thread body: run tegrastats and decode its output line by line.

        The loop ends when ``stop`` clears the running flag or when tegrastats
        closes its output. The child process is always killed and reaped.
        """
        pts = None
        try:
            pts = sp.Popen(
                [self.path, "--interval", str(self._interval)], stdout=sp.PIPE
            )
            while self._running.is_set():
                line = pts.stdout.readline()
                if not line:
                    # EOF: tegrastats exited. Leave the loop instead of
                    # spinning on a dead process until stop() is called.
                    break
                self._decode(line.decode("utf-8", errors="replace"))
        except (AttributeError, OSError) as exc:
            # Best effort, as before (these were silently ignored); now at
            # least visible in the log. Covers a missing/unlaunchable binary.
            logger.warning(f"tegrastats reader stopped: {exc!r}")
        except Exception:
            # Keep the error so that stop() can re-raise it in the caller.
            self._error = sys.exc_info()
            ex_type, ex_value, tb_str = self._error
            logger.error(f"tegrastats reader thread failed: {ex_value!r}")
        finally:
            if pts is not None:
                try:
                    pts.kill()
                    # Reap the child so it does not linger as a zombie.
                    pts.wait()
                except OSError:
                    pass
                if pts.stdout is not None:
                    pts.stdout.close()

    def get_details(self, **kwargs) -> Dict:
        """Return mean power and energy for the latest batch of samples.

        When at least ``n_points`` new samples are queued, all of them are
        consumed as one batch and fresh values are computed. Otherwise the
        values of the previous batch are returned unchanged, or an empty dict
        if no batch has been computed yet. Callers that add up the
        "Energy Delta" values should therefore poll no more often than once
        per ``n_points * interval`` milliseconds, or they will count the same
        batch twice.

        Args:
            **kwargs: accepted for call compatibility and ignored.

        Returns:
            Dict with the keys "CPU Power", "GPU Power" (mean of the batch, in
            watts) and "CPU Energy Delta", "GPU Energy Delta" (sum of the
            batch multiplied by the sampling period in seconds, i.e.
            watt-seconds).
        """
        pending = self.gpu_queue.qsize()
        if pending < self._n_point:
            if not self._last_details:
                logger.info("not enough data yet size=%d" % pending)
            return dict(self._last_details)

        cpu_batch = []
        gpu_batch = []
        try:
            for _ in range(pending):
                gpu_batch.append(self.gpu_queue.get_nowait())
                cpu_batch.append(self.cpu_queue.get_nowait())
        except queue.Empty:
            # Another consumer drained the queues concurrently; use what we got.
            pass
        if not cpu_batch or not gpu_batch:
            return dict(self._last_details)

        seconds_per_sample = float(self._interval) / 1000.0
        self._cpu_power_list = cpu_batch
        self._gpu_power_list = gpu_batch
        self._last_details = {
            "CPU Power": np.mean(cpu_batch),
            "CPU Energy Delta": np.sum(cpu_batch) * seconds_per_sample,
            "GPU Power": np.mean(gpu_batch),
            "GPU Energy Delta": np.sum(gpu_batch) * seconds_per_sample,
        }
        # Hand out a copy so callers cannot mutate the cached result.
        return dict(self._last_details)

    def start(self):
        """Start the reader thread.

        Returns:
            bool: True if the thread was started, False if it was already
            running.
        """
        if self._thread is not None:
            return False
        logger.info("starting tegrastats thread with %s ms" % self._interval)
        self._running.set()
        self._thread = Thread(target=self._thread_read_tegrastats, args=())
        self._thread.start()
        return True

    def stop(self, timeout=None):
        """Stop the reader thread and surface any error it recorded.

        The thread is always signalled and joined first, so the object can be
        started again even when an error is re-raised.

        Args:
            timeout: optional number of seconds to wait for the thread to end.

        Returns:
            bool: True when the thread was stopped without a recorded error.

        Raises:
            Exception: the exception captured by the reader thread, if any. It
            is raised once and then cleared.
        """
        # stop thread main loop
        self._running.clear()
        if self._thread is not None:
            logger.info("stopping tegrastats thread")
            self._thread.join(timeout)
            self._thread = None
        if self._error:
            ex_type, ex_value, tb_str = self._error
            self._error = None
            raise ex_value.with_traceback(tb_str)
        return True
44 changes: 41 additions & 3 deletions codecarbon/emissions_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@
from typing import Any, Callable, Dict, List, Optional, Union

from codecarbon._version import __version__
from codecarbon.core import cpu, gpu, powermetrics
from codecarbon.core import cpu, gpu, powermetrics, tegrametrics
from codecarbon.core.config import get_hierarchical_config, parse_gpu_ids
from codecarbon.core.emissions import Emissions
from codecarbon.core.units import Energy, Power, Time
from codecarbon.core.util import count_cpus, suppress
from codecarbon.external.geography import CloudMetadata, GeoMetadata
from codecarbon.external.hardware import CPU, GPU, RAM, AppleSiliconChip
from codecarbon.external.hardware import CPU, GPU, RAM, AppleSiliconChip, NvidiaTegraChip
from codecarbon.external.logger import logger, set_logger_format, set_logger_level
from codecarbon.external.scheduler import PeriodicScheduler
from codecarbon.external.task import Task
Expand Down Expand Up @@ -307,7 +307,26 @@ def __init__(
logger.info("No GPU found.")

logger.info("[setup] CPU Tracking...")
if cpu.is_powergadget_available() and self._default_cpu_power is None:
if tegrametrics.is_tegrametrics_available() and self._default_cpu_power is None:
logger.info("Tracking Nvidia Tegra CPU and GPU via TegraMetrics")
hardware_cpu = NvidiaTegraChip.from_utils(
self._output_dir, chip_part="CPU"
)

logger.info("Hardware CPU"+str(hardware_cpu))
self._hardware.append(hardware_cpu)
self._conf["cpu_model"] = hardware_cpu.get_model()
hardware_gpu = NvidiaTegraChip.from_utils(
self._output_dir, chip_part="GPU", interface=hardware_cpu._interface
)
self._hardware.append(hardware_gpu)
import pynvml
pynvml.nvmlDeviceGetCount()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
self._conf["gpu_model"] = pynvml.nvmlDeviceGetName(handle)
self._conf["gpu_count"] = 1

elif cpu.is_powergadget_available() and self._default_cpu_power is None:
logger.info("Tracking Intel CPU via Power Gadget")
hardware = CPU.from_utils(self._output_dir, "intel_power_gadget")
self._hardware.append(hardware)
Expand Down Expand Up @@ -566,6 +585,10 @@ def stop(self) -> Optional[float]:
experiment_name=self._experiment_name,
)

for hardware in self._hardware:
if "_interface" in hardware.__dict__:
if "stop" in dir(hardware._interface):
hardware._interface.stop()
self.final_emissions_data = emissions_data
self.final_emissions = emissions_data.emissions
return emissions_data.emissions
Expand Down Expand Up @@ -722,6 +745,21 @@ def _do_measurements(self) -> None:
f"Energy consumed for all GPUs : {self._total_gpu_energy.kWh:.6f} kWh"
+ f". Total GPU Power : {self._gpu_power.W} W"
)
elif isinstance(hardware, NvidiaTegraChip):
if hardware.chip_part == "CPU":
self._total_cpu_energy += energy
self._cpu_power = power
logger.info(
f"Energy consumed for all CPUs : {self._total_cpu_energy.kWh:.6f} kWh"
+ f". Total CPU Power : {self._cpu_power.W} W"
)
elif hardware.chip_part == "GPU":
self._total_gpu_energy += energy
self._gpu_power = power
logger.info(
f"Energy consumed for all GPUs : {self._total_gpu_energy.kWh:.6f} kWh"
+ f". Total GPU Power : {self._gpu_power.W} W"
)
else:
logger.error(f"Unknown hardware type: {hardware} ({type(hardware)})")
h_time = time.time() - h_time
Expand Down
75 changes: 74 additions & 1 deletion codecarbon/external/hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@

from codecarbon.core.cpu import IntelPowerGadget, IntelRAPL
from codecarbon.core.gpu import AllGPUDevices
from codecarbon.core.powermetrics import ApplePowermetrics
from codecarbon.core.powermetrics import ApplePowermetrics
from codecarbon.core.tegrametrics import NvidiaTegrametrics
from codecarbon.core.units import Energy, Power, Time
from codecarbon.core.util import SLURM_JOB_ID, detect_cpu_model
from codecarbon.external.logger import logger
Expand Down Expand Up @@ -397,6 +398,78 @@ def total_power(self) -> Power:
return ram_power


@dataclass
class NvidiaTegraChip(BaseHardware):
    """One part ("CPU" or "GPU") of an NVIDIA Tegra chip, measured via tegrastats.

    Readings come from an interface object exposing ``start()`` and
    ``get_details()``; ``from_utils`` supplies a ``NvidiaTegrametrics`` when
    none is given. Several chip parts may share one interface instance.
    """

    def __init__(
        self,
        output_dir: str,
        model: str,
        chip_part: str = "CPU",
        interface=None,
    ):
        """
        Args:
            output_dir: stored on the instance; not used by the code in this class.
            model: model name returned by ``get_model``.
            chip_part: prefix of the metric keys to read ("CPU" or "GPU").
            interface: sampler that provides ``get_details()``.
        """
        self._output_dir = output_dir
        self._model = model
        self._interface = interface
        self.chip_part = chip_part

    def __repr__(self) -> str:
        return f"NvidiaTegraChip ({self._model} > {self.chip_part})"

    def _get_power(self) -> Power:
        """Sum the interface metrics whose key starts with "<chip_part> Power".

        Returns:
            Power built with ``Power.from_watts`` (0 W when nothing matches,
            for example while the interface has not collected data yet).
        """
        all_details: Dict = self._interface.get_details()
        wanted_prefix = f"{self.chip_part} Power"

        power = 0
        for metric, value in all_details.items():
            if metric.startswith(wanted_prefix):
                power += value
                logger.debug(f"_get_power_from_cpus - MATCH {metric} : {value}")
            else:
                logger.debug(f"_get_power_from_cpus - DONT MATCH {metric} : {value}")
        return Power.from_watts(power)

    def _get_energy(self, delay: Time) -> Energy:
        """Sum the interface metrics whose key starts with "<chip_part> Energy Delta".

        Args:
            delay: forwarded to the interface as the keyword argument ``delay``.
                ``NvidiaTegrametrics.get_details`` takes keyword arguments
                only, so passing it positionally raised TypeError.

        Returns:
            Energy built with ``Energy.from_energy`` from the summed deltas.
        """
        # NOTE(review): NvidiaTegrametrics computes the delta as watts times
        # seconds -- confirm that this is the unit Energy.from_energy expects.
        all_details: Dict = self._interface.get_details(delay=delay)
        # Prefix match: the interface emits "CPU Energy Delta" and
        # "GPU Energy Delta", which the former "Energy Delta_<digit>" pattern
        # never matched, so the result was always zero.
        wanted_prefix = f"{self.chip_part} Energy Delta"

        energy = sum(
            value
            for metric, value in all_details.items()
            if metric.startswith(wanted_prefix)
        )
        return Energy.from_energy(energy)

    def total_power(self) -> Power:
        """Return the current power of this chip part."""
        return self._get_power()

    def start(self):
        """Start the underlying sampler."""
        self._interface.start()

    def get_model(self):
        """Return the model name given at construction."""
        return self._model

    @classmethod
    def from_utils(
        cls,
        output_dir: str,
        model: Optional[str] = None,
        chip_part: str = "Processor",
        interface=None,
    ) -> "NvidiaTegraChip":
        """Build a chip part, detecting the model and creating a sampler if needed.

        Args:
            output_dir: forwarded to the constructor.
            model: model name; detected with ``detect_cpu_model`` when None.
            chip_part: metric key prefix, "CPU" or "GPU".
            interface: sampler to use. When None a new ``NvidiaTegrametrics``
                is created per call; pass an existing one to share it between
                chip parts.
        """
        # NOTE(review): the "Processor" default matches none of the keys that
        # NvidiaTegrametrics.get_details produces ("CPU ..." / "GPU ..."), so
        # callers need to pass chip_part explicitly. Kept for compatibility.
        if interface is None:
            # Created here rather than as a default argument value, which was
            # instantiated once at import time and shared by every call.
            interface = NvidiaTegrametrics()
        if model is None:
            model = detect_cpu_model()
            if model is None:
                logger.warning("Could not read NvidiaTegraChip model.")

        return cls(
            output_dir=output_dir,
            model=model,
            chip_part=chip_part,
            interface=interface,
        )

@dataclass
class AppleSiliconChip(BaseHardware):
def __init__(
Expand Down
Loading