diff --git a/go.mod b/go.mod index 5df161a474..4c2b59bf17 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/sustainable-computing-io/kepler go 1.20 require ( + github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f github.com/NVIDIA/go-nvml v0.12.0-1 github.com/aquasecurity/libbpfgo v0.4.9-libbpf-1.2.0 github.com/containerd/cgroups v1.1.0 @@ -29,8 +30,10 @@ require ( ) require ( + github.com/Masterminds/semver v1.5.0 // indirect github.com/StackExchange/wmi v1.2.1 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/bits-and-blooms/bitset v1.13.0 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cilium/ebpf v0.9.1 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect @@ -67,9 +70,11 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.5.0 // indirect github.com/prometheus/procfs v0.11.1 // indirect github.com/spf13/pflag v1.0.5 // indirect + github.com/stretchr/testify v1.8.4 // indirect golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect golang.org/x/net v0.17.0 // indirect golang.org/x/oauth2 v0.13.0 // indirect diff --git a/go.sum b/go.sum index 1da1f3f146..cf7192d91d 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,7 @@ +github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= +github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= +github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f h1:HEY1H1By8XI2P6KHA0wk+nXsBE+l/iYRCAwR6nZAoU8= +github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4= github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM= github.com/NVIDIA/go-nvml v0.12.0-1/go.mod 
h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs= github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA= @@ -7,6 +11,8 @@ github.com/aquasecurity/libbpfgo v0.4.9-libbpf-1.2.0/go.mod h1:UD3Mfr+JZ/ASK2VMu github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE= +github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4= @@ -130,6 +136,7 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q= github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY= github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= @@ -156,6 +163,7 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod 
h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= diff --git a/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go b/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go index 3a933a4da3..a1d4377792 100644 --- a/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go +++ b/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go @@ -46,8 +46,8 @@ func UpdateNodeGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats var err error var processesUtilization map[uint32]gpu_source.ProcessUtilizationSample // calculate the gpu's processes energy consumption for each gpu - for _, device := range gpu.GetGpus() { - if processesUtilization, err = gpu.GetProcessResourceUtilizationPerDevice(device, time.Since(lastUtilizationTimestamp)); err != nil { + for gpuID, device := range gpu.GetGpus() { + if processesUtilization, err = gpu.GetProcessResourceUtilizationPerDevice(device, gpuID, time.Since(lastUtilizationTimestamp)); err != nil { klog.Infoln(err) continue } @@ -78,8 +78,9 @@ func UpdateNodeGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats } processStats[uintPid] = stats.NewProcessStats(uintPid, uint64(0), containerID, vmID, command) } - processStats[uintPid].ResourceUsage[config.GPUSMUtilization].AddDeltaStat(utils.GenericSocketID, uint64(processUtilization.SmUtil)) - processStats[uintPid].ResourceUsage[config.GPUMemUtilization].AddDeltaStat(utils.GenericSocketID, uint64(processUtilization.MemUtil)) + gpuName := fmt.Sprintf("%s%v", 
utils.GenericGPUID, gpuID) + processStats[uintPid].ResourceUsage[config.GPUSMUtilization].AddDeltaStat(gpuName, uint64(processUtilization.SmUtil)) + processStats[uintPid].ResourceUsage[config.GPUMemUtilization].AddDeltaStat(gpuName, uint64(processUtilization.MemUtil)) } } diff --git a/pkg/metrics/consts/conts.go b/pkg/metrics/consts/conts.go index 786a767118..8b0d583d78 100644 --- a/pkg/metrics/consts/conts.go +++ b/pkg/metrics/consts/conts.go @@ -39,6 +39,7 @@ var ( ContainerResUtilLabels = []string{"container_id", "pod_name", "container_name", "container_namespace"} VMResUtilLabels = []string{"vm_id"} NodeResUtilLabels = []string{"device", "instance"} + GPUResUtilLabels = []string{"gpu_id"} ) var ( diff --git a/pkg/metrics/metricfactory/metric_factory.go b/pkg/metrics/metricfactory/metric_factory.go index 705f24729b..6a132287e8 100644 --- a/pkg/metrics/metricfactory/metric_factory.go +++ b/pkg/metrics/metricfactory/metric_factory.go @@ -139,12 +139,18 @@ func resMetricsPromDesc(context, name, source string) (desc *prometheus.Desc) { klog.Errorf("Unexpected prometheus context: %s", context) return } + // if this is a GPU metric, we need to add the GPU ID label + for _, gpuMetric := range consts.GPUMetricNames { + if name == gpuMetric { + labels = append(labels, consts.GPUResUtilLabels...) 
+ } + } return MetricsPromDesc(context, name, consts.UsageMetricNameSuffix, source, labels) } -func MetricsPromDesc(context, name, sufix, source string, labels []string) (desc *prometheus.Desc) { +func MetricsPromDesc(context, name, suffix, source string, labels []string) (desc *prometheus.Desc) { return prometheus.NewDesc( - prometheus.BuildFQName(consts.MetricsNamespace, context, name+sufix), + prometheus.BuildFQName(consts.MetricsNamespace, context, name+suffix), "Aggregated value in "+name+" value from "+source, labels, prometheus.Labels{"source": source}, diff --git a/pkg/metrics/utils/utils.go b/pkg/metrics/utils/utils.go index 82bb22f837..64f6aeaa13 100644 --- a/pkg/metrics/utils/utils.go +++ b/pkg/metrics/utils/utils.go @@ -122,9 +122,25 @@ func CollectResUtil(ch chan<- prometheus.Metric, instance interface{}, metricNam switch v := instance.(type) { case *stats.ContainerStats: container := instance.(*stats.ContainerStats) - value = float64(container.ResourceUsage[metricName].SumAllAggrValues()) - labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace} - collect(ch, collector, value, labelValues) + // special case for GPU devices, the metrics are reported per device + isGPUMetric := false + for _, m := range consts.GPUMetricNames { + if metricName == m { + isGPUMetric = true + break + } + } + if isGPUMetric { + for deviceID, utilization := range container.ResourceUsage[metricName].Stat { + value = float64(utilization.Aggr) + labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace, deviceID} + collect(ch, collector, value, labelValues) + } + } else { + value = float64(container.ResourceUsage[metricName].SumAllAggrValues()) + labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace} + collect(ch, collector, value, labelValues) + } case *stats.ProcessStats: process := instance.(*stats.ProcessStats) 
diff --git a/pkg/sensors/accelerator/gpu/gpu.go b/pkg/sensors/accelerator/gpu/gpu.go index 7e7e1a3ffa..7def54d210 100644 --- a/pkg/sensors/accelerator/gpu/gpu.go +++ b/pkg/sensors/accelerator/gpu/gpu.go @@ -34,8 +34,18 @@ Then, we use gpu.go file to initialize the acceleratorImpl from power.go when gp // init initialize the acceleratorImpl and start it func init() { - acceleratorImpl = &gpu_source.GPUNvml{} + acceleratorImpl = &gpu_source.GPUDcgm{} err := acceleratorImpl.Init() + if err == nil { + klog.Infoln("Using dcgm to obtain gpu power") + // If the library was successfully initialized, we don't need to return an error in the Init() function + errLib = nil + return + } + // if dcgm fail to work, we use nvml + klog.Infof("Failed to init dcgm, err: %v\n", err) + acceleratorImpl = &gpu_source.GPUNvml{} + err = acceleratorImpl.Init() if err == nil { klog.Infoln("Using nvml to obtain gpu power") // If the library was successfully initialized, we don't need to return an error in the Init() function diff --git a/pkg/sensors/accelerator/gpu/power.go b/pkg/sensors/accelerator/gpu/power.go index f344d9a462..17ae228c18 100644 --- a/pkg/sensors/accelerator/gpu/power.go +++ b/pkg/sensors/accelerator/gpu/power.go @@ -39,11 +39,11 @@ type acceleratorInterface interface { // Shutdown stops the GPU metric collector Shutdown() bool // GetGpus returns a map with gpu device - GetGpus() []interface{} + GetGpus() map[string]interface{} // GetAbsEnergyFromGPU returns a map with mJ in each gpu device. Absolute energy is the sum of Idle + Dynamic energy. 
GetAbsEnergyFromGPU() []uint32 // GetProcessResourceUtilization returns a map of ProcessUtilizationSample where the key is the process pid - GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) + GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) // IsGPUCollectionSupported returns if it is possible to use this collector IsGPUCollectionSupported() bool // SetGPUCollectionSupported manually set if it is possible to use this collector. This is for testing purpose only. @@ -65,11 +65,11 @@ func Shutdown() bool { return true } -func GetGpus() []interface{} { +func GetGpus() map[string]interface{} { if acceleratorImpl != nil && config.EnabledGPU { return acceleratorImpl.GetGpus() } - return []interface{}{} + return map[string]interface{}{} } func GetAbsEnergyFromGPU() []uint32 { @@ -82,9 +82,9 @@ func GetAbsEnergyFromGPU() []uint32 { // GetProcessResourceUtilizationPerDevice tries to collect the GPU metrics. // There is a known issue that some clusters the nvidia GPU can stop to respod and we need to start it again. // See https://github.com/sustainable-computing-io/kepler/issues/610. 
-func GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) { +func GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) { if acceleratorImpl != nil && config.EnabledGPU { - processesUtilization, err := acceleratorImpl.GetProcessResourceUtilizationPerDevice(device, since) + processesUtilization, err := acceleratorImpl.GetProcessResourceUtilizationPerDevice(device, deviceName, since) if err != nil { klog.Infof("Failed to collect GPU metrics, trying to initizalize again: %v\n", err) err = acceleratorImpl.Init() diff --git a/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go b/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go new file mode 100644 index 0000000000..edff20a38b --- /dev/null +++ b/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go @@ -0,0 +1,376 @@ +//go:build gpu +// +build gpu + +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package source + +import ( + "fmt" + "strconv" + "time" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/sustainable-computing-io/kepler/pkg/config" + "k8s.io/klog/v2" +) + +var ( + deviceFields []dcgm.Short = []dcgm.Short{ + // https://docs.nvidia.com/datacenter/dcgm/1.7/dcgm-api/group__dcgmFieldIdentifiers.htm + dcgm.DCGM_FI_PROF_SM_ACTIVE, + dcgm.DCGM_FI_PROF_SM_OCCUPANCY, + dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, + dcgm.DCGM_FI_DEV_POWER_USAGE, + } + deviceFieldsString = []string{ + "dcgm.DCGM_FI_PROF_SM_ACTIVE", + "dcgm.DCGM_FI_PROF_SM_OCCUPANCY", + "dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE", + "dcgm.DCGM_FI_DEV_POWER_USAGE", + } + ratioFields uint = dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE // this is the field that we will use to calculate the utilization per @yuezhu1 + SkipDCGMValue = "SKIPPING DCGM VALUE" + FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING" + gpuMigArray [][]MigDevice + totalMultiProcessorCount map[string]int +) + +type GPUDcgm struct { + collectionSupported bool + devices map[string]interface{} + deviceGroupName string + deviceGroupHandle dcgm.GroupHandle + fieldGroupName string + fieldGroupHandle dcgm.FieldHandle + pidGroupName string + pidGroupHandle dcgm.GroupHandle // TODO: wait till https://github.com/NVIDIA/go-dcgm/issues/59 is resolved + entities map[string]dcgm.GroupEntityPair + cleanup func() +} + +func (d *GPUDcgm) GetName() string { + return "dcgm" +} + +func (d *GPUDcgm) Init() error { + d.devices = make(map[string]interface{}) + d.entities = make(map[string]dcgm.GroupEntityPair) + if err := d.initDCGM(); err != nil { + return err + } + + if err := d.initNVML(); err != nil { + d.Shutdown() + return err + } + + if err := d.createDeviceGroup(); err != nil { + d.Shutdown() + return err + } + + if err := d.addDevicesToGroup(); err != nil { + d.Shutdown() + return err + } + + if err := d.createFieldGroup(); err != nil { + d.Shutdown() + return err + } + + if err := d.setupWatcher(); err 
!= nil { + d.Shutdown() + return err + } + + d.collectionSupported = true + return nil +} + +func (d *GPUDcgm) Shutdown() bool { + if d.deviceGroupName != "" { + dcgm.DestroyGroup(d.deviceGroupHandle) + } + if d.fieldGroupName != "" { + dcgm.FieldGroupDestroy(d.fieldGroupHandle) + } + if d.cleanup != nil { + d.cleanup() + } + d.collectionSupported = false + return true +} + +func (d *GPUDcgm) GetAbsEnergyFromGPU() []uint32 { + gpuEnergy := []uint32{} + for _, device := range d.devices { + power, ret := device.(nvml.Device).GetPowerUsage() + if ret != nvml.SUCCESS { + klog.V(2).Infof("failed to get power usage on device %v: %v\n", device, nvml.ErrorString(ret)) + continue + } + // since Kepler collects metrics at intervals of SamplePeriodSec, which is greater than 1 second, it is + // necessary to calculate the energy consumption for the entire waiting period + energy := uint32(uint64(power) * config.SamplePeriodSec) + gpuEnergy = append(gpuEnergy, energy) + } + return gpuEnergy +} + +func (d *GPUDcgm) GetGpus() map[string]interface{} { + return d.devices +} + +func (d *GPUDcgm) GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { + processAcceleratorMetrics := map[uint32]ProcessUtilizationSample{} + + if device == nil { // this is a MIG device, it is already tracked in the parent device + return processAcceleratorMetrics, nil + } + + var vals, miVals []dcgm.FieldValue_v1 + var err error + + klog.Infof("Device %v\n", deviceName) + if device != nil { // this is a GPU + deviceIndex, strErr := strconv.Atoi(deviceName) + if strErr != nil { + klog.Infof("failed to convert %q to an integer: %v", deviceName, strErr) + return processAcceleratorMetrics, strErr + } + vals, err = dcgm.GetLatestValuesForFields(uint(deviceIndex), deviceFields) + if err != nil { + klog.Infof("failed to get latest values for fields: %v", err) + return processAcceleratorMetrics, err + } + gpuSMActive := 
uint32(0) + if err == nil { + for i, val := range vals { + value := ToString(val) + label := deviceFieldsString[i] + if val.FieldId == ratioFields { + smUtil, _ := strconv.ParseFloat(value, 32) + gpuSMActive = uint32(smUtil * 100) + } + klog.Infof("Device %v Label %v Val: %v", deviceName, label, ToString(val)) + } + klog.Infof("\n") + } + processInfo, ret := device.(nvml.Device).GetComputeRunningProcesses() + if ret != nvml.SUCCESS { + klog.Infof("failed to get running processes: %v", nvml.ErrorString(ret)) + return processAcceleratorMetrics, fmt.Errorf("failed to get running processes: %v", nvml.ErrorString(ret)) + } + for _, p := range processInfo { + // klog.Infof("pid: %d, memUtil: %d gpu instance id %d compute id %d\n", p.Pid, p.UsedGpuMemory, p.GpuInstanceId, p.ComputeInstanceId) + if p.GpuInstanceId > 0 { // this is a MIG, get it entity id and reads the related fields + entityName := gpuMigArray[deviceIndex][p.GpuInstanceId].EntityName + multiprocessorCountRatio := gpuMigArray[deviceIndex][p.GpuInstanceId].MultiprocessorCountRatio + mi := d.entities[entityName] + miVals, err = dcgm.EntityGetLatestValues(mi.EntityGroupId, mi.EntityId, deviceFields) + if err == nil { + for i, val := range miVals { + label := deviceFieldsString[i] + value := ToString(val) + klog.Infof("Device %v Label %v Val: %v", entityName, label, value) + if val.FieldId == ratioFields { + floatVal, _ := strconv.ParseFloat(value, 32) + // ratio of active multiprocessors to total multiprocessors + smUtil := uint32(floatVal * 100 * multiprocessorCountRatio) + //klog.Infof("pid %d smUtil %d multiprocessor count ratio %v\n", p.Pid, smUtil, multiprocessorCountRatio) + processAcceleratorMetrics[p.Pid] = ProcessUtilizationSample{ + Pid: p.Pid, + TimeStamp: uint64(time.Now().UnixNano()), + SmUtil: smUtil, + } + } + } + klog.Infof("\n") + } + } else { + processAcceleratorMetrics[p.Pid] = ProcessUtilizationSample{ + Pid: p.Pid, + TimeStamp: uint64(time.Now().UnixNano()), + SmUtil: gpuSMActive, // if 
this is not a MIG, we will use the GPU SM active value. FIXME: what if there are multiple pids in the same GPU? + } + } + } + } + + return processAcceleratorMetrics, nil +} + +func (d *GPUDcgm) IsGPUCollectionSupported() bool { + return d.collectionSupported +} + +func (d *GPUDcgm) SetGPUCollectionSupported(supported bool) { + d.collectionSupported = supported +} + +// helper functions +func (d *GPUDcgm) initDCGM() error { + cleanup, err := dcgm.Init(dcgm.Embedded) + if err != nil { + if cleanup != nil { + cleanup() + } + return fmt.Errorf("not able to connect to DCGM: %s", err) + } + d.cleanup = cleanup + return nil +} + +func (d *GPUDcgm) initNVML() error { + if ret := nvml.Init(); ret != nvml.SUCCESS { + d.collectionSupported = false + d.Shutdown() + return fmt.Errorf("failed to init nvml. %s", nvmlErrorString(ret)) + } + return nil +} + +func (d *GPUDcgm) createDeviceGroup() error { + deviceGroupName := "kepler-exporter-" + time.Now().Format("2006-01-02-15-04-05") + deviceGroup, err := dcgm.CreateGroup(deviceGroupName) + if err != nil { + return fmt.Errorf("failed to create group %q: %v", deviceGroupName, err) + } + d.deviceGroupName = deviceGroupName + d.deviceGroupHandle = deviceGroup + klog.Infof("Created device group %q", deviceGroupName) + return nil +} + +func (d *GPUDcgm) addDevicesToGroup() error { + supportedDeviceIndices, err := dcgm.GetSupportedDevices() + if err != nil { + return fmt.Errorf("failed to find supported devices: %v", err) + } + klog.Infof("found %d supported devices", len(supportedDeviceIndices)) + for _, gpuIndex := range supportedDeviceIndices { + err = dcgm.AddEntityToGroup(d.deviceGroupHandle, dcgm.FE_GPU, gpuIndex) + if err != nil { + klog.Infof("failed to add device %d to group %q: %v", gpuIndex, d.deviceGroupName, err) + } else { + device, ret := nvml.DeviceGetHandleByIndex(int(gpuIndex)) + if ret != nvml.SUCCESS { + klog.Infof("failed to get nvml device %d: %v ", gpuIndex, nvml.ErrorString(ret)) + continue + } + 
            d.devices[fmt.Sprintf("%v", gpuIndex)] = device + d.entities[fmt.Sprintf("%v", gpuIndex)] = dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU, EntityId: gpuIndex} + } + } + // add entity to the group + hierarchy, err := dcgm.GetGpuInstanceHierarchy() + if err != nil { + d.Shutdown() + return fmt.Errorf("failed to get gpu hierarchy: %v", err) + } + + if hierarchy.Count > 0 { + // if MIG is enabled, we need to know the hierarchy as well as the multiprocessor count in each device. + // we will use the multiprocessor count to calculate the utilization of each instance + if gpuMigArray, totalMultiProcessorCount, err = RetriveFromNvidiaSMI(true); err != nil { + klog.Infof("failed to retrieve from nvidia-smi: %v", err) + // if we cannot get the multiprocessor count, we will not be able to calculate the utilization + } + for i := uint(0); i < hierarchy.Count; i++ { + if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU { + // add a GPU instance + info := hierarchy.EntityList[i].Info + entityId := hierarchy.EntityList[i].Entity.EntityId + gpuId := hierarchy.EntityList[i].Parent.EntityId + klog.Infof("gpu id %v entity id %v gpu index %v instance id %v", gpuId, entityId, info.NvmlGpuIndex, info.NvmlInstanceId) + entityName := fmt.Sprintf("entity-%d", entityId) + gpuMigArray[info.NvmlGpuIndex][info.NvmlInstanceId].EntityName = entityName + err = dcgm.AddEntityToGroup(d.deviceGroupHandle, dcgm.FE_GPU_I, entityId) + d.entities[entityName] = dcgm.GroupEntityPair{EntityGroupId: dcgm.FE_GPU_I, EntityId: entityId} + klog.Infof("Adding GPU instance %d, err: %v", entityId, err) + } + } + } + return nil +} + +func (d *GPUDcgm) createFieldGroup() error { + fieldGroupName := "kepler-exporter-" + time.Now().Format("2006-01-02-15-04-05") + fieldGroup, err := dcgm.FieldGroupCreate(fieldGroupName, deviceFields) + if err != nil { + return fmt.Errorf("failed to create field group %q: %v", fieldGroupName, err) + } + d.fieldGroupName = fieldGroupName + d.fieldGroupHandle = fieldGroup + return nil +} + +func (d *GPUDcgm) setupWatcher() 
error { + err := dcgm.WatchFieldsWithGroupEx(d.fieldGroupHandle, d.deviceGroupHandle, int64(config.SamplePeriodSec*1000), 0.0, 1) + if err != nil { + return fmt.Errorf("failed to set up watcher, err %v", err) + } + return nil +} + +// ToString converts a dcgm.FieldValue_v1 to a string +// credit to dcgm_exporter +func ToString(value dcgm.FieldValue_v1) string { + switch v := value.Int64(); v { + case dcgm.DCGM_FT_INT32_BLANK: + return SkipDCGMValue + case dcgm.DCGM_FT_INT32_NOT_FOUND: + return SkipDCGMValue + case dcgm.DCGM_FT_INT32_NOT_SUPPORTED: + return SkipDCGMValue + case dcgm.DCGM_FT_INT32_NOT_PERMISSIONED: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_BLANK: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_NOT_FOUND: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_NOT_SUPPORTED: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_NOT_PERMISSIONED: + return SkipDCGMValue + } + switch v := value.Float64(); v { + case dcgm.DCGM_FT_FP64_BLANK: + return SkipDCGMValue + case dcgm.DCGM_FT_FP64_NOT_FOUND: + return SkipDCGMValue + case dcgm.DCGM_FT_FP64_NOT_SUPPORTED: + return SkipDCGMValue + case dcgm.DCGM_FT_FP64_NOT_PERMISSIONED: + return SkipDCGMValue + } + switch v := value.FieldType; v { + case dcgm.DCGM_FT_STRING: + return value.String() + case dcgm.DCGM_FT_DOUBLE: + return fmt.Sprintf("%f", value.Float64()) + case dcgm.DCGM_FT_INT64: + return fmt.Sprintf("%d", value.Int64()) + default: + return FailedToConvert + } + + return FailedToConvert +} diff --git a/pkg/sensors/accelerator/gpu/source/gpu_dummy.go b/pkg/sensors/accelerator/gpu/source/gpu_dummy.go index aa7237ee78..c9200c2b9d 100644 --- a/pkg/sensors/accelerator/gpu/source/gpu_dummy.go +++ b/pkg/sensors/accelerator/gpu/source/gpu_dummy.go @@ -21,8 +21,6 @@ package source import ( "time" - - "github.com/NVIDIA/go-nvml/pkg/nvml" ) type GPUDummy struct { @@ -47,13 +45,12 @@ func (d *GPUDummy) GetAbsEnergyFromGPU() []uint32 { return []uint32{} } -func (d *GPUDummy) GetGpus() []interface{} { - var devices 
[]interface{} - devices = append(devices, nvml.Device{}) +func (d *GPUDummy) GetGpus() map[string]interface{} { + var devices map[string]interface{} return devices } -func (n *GPUDummy) GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { +func (n *GPUDummy) GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { processAcceleratorMetrics := map[uint32]ProcessUtilizationSample{} processAcceleratorMetrics[0] = ProcessUtilizationSample{ Pid: 0, diff --git a/pkg/sensors/accelerator/gpu/source/gpu_nvml.go b/pkg/sensors/accelerator/gpu/source/gpu_nvml.go index cd87962507..a078c2d136 100644 --- a/pkg/sensors/accelerator/gpu/source/gpu_nvml.go +++ b/pkg/sensors/accelerator/gpu/source/gpu_nvml.go @@ -30,7 +30,7 @@ import ( var ( // List of GPU identifiers for the device - devices []interface{} + devices map[string]interface{} // bool to check if the process utilization collection is supported processUtilizationSupported bool = true ) @@ -66,7 +66,7 @@ func (n *GPUNvml) Init() (err error) { return err } klog.Infof("found %d gpu devices\n", count) - devices = make([]interface{}, count) + devices = make(map[string]interface{}, count) for i := 0; i < count; i++ { device, ret := nvml.DeviceGetHandleByIndex(i) if ret != nvml.SUCCESS { @@ -76,8 +76,9 @@ func (n *GPUNvml) Init() (err error) { return err } name, _ := device.GetName() - klog.Infoln("GPU", i, name) - devices[i] = device + uuid, _ := device.GetUUID() + klog.Infof("GPU %v %q %q", i, name, uuid) + devices[uuid] = device } n.collectionSupported = true return nil @@ -89,7 +90,7 @@ func (n *GPUNvml) Shutdown() bool { } // GetGpus returns a map with gpu device -func (n *GPUNvml) GetGpus() []interface{} { +func (n *GPUNvml) GetGpus() map[string]interface{} { return devices } @@ -114,7 +115,7 @@ func (n *GPUNvml) GetAbsEnergyFromGPU() []uint32 { // // 
ProcessUtilizationSample.SmUtil represents the process Streaming Multiprocessors - SM (3D/Compute) utilization in percentage. // ProcessUtilizationSample.MemUtil represents the process Frame Buffer Memory utilization Value. -func (n *GPUNvml) GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { +func (n *GPUNvml) GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { processAcceleratorMetrics := map[uint32]ProcessUtilizationSample{} lastUtilizationTimestamp := uint64(time.Now().Add(-1*since).UnixNano() / 1000) @@ -164,7 +165,7 @@ func (n *GPUNvml) GetProcessResourceUtilizationPerDevice(device interface{}, sin Pid: pinfo.Pid, MemUtil: uint32(pinfo.UsedGpuMemory * 100 / memoryInfo.Total), } - klog.V(5).Infof("pid: %d, memUtil: %d\n", pinfo.Pid, processAcceleratorMetrics[pinfo.Pid].MemUtil) + klog.V(1).Infof("pid: %d, memUtil: %d gpu instance %d compute instance %d\n", pinfo.Pid, processAcceleratorMetrics[pinfo.Pid].MemUtil, pinfo.GpuInstanceId, pinfo.ComputeInstanceId) } } } diff --git a/pkg/sensors/accelerator/gpu/source/nvml_util.go b/pkg/sensors/accelerator/gpu/source/nvml_util.go new file mode 100644 index 0000000000..6ef8b70b53 --- /dev/null +++ b/pkg/sensors/accelerator/gpu/source/nvml_util.go @@ -0,0 +1,145 @@ +//go:build gpu +// +build gpu + +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package source + +import ( + "encoding/xml" + "fmt" + "os/exec" + + "k8s.io/klog/v2" +) + +type NvidiaSmiLog struct { + XMLName xml.Name `xml:"nvidia_smi_log"` + Timestamp string `xml:"timestamp,omitempty"` + DriverVersion string `xml:"driver_version,omitempty"` + CudaVersion string `xml:"cuda_version,omitempty"` + AttachedGPUs int `xml:"attached_gpus,omitempty"` + GPU []GPU `xml:"gpu"` +} + +type GPU struct { + ID string `xml:"id,attr"` + MigMode MigMode `xml:"mig_mode,omitempty"` + MigDevices []MigDevice `xml:"mig_devices>mig_device,omitempty"` + UUID string `xml:"uuid,omitempty"` +} + +type MigMode struct { + CurrentMig string `xml:"current_mig,omitempty"` + PendingMig string `xml:"pending_mig,omitempty"` +} + +type MigDevice struct { + Index int `xml:"index,omitempty"` + GPUInstanceID int `xml:"gpu_instance_id,omitempty"` + ComputeInstanceID int `xml:"compute_instance_id,omitempty"` + DeviceAttributes DeviceAttributes `xml:"device_attributes,omitempty"` + ECCErrorCount ECCErrorCount `xml:"ecc_error_count,omitempty"` + FBMemoryUsage MemoryUsage `xml:"fb_memory_usage,omitempty"` + Bar1MemoryUsage MemoryUsage `xml:"bar1_memory_usage,omitempty"` + EntityName string // this is set later + MultiprocessorCountRatio float64 // this is set later +} + +type DeviceAttributes struct { + Shared SharedAttributes `xml:"shared,omitempty"` +} + +type SharedAttributes struct { + MultiprocessorCount int `xml:"multiprocessor_count,omitempty"` + CopyEngineCount int `xml:"copy_engine_count,omitempty"` + EncoderCount int `xml:"encoder_count,omitempty"` + DecoderCount int `xml:"decoder_count,omitempty"` + OFACount int `xml:"ofa_count,omitempty"` + JPGCount int `xml:"jpg_count,omitempty"` +} + +type ECCErrorCount struct { + VolatileCount VolatileCount `xml:"volatile_count,omitempty"` +} + +type VolatileCount struct { + SRAMUncorrectable int `xml:"sram_uncorrectable,omitempty"` +} + +type MemoryUsage struct { + Total string `xml:"total,omitempty"` + Reserved string 
`xml:"reserved,omitempty"` + Used string `xml:"used,omitempty"` + Free string `xml:"free,omitempty"` +} + +// RetriveFromNvidiaSMI retrieves the MIG information from nvidia-smi +func RetriveFromNvidiaSMI(debug bool) (gpuMigArray [][]MigDevice, totalMultiProcessorCount map[string]int, err error) { + cmd := exec.Command("nvidia-smi", "-q", "-x") + output, err := cmd.Output() + if err != nil { + err = fmt.Errorf("error running nvidia-smi command: %w", err) + return + } + + var nvidiaSmiLog NvidiaSmiLog + err = xml.Unmarshal(output, &nvidiaSmiLog) + if err != nil { + err = fmt.Errorf("error unmarshaling XML: %w", err) + return + } + + gpuMigArray = make([][]MigDevice, len(nvidiaSmiLog.GPU)) + totalMultiProcessorCount = make(map[string]int, len(nvidiaSmiLog.GPU)) + for i, gpu := range nvidiaSmiLog.GPU { + // find the largest GPUInstanceID among the MIGDevices, to make sure we have enough space in the array + maxGPUInstanceID := 0 + for _, migDevice := range gpu.MigDevices { + if migDevice.GPUInstanceID > maxGPUInstanceID { + maxGPUInstanceID = migDevice.GPUInstanceID + } + } + gpuMigArray[i] = make([]MigDevice, maxGPUInstanceID+1) + totalMultiProcessorCount[gpu.UUID] = 0 + for _, migDevice := range gpu.MigDevices { + gpuMigArray[i][migDevice.GPUInstanceID] = migDevice + totalMultiProcessorCount[gpu.UUID] += migDevice.DeviceAttributes.Shared.MultiprocessorCount + } + // count MultiprocessorCountRatio for each device + for j, migDevice := range gpuMigArray[i] { + gpuMigArray[i][j].MultiprocessorCountRatio = float64(migDevice.DeviceAttributes.Shared.MultiprocessorCount) / float64(totalMultiProcessorCount[gpu.UUID]) + } + } + + if debug { + for i, gpu := range nvidiaSmiLog.GPU { + for _, device := range gpuMigArray[i] { + klog.Infof("GPU %d %q", i, gpu.UUID) + klog.Infof("\tIndex: %d\n", device.Index) + klog.Infof("\tGPUInstanceID: %d\n", device.GPUInstanceID) + klog.Infof("\tComputeInstanceID: %d\n", device.ComputeInstanceID) + klog.Infof("\tShared MultiprocessorCount: %d\n", 
device.DeviceAttributes.Shared.MultiprocessorCount) + klog.Infof("\tShared MultiprocessorCountRatio: %f\n", device.MultiprocessorCountRatio) + klog.Infof("\tFBMemoryUsage Total: %s\n", device.FBMemoryUsage.Total) + klog.Infof("\tFBMemoryUsage Reserved: %s\n", device.FBMemoryUsage.Reserved) + klog.Infof("\tFBMemoryUsage Used: %s\n", device.FBMemoryUsage.Used) + klog.Infof("\tFBMemoryUsage Free: %s\n", device.FBMemoryUsage.Free) + } + } + } + return +} diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index ffe7d38685..0166131d3c 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -49,6 +49,7 @@ const ( SystemProcessNamespace string = "system" EmptyString string = "" GenericSocketID string = "socket0" + GenericGPUID string = "gpu" ) func GetPathFromPID(searchPath string, pid uint64) (string, error) { diff --git a/vendor/modules.txt b/vendor/modules.txt index 717873c366..cbe311bb66 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,3 +1,9 @@ +# github.com/Masterminds/semver v1.5.0 +## explicit +github.com/Masterminds/semver +# github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f +## explicit; go 1.21 +github.com/NVIDIA/go-dcgm/pkg/dcgm # github.com/NVIDIA/go-nvml v0.12.0-1 ## explicit; go 1.15 github.com/NVIDIA/go-nvml/pkg/dl @@ -11,6 +17,9 @@ github.com/aquasecurity/libbpfgo # github.com/beorn7/perks v1.0.1 ## explicit; go 1.11 github.com/beorn7/perks/quantile +# github.com/bits-and-blooms/bitset v1.13.0 +## explicit; go 1.16 +github.com/bits-and-blooms/bitset # github.com/cespare/xxhash/v2 v2.2.0 ## explicit; go 1.11 github.com/cespare/xxhash/v2 @@ -248,6 +257,9 @@ github.com/opencontainers/runtime-spec/specs-go # github.com/pkg/errors v0.9.1 ## explicit github.com/pkg/errors +# github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 +## explicit +github.com/pmezard/go-difflib/difflib # github.com/prometheus/client_golang v1.17.0 ## explicit; go 1.19 github.com/prometheus/client_golang/api @@ -283,6 +295,9 @@ 
github.com/sirupsen/logrus # github.com/spf13/pflag v1.0.5 ## explicit; go 1.12 github.com/spf13/pflag +# github.com/stretchr/testify v1.8.4 +## explicit; go 1.20 +github.com/stretchr/testify/assert # golang.org/x/exp v0.0.0-20231006140011-7918f672742d ## explicit; go 1.20 golang.org/x/exp/constraints