Skip to content

Commit

Permalink
reduce unnecessary info and overhead
Browse files Browse the repository at this point in the history
Signed-off-by: Huamin Chen <[email protected]>
  • Loading branch information
rootfs committed Feb 17, 2024
1 parent 112140c commit df927f0
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 61 deletions.
2 changes: 1 addition & 1 deletion pkg/bpfassets/attacher/libbpf_attacher.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ var (
uint64Key uint64
maxRetry = config.MaxLookupRetry
bpfArrays = []string{
"cpu_cycles_event_reader", "cpu_ref_cycles_event_reader", "cpu_instructions_event_reader", "cache_miss_event_reader", "task_clock_event_reader",
"cpu_cycles_event_reader", "cpu_ref_cycles_event_reader", "cpu_instructions_event_reader", "cache_miss_event_reader", "task_clock_ms_event_reader",
"cpu_cycles", "cpu_ref_cycles", "cpu_instructions", "cache_miss", "cpu_freq_array", "task_clock",
}
cpuCores = getCPUCores()
Expand Down
55 changes: 24 additions & 31 deletions pkg/sensors/accelerator/gpu/source/gpu_dcgm.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,10 @@ const (
var (
deviceFields []dcgm.Short = []dcgm.Short{
// https://docs.nvidia.com/datacenter/dcgm/1.7/dcgm-api/group__dcgmFieldIdentifiers.htm
dcgm.DCGM_FI_PROF_SM_ACTIVE,
dcgm.DCGM_FI_PROF_SM_OCCUPANCY,
dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE,
dcgm.DCGM_FI_DEV_POWER_USAGE,
}
deviceFieldsString = []string{
"dcgm.DCGM_FI_PROF_SM_ACTIVE",
"dcgm.DCGM_FI_PROF_SM_OCCUPANCY",
"dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE",
"dcgm.DCGM_FI_DEV_POWER_USAGE",
}
ratioFields uint = dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE // this is the field that we will use to calculate the utilization per @yuezhu1
SkipDCGMValue = "SKIPPING DCGM VALUE"
Expand Down Expand Up @@ -75,9 +69,16 @@ func (d *GPUDcgm) GetName() string {
func (d *GPUDcgm) Init() error {
d.devices = make(map[string]interface{})
d.entities = make(map[string]dcgm.GroupEntityPair)
if err := d.initDCGM(); err != nil {
return err

cleanup, err := dcgm.Init(dcgm.Embedded)
if err != nil {
if cleanup != nil {
cleanup()
}
return fmt.Errorf("not able to connect to DCGM: %s", err)
}
d.cleanup = cleanup
dcgm.FieldsInit()

if err := d.initNVML(); err != nil {
d.Shutdown()
Expand All @@ -103,12 +104,22 @@ func (d *GPUDcgm) Init() error {
d.Shutdown()
return err
}

klog.Infof("DCGM initialized successfully")
d.collectionSupported = true
return nil
}

// IsGPUCollectionSupported reports whether this collector is currently able
// to gather GPU metrics (the flag is set once initialization succeeds).
func (d *GPUDcgm) IsGPUCollectionSupported() bool {
	return d.collectionSupported
}

// SetGPUCollectionSupported overrides the collection-supported flag, e.g. to
// disable GPU metric collection after a runtime failure.
func (d *GPUDcgm) SetGPUCollectionSupported(supported bool) {
	d.collectionSupported = supported
}

func (d *GPUDcgm) Shutdown() bool {
nvml.Shutdown()
dcgm.FieldsTerm()
if d.deviceGroupName != "" {
dcgm.DestroyGroup(d.deviceGroupHandle)
}
Expand Down Expand Up @@ -220,27 +231,7 @@ func (d *GPUDcgm) GetProcessResourceUtilizationPerDevice(device interface{}, dev
return processAcceleratorMetrics, nil
}

func (d *GPUDcgm) IsGPUCollectionSupported() bool {
return d.collectionSupported
}

func (d *GPUDcgm) SetGPUCollectionSupported(supported bool) {
d.collectionSupported = supported
}

// helper functions
// initDCGM starts an embedded DCGM instance and stores its cleanup handle on
// the receiver so a later Shutdown can release it.
//
// Returns an error (wrapping the DCGM cause) when the embedded instance
// cannot be started.
func (d *GPUDcgm) initDCGM() error {
	cleanup, err := dcgm.Init(dcgm.Embedded)
	if err != nil {
		// dcgm.Init can return a non-nil cleanup even on failure; run it so
		// the partially-started embedded instance is not leaked.
		if cleanup != nil {
			cleanup()
		}
		// %w (not %s) preserves the cause for errors.Is / errors.As callers.
		return fmt.Errorf("not able to connect to DCGM: %w", err)
	}
	d.cleanup = cleanup
	return nil
}

func (d *GPUDcgm) initNVML() error {
if ret := nvml.Init(); ret != nvml.SUCCESS {
d.collectionSupported = false
Expand Down Expand Up @@ -282,6 +273,7 @@ func (d *GPUDcgm) addDevicesToGroup() error {
d.entities[fmt.Sprintf("%v", gpuIndex)] = dcgm.GroupEntityPair{dcgm.FE_GPU, gpuIndex}
}
}

// add entity to the group
hierarchy, err := dcgm.GetGpuInstanceHierarchy()
if err != nil {
Expand All @@ -292,7 +284,7 @@ func (d *GPUDcgm) addDevicesToGroup() error {
if hierarchy.Count > 0 {
// if MIG is enabled, we need to know the hierarchy as well as the multiprocessor count in each device.
// we will use the multiprocessor count to calculate the utilization of each instance
if gpuMigArray, totalMultiProcessorCount, err = RetriveFromNvidiaSMI(true); err != nil {
if gpuMigArray, totalMultiProcessorCount, err = RetriveFromNvidiaSMI(false); err != nil {
klog.Infof("failed to retrive from nvidia-smi: %v", err)
// if we cannot get the multiprocessor count, we will not be able to calculate the utilization
}
Expand Down Expand Up @@ -326,7 +318,8 @@ func (d *GPUDcgm) createFieldGroup() error {
}

func (d *GPUDcgm) setupWatcher() error {
err := dcgm.WatchFieldsWithGroupEx(d.fieldGroupHandle, d.deviceGroupHandle, int64(config.SamplePeriodSec*1000), 0.0, 1)
// watch interval has an impact on cpu usage, set it carefully
err := dcgm.WatchFieldsWithGroupEx(d.fieldGroupHandle, d.deviceGroupHandle, int64(1000)*1000, 0.0, 1)
if err != nil {
return fmt.Errorf("failed to set up watcher, err %v", err)
}
Expand Down
29 changes: 0 additions & 29 deletions pkg/sensors/accelerator/gpu/source/nvml_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,9 @@ type MigMode struct {
}

// MigDevice holds the per-MIG-instance fields parsed from the nvidia-smi XML
// output, plus bookkeeping fields filled in after parsing.
type MigDevice struct {
	Index             int              `xml:"index,omitempty"`
	GPUInstanceID     int              `xml:"gpu_instance_id,omitempty"`
	ComputeInstanceID int              `xml:"compute_instance_id,omitempty"`
	DeviceAttributes  DeviceAttributes `xml:"device_attributes,omitempty"`
	ECCErrorCount     ECCErrorCount    `xml:"ecc_error_count,omitempty"`
	FBMemoryUsage     MemoryUsage      `xml:"fb_memory_usage,omitempty"`
	Bar1MemoryUsage   MemoryUsage      `xml:"bar1_memory_usage,omitempty"`
	// EntityName is not parsed from XML; it is set later.
	EntityName string // this is set later
	// MultiprocessorCountRatio is not parsed from XML; it is set later.
	MultiprocessorCountRatio float64 // this is set later
}
Expand All @@ -65,26 +61,6 @@ type DeviceAttributes struct {

// SharedAttributes mirrors the shared device-attribute counts reported in the
// nvidia-smi XML output for a MIG instance.
type SharedAttributes struct {
	MultiprocessorCount int `xml:"multiprocessor_count,omitempty"`
	CopyEngineCount     int `xml:"copy_engine_count,omitempty"`
	EncoderCount        int `xml:"encoder_count,omitempty"`
	DecoderCount        int `xml:"decoder_count,omitempty"`
	OFACount            int `xml:"ofa_count,omitempty"`
	JPGCount            int `xml:"jpg_count,omitempty"`
}

// ECCErrorCount mirrors the ECC error-count section of the nvidia-smi XML
// output.
type ECCErrorCount struct {
	VolatileCount VolatileCount `xml:"volatile_count,omitempty"`
}

// VolatileCount mirrors the volatile ECC counters of the nvidia-smi XML
// output.
type VolatileCount struct {
	SRAMUncorrectable int `xml:"sram_uncorrectable,omitempty"`
}

// MemoryUsage mirrors a memory-usage section of the nvidia-smi XML output;
// values are kept as the raw strings nvidia-smi reports.
type MemoryUsage struct {
	Total    string `xml:"total,omitempty"`
	Reserved string `xml:"reserved,omitempty"`
	Used     string `xml:"used,omitempty"`
	Free     string `xml:"free,omitempty"`
}

// RetriveFromNvidiaSMI retrieves the MIG information from nvidia-smi
Expand Down Expand Up @@ -129,15 +105,10 @@ func RetriveFromNvidiaSMI(debug bool) (gpuMigArray [][]MigDevice, totalMultiProc
for i, gpu := range nvidiaSmiLog.GPU {
for _, device := range gpuMigArray[i] {
klog.Infof("GPU %d %q", i, gpu.UUID)
klog.Infof("\tIndex: %d\n", device.Index)
klog.Infof("\tGPUInstanceID: %d\n", device.GPUInstanceID)
klog.Infof("\tComputeInstanceID: %d\n", device.ComputeInstanceID)
klog.Infof("\tShared MultiprocessorCount: %d\n", device.DeviceAttributes.Shared.MultiprocessorCount)
klog.Infof("\tShared MultiprocessorCountRatio: %f\n", device.MultiprocessorCountRatio)
klog.Infof("\tFBMemoryUsage Total: %s\n", device.FBMemoryUsage.Total)
klog.Infof("\tFBMemoryUsage Reserved: %s\n", device.FBMemoryUsage.Reserved)
klog.Infof("\tFBMemoryUsage Used: %s\n", device.FBMemoryUsage.Used)
klog.Infof("\tFBMemoryUsage Free: %s\n", device.FBMemoryUsage.Free)
}
}
}
Expand Down

0 comments on commit df927f0

Please sign in to comment.