Skip to content

Commit

Permalink
reduce unnecessary info and overhead
Browse files Browse the repository at this point in the history
Signed-off-by: Huamin Chen <[email protected]>
  • Loading branch information
rootfs committed Feb 17, 2024
1 parent 112140c commit df927f0
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 61 deletions.
2 changes: 1 addition & 1 deletion pkg/bpfassets/attacher/libbpf_attacher.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ var (
uint64Key uint64
maxRetry = config.MaxLookupRetry
bpfArrays = []string{
"cpu_cycles_event_reader", "cpu_ref_cycles_event_reader", "cpu_instructions_event_reader", "cache_miss_event_reader", "task_clock_event_reader",
"cpu_cycles_event_reader", "cpu_ref_cycles_event_reader", "cpu_instructions_event_reader", "cache_miss_event_reader", "task_clock_ms_event_reader",
"cpu_cycles", "cpu_ref_cycles", "cpu_instructions", "cache_miss", "cpu_freq_array", "task_clock",
}
cpuCores = getCPUCores()
Expand Down
55 changes: 24 additions & 31 deletions pkg/sensors/accelerator/gpu/source/gpu_dcgm.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,10 @@ const (
var (
deviceFields []dcgm.Short = []dcgm.Short{
// https://docs.nvidia.com/datacenter/dcgm/1.7/dcgm-api/group__dcgmFieldIdentifiers.htm
dcgm.DCGM_FI_PROF_SM_ACTIVE,
dcgm.DCGM_FI_PROF_SM_OCCUPANCY,
dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE,
dcgm.DCGM_FI_DEV_POWER_USAGE,
}
deviceFieldsString = []string{
"dcgm.DCGM_FI_PROF_SM_ACTIVE",
"dcgm.DCGM_FI_PROF_SM_OCCUPANCY",
"dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE",
"dcgm.DCGM_FI_DEV_POWER_USAGE",
}
ratioFields uint = dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE // this is the field that we will use to calculate the utilization per @yuezhu1
SkipDCGMValue = "SKIPPING DCGM VALUE"
Expand Down Expand Up @@ -75,9 +69,16 @@ func (d *GPUDcgm) GetName() string {
func (d *GPUDcgm) Init() error {
d.devices = make(map[string]interface{})
d.entities = make(map[string]dcgm.GroupEntityPair)
if err := d.initDCGM(); err != nil {
return err

cleanup, err := dcgm.Init(dcgm.Embedded)
if err != nil {
if cleanup != nil {
cleanup()
}
return fmt.Errorf("not able to connect to DCGM: %s", err)
}
d.cleanup = cleanup
dcgm.FieldsInit()

if err := d.initNVML(); err != nil {
d.Shutdown()
Expand All @@ -103,12 +104,22 @@ func (d *GPUDcgm) Init() error {
d.Shutdown()
return err
}

klog.Infof("DCGM initialized successfully")
d.collectionSupported = true
return nil
}

// IsGPUCollectionSupported reports whether this collector is currently able
// to gather GPU metrics (the flag is set once initialization succeeds).
func (d *GPUDcgm) IsGPUCollectionSupported() bool {
	return d.collectionSupported
}

// SetGPUCollectionSupported overrides the collection-supported flag, e.g. to
// disable GPU metric collection after a runtime failure.
func (d *GPUDcgm) SetGPUCollectionSupported(supported bool) {
	d.collectionSupported = supported
}

func (d *GPUDcgm) Shutdown() bool {
nvml.Shutdown()
dcgm.FieldsTerm()
if d.deviceGroupName != "" {
dcgm.DestroyGroup(d.deviceGroupHandle)
}
Expand Down Expand Up @@ -220,27 +231,7 @@ func (d *GPUDcgm) GetProcessResourceUtilizationPerDevice(device interface{}, dev
return processAcceleratorMetrics, nil
}

func (d *GPUDcgm) IsGPUCollectionSupported() bool {
return d.collectionSupported
}

func (d *GPUDcgm) SetGPUCollectionSupported(supported bool) {
d.collectionSupported = supported
}

// helper functions
// initDCGM starts an embedded DCGM instance and stores its cleanup handle on
// the receiver so a later Shutdown can release it.
//
// Returns an error (wrapping the DCGM cause) when the embedded instance
// cannot be started.
func (d *GPUDcgm) initDCGM() error {
	cleanup, err := dcgm.Init(dcgm.Embedded)
	if err != nil {
		// dcgm.Init can return a non-nil cleanup even on failure; run it so
		// the partially-started embedded instance is not leaked.
		if cleanup != nil {
			cleanup()
		}
		// %w (not %s) preserves the cause for errors.Is / errors.As callers.
		return fmt.Errorf("not able to connect to DCGM: %w", err)
	}
	d.cleanup = cleanup
	return nil
}

func (d *GPUDcgm) initNVML() error {
if ret := nvml.Init(); ret != nvml.SUCCESS {
d.collectionSupported = false
Expand Down Expand Up @@ -282,6 +273,7 @@ func (d *GPUDcgm) addDevicesToGroup() error {
d.entities[fmt.Sprintf("%v", gpuIndex)] = dcgm.GroupEntityPair{dcgm.FE_GPU, gpuIndex}
}
}

// add entity to the group
hierarchy, err := dcgm.GetGpuInstanceHierarchy()
if err != nil {
Expand All @@ -292,7 +284,7 @@ func (d *GPUDcgm) addDevicesToGroup() error {
if hierarchy.Count > 0 {
// if MIG is enabled, we need to know the hierarchy as well as the multiprocessor count in each device.
// we will use the multiprocessor count to calculate the utilization of each instance
if gpuMigArray, totalMultiProcessorCount, err = RetriveFromNvidiaSMI(true); err != nil {
if gpuMigArray, totalMultiProcessorCount, err = RetriveFromNvidiaSMI(false); err != nil {
klog.Infof("failed to retrive from nvidia-smi: %v", err)
// if we cannot get the multiprocessor count, we will not be able to calculate the utilization
}
Expand Down Expand Up @@ -326,7 +318,8 @@ func (d *GPUDcgm) createFieldGroup() error {
}

func (d *GPUDcgm) setupWatcher() error {
err := dcgm.WatchFieldsWithGroupEx(d.fieldGroupHandle, d.deviceGroupHandle, int64(config.SamplePeriodSec*1000), 0.0, 1)
// watch interval has an impact on cpu usage, set it carefully
err := dcgm.WatchFieldsWithGroupEx(d.fieldGroupHandle, d.deviceGroupHandle, int64(1000)*1000, 0.0, 1)
if err != nil {
return fmt.Errorf("failed to set up watcher, err %v", err)
}
Expand Down
29 changes: 0 additions & 29 deletions pkg/sensors/accelerator/gpu/source/nvml_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,9 @@ type MigMode struct {
}

// MigDevice holds the per-MIG-instance fields parsed from the nvidia-smi XML
// output, plus bookkeeping fields filled in after parsing.
type MigDevice struct {
	Index             int              `xml:"index,omitempty"`
	GPUInstanceID     int              `xml:"gpu_instance_id,omitempty"`
	ComputeInstanceID int              `xml:"compute_instance_id,omitempty"`
	DeviceAttributes  DeviceAttributes `xml:"device_attributes,omitempty"`
	ECCErrorCount     ECCErrorCount    `xml:"ecc_error_count,omitempty"`
	FBMemoryUsage     MemoryUsage      `xml:"fb_memory_usage,omitempty"`
	Bar1MemoryUsage   MemoryUsage      `xml:"bar1_memory_usage,omitempty"`
	// EntityName is not parsed from XML; it is set later.
	EntityName string // this is set later
	// MultiprocessorCountRatio is not parsed from XML; it is set later.
	MultiprocessorCountRatio float64 // this is set later
}
Expand All @@ -65,26 +61,6 @@ type DeviceAttributes struct {

// SharedAttributes mirrors the shared device-attribute counts reported in the
// nvidia-smi XML output for a MIG instance.
type SharedAttributes struct {
	MultiprocessorCount int `xml:"multiprocessor_count,omitempty"`
	CopyEngineCount     int `xml:"copy_engine_count,omitempty"`
	EncoderCount        int `xml:"encoder_count,omitempty"`
	DecoderCount        int `xml:"decoder_count,omitempty"`
	OFACount            int `xml:"ofa_count,omitempty"`
	JPGCount            int `xml:"jpg_count,omitempty"`
}

// ECCErrorCount mirrors the ECC error-count section of the nvidia-smi XML
// output.
type ECCErrorCount struct {
	VolatileCount VolatileCount `xml:"volatile_count,omitempty"`
}

// VolatileCount mirrors the volatile ECC counters of the nvidia-smi XML
// output.
type VolatileCount struct {
	SRAMUncorrectable int `xml:"sram_uncorrectable,omitempty"`
}

// MemoryUsage mirrors a memory-usage section of the nvidia-smi XML output;
// values are kept as the raw strings nvidia-smi reports.
type MemoryUsage struct {
	Total    string `xml:"total,omitempty"`
	Reserved string `xml:"reserved,omitempty"`
	Used     string `xml:"used,omitempty"`
	Free     string `xml:"free,omitempty"`
}

// RetriveFromNvidiaSMI retrieves the MIG information from nvidia-smi
Expand Down Expand Up @@ -129,15 +105,10 @@ func RetriveFromNvidiaSMI(debug bool) (gpuMigArray [][]MigDevice, totalMultiProc
for i, gpu := range nvidiaSmiLog.GPU {
for _, device := range gpuMigArray[i] {
klog.Infof("GPU %d %q", i, gpu.UUID)
klog.Infof("\tIndex: %d\n", device.Index)
klog.Infof("\tGPUInstanceID: %d\n", device.GPUInstanceID)
klog.Infof("\tComputeInstanceID: %d\n", device.ComputeInstanceID)
klog.Infof("\tShared MultiprocessorCount: %d\n", device.DeviceAttributes.Shared.MultiprocessorCount)
klog.Infof("\tShared MultiprocessorCountRatio: %f\n", device.MultiprocessorCountRatio)
klog.Infof("\tFBMemoryUsage Total: %s\n", device.FBMemoryUsage.Total)
klog.Infof("\tFBMemoryUsage Reserved: %s\n", device.FBMemoryUsage.Reserved)
klog.Infof("\tFBMemoryUsage Used: %s\n", device.FBMemoryUsage.Used)
klog.Infof("\tFBMemoryUsage Free: %s\n", device.FBMemoryUsage.Free)
}
}
}
Expand Down

0 comments on commit df927f0

Please sign in to comment.