diff --git a/Makefile b/Makefile index 9436e02708..6b25dfa68a 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,8 @@ else GC_FLAGS = endif -GENERAL_TAGS := 'include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo gpu libbpf ' +GENERAL_TAGS := 'include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo libbpf ' +GPU_TAGS := ' gpu ' GO_LD_FLAGS := $(GC_FLAGS) -ldflags "-X $(LD_FLAGS)" $(CFLAGS) # set GOENV @@ -53,7 +54,8 @@ GOENV = GO111MODULE="" GOOS=$(GOOS) GOARCH=$(GOARCH) CGO_ENABLED=1 CC=clang CGO_ DOCKERFILE := $(SRC_ROOT)/build/Dockerfile IMAGE_BUILD_TAG := $(SOURCE_GIT_TAG)-linux-$(GOARCH) -GO_BUILD_TAGS := $(GENERAL_TAGS)$(GOOS) +GO_BUILD_TAGS := $(GENERAL_TAGS)$(GOOS)$(GPU_TAGS) +GO_TEST_TAGS := $(GENERAL_TAGS)$(GOOS) # for testsuite ENVTEST_ASSETS_DIR=$(SRC_ROOT)/test-bin @@ -214,28 +216,28 @@ container_test: make test-container-verbose' test: ginkgo-set tidy-vendor - @echo TAGS=$(GO_BUILD_TAGS) - @$(GOENV) go test -tags $(GO_BUILD_TAGS) ./... --race --bench=. -cover --count=1 --vet=all + @echo TAGS=$(GO_TEST_TAGS) + @$(GOENV) go test -tags $(GO_TEST_TAGS) ./... --race --bench=. -cover --count=1 --vet=all -v test-verbose: ginkgo-set tidy-vendor - @echo TAGS=$(GO_BUILD_TAGS) + @echo TAGS=$(GO_TEST_TAGS) @echo GOENV=$(GOENV) - @$(GOENV) go test -tags $(GO_BUILD_TAGS) \ + @$(GOENV) go test -tags $(GO_TEST_TAGS) \ -timeout=30m \ -covermode=atomic -coverprofile=coverage.out \ -v $$(go list ./... | grep pkg | grep -v bpfassets) \ --race --bench=. -cover --count=1 --vet=all test-container-verbose: ginkgo-set tidy-vendor - @echo TAGS=$(GO_BUILD_TAGS) + @echo TAGS=$(GO_TEST_TAGS) @echo GOENV=$(GOENV) - @$(GOENV) go test -tags $(GO_BUILD_TAGS) \ + @$(GOENV) go test -tags $(GO_TEST_TAGS) \ -covermode=atomic -coverprofile=coverage.out \ -v $$(go list ./... 
| grep pkg | grep -v bpfassets) \ --race -cover --count=1 --vet=all test-mac-verbose: ginkgo-set - @echo TAGS=$(GO_BUILD_TAGS) + @echo TAGS=$(GO_TEST_TAGS) @go test $$(go list ./... | grep pkg | grep -v bpfassets) --race --bench=. -cover --count=1 --vet=all escapes_detect: tidy-vendor diff --git a/bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o b/bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o index aa8e612463..1991cfd50f 100644 Binary files a/bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o and b/bpfassets/libbpf/bpf.o/amd64_kepler.bpf.o differ diff --git a/go.mod b/go.mod index 5df161a474..4c2b59bf17 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/sustainable-computing-io/kepler go 1.20 require ( + github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f github.com/NVIDIA/go-nvml v0.12.0-1 github.com/aquasecurity/libbpfgo v0.4.9-libbpf-1.2.0 github.com/containerd/cgroups v1.1.0 @@ -29,8 +30,10 @@ require ( ) require ( + github.com/Masterminds/semver v1.5.0 // indirect github.com/StackExchange/wmi v1.2.1 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/bits-and-blooms/bitset v1.13.0 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cilium/ebpf v0.9.1 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect @@ -67,9 +70,11 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.5.0 // indirect github.com/prometheus/procfs v0.11.1 // indirect github.com/spf13/pflag v1.0.5 // indirect + github.com/stretchr/testify v1.8.4 // indirect golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect golang.org/x/net v0.17.0 // indirect golang.org/x/oauth2 v0.13.0 // indirect diff --git a/go.sum b/go.sum index 1da1f3f146..cf7192d91d 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,7 @@ 
+github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= +github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= +github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f h1:HEY1H1By8XI2P6KHA0wk+nXsBE+l/iYRCAwR6nZAoU8= +github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f/go.mod h1:kaRlwPjisNMY7xH8QWJ+6q76YJ/1eu6pWV45B5Ew6C4= github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM= github.com/NVIDIA/go-nvml v0.12.0-1/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs= github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA= @@ -7,6 +11,8 @@ github.com/aquasecurity/libbpfgo v0.4.9-libbpf-1.2.0/go.mod h1:UD3Mfr+JZ/ASK2VMu github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE= +github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4= @@ -130,6 +136,7 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 
github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q= github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY= github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= @@ -156,6 +163,7 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= diff --git a/pkg/bpfassets/attacher/libbpf_attacher.go b/pkg/bpfassets/attacher/libbpf_attacher.go index 9eddf83c8f..c1708acdaf 100644 --- a/pkg/bpfassets/attacher/libbpf_attacher.go +++ b/pkg/bpfassets/attacher/libbpf_attacher.go @@ -57,7 +57,7 @@ var ( uint64Key uint64 maxRetry = config.MaxLookupRetry bpfArrays = []string{ - "cpu_cycles_event_reader", "cpu_ref_cycles_event_reader", "cpu_instructions_event_reader", "cache_miss_event_reader", "task_clock_event_reader", + "cpu_cycles_event_reader", "cpu_ref_cycles_event_reader", "cpu_instructions_event_reader", "cache_miss_event_reader", "task_clock_ms_event_reader", "cpu_cycles", "cpu_ref_cycles", "cpu_instructions", "cache_miss", "cpu_freq_array", "task_clock", } cpuCores = getCPUCores() diff --git a/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go b/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go index 3a933a4da3..a1d4377792 100644 --- 
a/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go +++ b/pkg/collector/resourceutilization/accelerator/process_gpu_collector.go @@ -46,8 +46,8 @@ func UpdateNodeGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats var err error var processesUtilization map[uint32]gpu_source.ProcessUtilizationSample // calculate the gpu's processes energy consumption for each gpu - for _, device := range gpu.GetGpus() { - if processesUtilization, err = gpu.GetProcessResourceUtilizationPerDevice(device, time.Since(lastUtilizationTimestamp)); err != nil { + for gpuID, device := range gpu.GetGpus() { + if processesUtilization, err = gpu.GetProcessResourceUtilizationPerDevice(device, gpuID, time.Since(lastUtilizationTimestamp)); err != nil { klog.Infoln(err) continue } @@ -78,8 +78,9 @@ func UpdateNodeGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats } processStats[uintPid] = stats.NewProcessStats(uintPid, uint64(0), containerID, vmID, command) } - processStats[uintPid].ResourceUsage[config.GPUSMUtilization].AddDeltaStat(utils.GenericSocketID, uint64(processUtilization.SmUtil)) - processStats[uintPid].ResourceUsage[config.GPUMemUtilization].AddDeltaStat(utils.GenericSocketID, uint64(processUtilization.MemUtil)) + gpuName := fmt.Sprintf("%s%v", utils.GenericGPUID, gpuID) + processStats[uintPid].ResourceUsage[config.GPUSMUtilization].AddDeltaStat(gpuName, uint64(processUtilization.SmUtil)) + processStats[uintPid].ResourceUsage[config.GPUMemUtilization].AddDeltaStat(gpuName, uint64(processUtilization.MemUtil)) } } diff --git a/pkg/collector/stats/stats.go b/pkg/collector/stats/stats.go index a2b104c327..7bbb9ddf5a 100644 --- a/pkg/collector/stats/stats.go +++ b/pkg/collector/stats/stats.go @@ -75,7 +75,7 @@ func NewStats() *Stats { m.ResourceUsage[metricName] = types.NewUInt64StatCollection() } - if gpu.IsGPUCollectionSupported() { + if config.EnabledGPU { m.ResourceUsage[config.GPUSMUtilization] = types.NewUInt64StatCollection() 
m.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection() } diff --git a/pkg/config/config.go b/pkg/config/config.go index a58895b4f3..53463e8f79 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -455,3 +455,7 @@ func IsCgroupMetricsEnabled() bool { func IsIRQCounterMetricsEnabled() bool { return ExposeIRQCounterMetrics } + +func SetGpuUsageMetric(metric string) { + GpuUsageMetric = metric +} diff --git a/pkg/metrics/consts/conts.go b/pkg/metrics/consts/conts.go index 3a37dcfe12..8b0d583d78 100644 --- a/pkg/metrics/consts/conts.go +++ b/pkg/metrics/consts/conts.go @@ -39,6 +39,7 @@ var ( ContainerResUtilLabels = []string{"container_id", "pod_name", "container_name", "container_namespace"} VMResUtilLabels = []string{"vm_id"} NodeResUtilLabels = []string{"device", "instance"} + GPUResUtilLabels = []string{"gpu_id"} ) var ( @@ -90,4 +91,8 @@ var ( config.CgroupfsSystemCPU, config.CgroupfsUserCPU, } + GPUMetricNames = []string{ + config.GPUSMUtilization, + config.GPUMemUtilization, + } ) diff --git a/pkg/metrics/container/metrics.go b/pkg/metrics/container/metrics.go index 99e7655be5..a6b6f41ddd 100644 --- a/pkg/metrics/container/metrics.go +++ b/pkg/metrics/container/metrics.go @@ -79,6 +79,10 @@ func (c *collector) initMetrics() { c.descriptions[name] = desc c.collectors[name] = metricfactory.NewPromCounter(desc) } + for name, desc := range metricfactory.GPUUsageMetricsPromDesc(context) { + c.descriptions[name] = desc + c.collectors[name] = metricfactory.NewPromCounter(desc) + } desc := metricfactory.MetricsPromDesc(context, "joules", "_total", "", consts.ContainerEnergyLabels) c.descriptions["total"] = desc diff --git a/pkg/metrics/metricfactory/metric_factory.go b/pkg/metrics/metricfactory/metric_factory.go index 77fac69322..6a132287e8 100644 --- a/pkg/metrics/metricfactory/metric_factory.go +++ b/pkg/metrics/metricfactory/metric_factory.go @@ -114,6 +114,16 @@ func NodeCPUFrequencyMetricsPromDesc(context string) (descriptions 
map[string]*p return descriptions } +func GPUUsageMetricsPromDesc(context string) (descriptions map[string]*prometheus.Desc) { + descriptions = make(map[string]*prometheus.Desc) + if config.EnabledGPU { + for _, name := range consts.GPUMetricNames { + descriptions[name] = resMetricsPromDesc(context, name, "nvidia-nvml") + } + } + return descriptions +} + func resMetricsPromDesc(context, name, source string) (desc *prometheus.Desc) { var labels []string switch context { @@ -129,12 +139,18 @@ func resMetricsPromDesc(context, name, source string) (desc *prometheus.Desc) { klog.Errorf("Unexpected prometheus context: %s", context) return } + // if this is a GPU metric, we need to add the GPU ID label + for _, gpuMetric := range consts.GPUMetricNames { + if name == gpuMetric { + labels = append(labels, consts.GPUResUtilLabels...) + } + } return MetricsPromDesc(context, name, consts.UsageMetricNameSuffix, source, labels) } -func MetricsPromDesc(context, name, sufix, source string, labels []string) (desc *prometheus.Desc) { +func MetricsPromDesc(context, name, suffix, source string, labels []string) (desc *prometheus.Desc) { return prometheus.NewDesc( - prometheus.BuildFQName(consts.MetricsNamespace, context, name+sufix), + prometheus.BuildFQName(consts.MetricsNamespace, context, name+suffix), "Aggregated value in "+name+" value from "+source, labels, prometheus.Labels{"source": source}, diff --git a/pkg/metrics/process/metrics.go b/pkg/metrics/process/metrics.go index 526cc79fc6..0cf029f9a0 100644 --- a/pkg/metrics/process/metrics.go +++ b/pkg/metrics/process/metrics.go @@ -78,6 +78,10 @@ func (c *collector) initMetrics() { c.descriptions[name] = desc c.collectors[name] = metricfactory.NewPromCounter(desc) } + for name, desc := range metricfactory.GPUUsageMetricsPromDesc(context) { + c.descriptions[name] = desc + c.collectors[name] = metricfactory.NewPromCounter(desc) + } } func (c *collector) Describe(ch chan<- *prometheus.Desc) { diff --git a/pkg/metrics/utils/utils.go 
b/pkg/metrics/utils/utils.go index 5022bfc2ea..64f6aeaa13 100644 --- a/pkg/metrics/utils/utils.go +++ b/pkg/metrics/utils/utils.go @@ -66,6 +66,12 @@ func CollectResUtilizationMetrics(ch chan<- prometheus.Metric, instance interfac CollectResUtil(ch, instance, collectorName, collectors[collectorName]) } } + + if config.EnabledGPU { + for _, collectorName := range consts.GPUMetricNames { + CollectResUtil(ch, instance, collectorName, collectors[collectorName]) + } + } } func collect(ch chan<- prometheus.Metric, collector metricfactory.PromMetric, value float64, labelValues []string) { @@ -116,9 +122,25 @@ func CollectResUtil(ch chan<- prometheus.Metric, instance interface{}, metricNam switch v := instance.(type) { case *stats.ContainerStats: container := instance.(*stats.ContainerStats) - value = float64(container.ResourceUsage[metricName].SumAllAggrValues()) - labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace} - collect(ch, collector, value, labelValues) + // special case for GPU devices, the metrics are reported per device + isGPUMetric := false + for _, m := range consts.GPUMetricNames { + if metricName == m { + isGPUMetric = true + break + } + } + if isGPUMetric { + for deviceID, utilization := range container.ResourceUsage[metricName].Stat { + value = float64(utilization.Aggr) + labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace, deviceID} + collect(ch, collector, value, labelValues) + } + } else { + value = float64(container.ResourceUsage[metricName].SumAllAggrValues()) + labelValues = []string{container.ContainerID, container.PodName, container.ContainerName, container.Namespace} + collect(ch, collector, value, labelValues) + } case *stats.ProcessStats: process := instance.(*stats.ProcessStats) diff --git a/pkg/sensors/accelerator/gpu/gpu.go b/pkg/sensors/accelerator/gpu/gpu.go index 7e7e1a3ffa..7def54d210 100644 --- 
a/pkg/sensors/accelerator/gpu/gpu.go +++ b/pkg/sensors/accelerator/gpu/gpu.go @@ -34,8 +34,18 @@ Then, we use gpu.go file to initialize the acceleratorImpl from power.go when gp // init initialize the acceleratorImpl and start it func init() { - acceleratorImpl = &gpu_source.GPUNvml{} + acceleratorImpl = &gpu_source.GPUDcgm{} err := acceleratorImpl.Init() + if err == nil { + klog.Infoln("Using dcgm to obtain gpu power") + // If the library was successfully initialized, we don't need to return an error in the Init() function + errLib = nil + return + } + // if dcgm fail to work, we use nvml + klog.Infof("Failed to init dcgm, err: %v\n", err) + acceleratorImpl = &gpu_source.GPUNvml{} + err = acceleratorImpl.Init() if err == nil { klog.Infoln("Using nvml to obtain gpu power") // If the library was successfully initialized, we don't need to return an error in the Init() function diff --git a/pkg/sensors/accelerator/gpu/power.go b/pkg/sensors/accelerator/gpu/power.go index f344d9a462..17ae228c18 100644 --- a/pkg/sensors/accelerator/gpu/power.go +++ b/pkg/sensors/accelerator/gpu/power.go @@ -39,11 +39,11 @@ type acceleratorInterface interface { // Shutdown stops the GPU metric collector Shutdown() bool // GetGpus returns a map with gpu device - GetGpus() []interface{} + GetGpus() map[string]interface{} // GetAbsEnergyFromGPU returns a map with mJ in each gpu device. Absolute energy is the sum of Idle + Dynamic energy. 
GetAbsEnergyFromGPU() []uint32 // GetProcessResourceUtilization returns a map of ProcessUtilizationSample where the key is the process pid - GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) + GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) // IsGPUCollectionSupported returns if it is possible to use this collector IsGPUCollectionSupported() bool // SetGPUCollectionSupported manually set if it is possible to use this collector. This is for testing purpose only. @@ -65,11 +65,11 @@ func Shutdown() bool { return true } -func GetGpus() []interface{} { +func GetGpus() map[string]interface{} { if acceleratorImpl != nil && config.EnabledGPU { return acceleratorImpl.GetGpus() } - return []interface{}{} + return map[string]interface{}{} } func GetAbsEnergyFromGPU() []uint32 { @@ -82,9 +82,9 @@ func GetAbsEnergyFromGPU() []uint32 { // GetProcessResourceUtilizationPerDevice tries to collect the GPU metrics. // There is a known issue that some clusters the nvidia GPU can stop to respod and we need to start it again. // See https://github.com/sustainable-computing-io/kepler/issues/610. 
-func GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) { +func GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]gpu_source.ProcessUtilizationSample, error) { if acceleratorImpl != nil && config.EnabledGPU { - processesUtilization, err := acceleratorImpl.GetProcessResourceUtilizationPerDevice(device, since) + processesUtilization, err := acceleratorImpl.GetProcessResourceUtilizationPerDevice(device, deviceName, since) if err != nil { klog.Infof("Failed to collect GPU metrics, trying to initizalize again: %v\n", err) err = acceleratorImpl.Init() diff --git a/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go b/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go new file mode 100644 index 0000000000..edad7a264b --- /dev/null +++ b/pkg/sensors/accelerator/gpu/source/gpu_dcgm.go @@ -0,0 +1,372 @@ +//go:build gpu +// +build gpu + +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package source + +import ( + "fmt" + "strconv" + "time" + + "github.com/NVIDIA/go-dcgm/pkg/dcgm" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/sustainable-computing-io/kepler/pkg/config" + "k8s.io/klog/v2" +) + +const ( + debugLevel = 5 +) + +var ( + deviceFields []dcgm.Short = []dcgm.Short{ + // https://docs.nvidia.com/datacenter/dcgm/1.7/dcgm-api/group__dcgmFieldIdentifiers.htm + dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, + } + deviceFieldsString = []string{ + "dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE", + } + ratioFields uint = dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE // this is the field that we will use to calculate the utilization per @yuezhu1 + SkipDCGMValue = "SKIPPING DCGM VALUE" + FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING" + gpuMigArray [][]MigDevice + totalMultiProcessorCount map[string]int +) + +type GPUDcgm struct { + collectionSupported bool + devices map[string]interface{} + deviceGroupName string + deviceGroupHandle dcgm.GroupHandle + fieldGroupName string + fieldGroupHandle dcgm.FieldHandle + pidGroupName string + pidGroupHandle dcgm.GroupHandle // TODO: wait till https://github.com/NVIDIA/go-dcgm/issues/59 is resolved + entities map[string]dcgm.GroupEntityPair + cleanup func() +} + +func (d *GPUDcgm) GetName() string { + return "dcgm" +} + +func (d *GPUDcgm) Init() error { + d.devices = make(map[string]interface{}) + d.entities = make(map[string]dcgm.GroupEntityPair) + + cleanup, err := dcgm.Init(dcgm.Embedded) + if err != nil { + if cleanup != nil { + cleanup() + } + return fmt.Errorf("not able to connect to DCGM: %s", err) + } + d.cleanup = cleanup + dcgm.FieldsInit() + + if err := d.initNVML(); err != nil { + d.Shutdown() + return err + } + + if err := d.createDeviceGroup(); err != nil { + d.Shutdown() + return err + } + + if err := d.addDevicesToGroup(); err != nil { + d.Shutdown() + return err + } + + if err := d.createFieldGroup(); err != nil { + d.Shutdown() + return err + } + + if err := d.setupWatcher(); err != nil { + 
d.Shutdown() + return err + } + klog.Infof("DCGM initialized successfully") + d.collectionSupported = true + return nil +} + +func (d *GPUDcgm) IsGPUCollectionSupported() bool { + return d.collectionSupported +} + +func (d *GPUDcgm) SetGPUCollectionSupported(supported bool) { + d.collectionSupported = supported +} + +func (d *GPUDcgm) Shutdown() bool { + nvml.Shutdown() + dcgm.FieldsTerm() + if d.deviceGroupName != "" { + dcgm.DestroyGroup(d.deviceGroupHandle) + } + if d.fieldGroupName != "" { + dcgm.FieldGroupDestroy(d.fieldGroupHandle) + } + if d.cleanup != nil { + d.cleanup() + } + d.collectionSupported = false + return true +} + +func (d *GPUDcgm) GetAbsEnergyFromGPU() []uint32 { + gpuEnergy := []uint32{} + for _, device := range d.devices { + power, ret := device.(nvml.Device).GetPowerUsage() + if ret != nvml.SUCCESS { + klog.V(2).Infof("failed to get power usage on device %v: %v\n", device, nvml.ErrorString(ret)) + continue + } + // since Kepler collects metrics at intervals of SamplePeriodSec, which is greater than 1 second, it is + // necessary to calculate the energy consumption for the entire waiting period + energy := uint32(uint64(power) * config.SamplePeriodSec) + gpuEnergy = append(gpuEnergy, energy) + } + return gpuEnergy +} + +func (d *GPUDcgm) GetGpus() map[string]interface{} { + return d.devices +} + +func (d *GPUDcgm) GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { + processAcceleratorMetrics := map[uint32]ProcessUtilizationSample{} + + if device == nil { // this is a MIG device, it is already tracked in the parent device + return processAcceleratorMetrics, nil + } + + var vals, miVals []dcgm.FieldValue_v1 + var err error + + klog.V(debugLevel).Infof("Device %v\n", deviceName) + + deviceIndex, strErr := strconv.Atoi(deviceName) + if strErr != nil { + klog.V(debugLevel).Infof("failed to convert %q to an integer: %v", deviceName, strErr) + return 
processAcceleratorMetrics, strErr + } + vals, err = dcgm.GetLatestValuesForFields(uint(deviceIndex), deviceFields) + if err != nil { + klog.V(debugLevel).Infof("failed to get latest values for fields: %v", err) + return processAcceleratorMetrics, err + } + gpuSMActive := uint32(0) + if err == nil { + for i, val := range vals { + value := ToString(val) + label := deviceFieldsString[i] + if val.FieldId == ratioFields { + smUtil, _ := strconv.ParseFloat(value, 32) + gpuSMActive = uint32(smUtil * 100) + } + klog.V(debugLevel).Infof("Device %v Label %v Val: %v", deviceName, label, ToString(val)) + } + klog.V(debugLevel).Infof("\n") + } + processInfo, ret := device.(nvml.Device).GetComputeRunningProcesses() + if ret != nvml.SUCCESS { + klog.V(debugLevel).Infof("failed to get running processes: %v", nvml.ErrorString(ret)) + return processAcceleratorMetrics, fmt.Errorf("failed to get running processes: %v", nvml.ErrorString(ret)) + } + for _, p := range processInfo { + // klog.V(debugLevel).Infof("pid: %d, memUtil: %d gpu instance id %d compute id %d\n", p.Pid, p.UsedGpuMemory, p.GpuInstanceId, p.ComputeInstanceId) + if p.GpuInstanceId > 0 { // this is a MIG, get it entity id and reads the related fields + entityName := gpuMigArray[deviceIndex][p.GpuInstanceId].EntityName + multiprocessorCountRatio := gpuMigArray[deviceIndex][p.GpuInstanceId].MultiprocessorCountRatio + mi := d.entities[entityName] + miVals, err = dcgm.EntityGetLatestValues(mi.EntityGroupId, mi.EntityId, deviceFields) + if err == nil { + for i, val := range miVals { + label := deviceFieldsString[i] + value := ToString(val) + klog.V(debugLevel).Infof("Device %v Label %v Val: %v", entityName, label, value) + if val.FieldId == ratioFields { + floatVal, _ := strconv.ParseFloat(value, 32) + // ratio of active multiprocessors to total multiprocessors + smUtil := uint32(floatVal * 100 * multiprocessorCountRatio) + klog.V(debugLevel).Infof("pid %d smUtil %d multiprocessor count ratio %v\n", p.Pid, smUtil, 
multiprocessorCountRatio) + processAcceleratorMetrics[p.Pid] = ProcessUtilizationSample{ + Pid: p.Pid, + TimeStamp: uint64(time.Now().UnixNano()), + SmUtil: smUtil, + } + } + } + klog.V(debugLevel).Infof("\n") + } + } else { + processAcceleratorMetrics[p.Pid] = ProcessUtilizationSample{ + Pid: p.Pid, + TimeStamp: uint64(time.Now().UnixNano()), + SmUtil: gpuSMActive, // if this is not a MIG, we will use the GPU SM active value. FIXME: what if there are multiple pids in the same GPU? + } + } + } + + return processAcceleratorMetrics, nil +} + +// helper functions +func (d *GPUDcgm) initNVML() error { + if ret := nvml.Init(); ret != nvml.SUCCESS { + d.collectionSupported = false + d.Shutdown() + return fmt.Errorf("failed to init nvml. %s", nvmlErrorString(ret)) + } + return nil +} + +func (d *GPUDcgm) createDeviceGroup() error { + deviceGroupName := "kepler-exporter-" + time.Now().Format("2006-01-02-15-04-05") + deviceGroup, err := dcgm.CreateGroup(deviceGroupName) + if err != nil { + return fmt.Errorf("failed to create group %q: %v", deviceGroupName, err) + } + d.deviceGroupName = deviceGroupName + d.deviceGroupHandle = deviceGroup + klog.Infof("Created device group %q", deviceGroupName) + return nil +} + +func (d *GPUDcgm) addDevicesToGroup() error { + supportedDeviceIndices, err := dcgm.GetSupportedDevices() + if err != nil { + return fmt.Errorf("failed to find supported devices: %v", err) + } + klog.V(debugLevel).Infof("found %d supported devices", len(supportedDeviceIndices)) + for _, gpuIndex := range supportedDeviceIndices { + err = dcgm.AddEntityToGroup(d.deviceGroupHandle, dcgm.FE_GPU, gpuIndex) + if err != nil { + klog.Infof("failed to add device %d to group %q: %v", gpuIndex, d.deviceGroupName, err) + } else { + device, ret := nvml.DeviceGetHandleByIndex(int(gpuIndex)) + if ret != nvml.SUCCESS { + klog.Infof("failed to get nvml device %d: %v ", gpuIndex, nvml.ErrorString(ret)) + continue + } + d.devices[fmt.Sprintf("%v", gpuIndex)] = device + 
d.entities[fmt.Sprintf("%v", gpuIndex)] = dcgm.GroupEntityPair{dcgm.FE_GPU, gpuIndex} + } + } + + // add entity to the group + hierarchy, err := dcgm.GetGpuInstanceHierarchy() + if err != nil { + d.Shutdown() + return fmt.Errorf("failed to get gpu hierachy: %v", err) + } + + if hierarchy.Count > 0 { + // if MIG is enabled, we need to know the hierarchy as well as the multiprocessor count in each device. + // we will use the multiprocessor count to calculate the utilization of each instance + if gpuMigArray, totalMultiProcessorCount, err = RetriveFromNvidiaSMI(false); err != nil { + klog.Infof("failed to retrive from nvidia-smi: %v", err) + // if we cannot get the multiprocessor count, we will not be able to calculate the utilization + } + for i := uint(0); i < hierarchy.Count; i++ { + if hierarchy.EntityList[i].Parent.EntityGroupId == dcgm.FE_GPU { + // add a GPU instance + info := hierarchy.EntityList[i].Info + entityId := hierarchy.EntityList[i].Entity.EntityId + gpuId := hierarchy.EntityList[i].Parent.EntityId + klog.V(debugLevel).Infof("gpu id %v entity id %v gpu index %v instance id %v", gpuId, entityId, info.NvmlGpuIndex, info.NvmlInstanceId) + entityName := fmt.Sprintf("entity-%d", entityId) + gpuMigArray[info.NvmlGpuIndex][info.NvmlInstanceId].EntityName = entityName + err = dcgm.AddEntityToGroup(d.deviceGroupHandle, dcgm.FE_GPU_I, entityId) + d.entities[entityName] = dcgm.GroupEntityPair{dcgm.FE_GPU_I, entityId} + klog.V(debugLevel).Infof("Adding GPU instance %d, err: %v", entityId, err) + } + } + } + return nil +} + +func (d *GPUDcgm) createFieldGroup() error { + fieldGroupName := "kepler-exporter-" + time.Now().Format("2006-01-02-15-04-05") + fieldGroup, err := dcgm.FieldGroupCreate(fieldGroupName, deviceFields) + if err != nil { + return fmt.Errorf("failed to create field group %q: %v", fieldGroupName, err) + } + d.fieldGroupName = fieldGroupName + d.fieldGroupHandle = fieldGroup + return nil +} + +func (d *GPUDcgm) setupWatcher() error { + // watch 
interval has an impact on cpu usage, set it carefully + err := dcgm.WatchFieldsWithGroupEx(d.fieldGroupHandle, d.deviceGroupHandle, int64(1000)*1000, 0.0, 1) + if err != nil { + return fmt.Errorf("failed to set up watcher, err %v", err) + } + return nil +} + +// ToString converts a dcgm.FieldValue_v1 to a string +// credit to dcgm_exporter +func ToString(value dcgm.FieldValue_v1) string { + switch v := value.Int64(); v { + case dcgm.DCGM_FT_INT32_BLANK: + return SkipDCGMValue + case dcgm.DCGM_FT_INT32_NOT_FOUND: + return SkipDCGMValue + case dcgm.DCGM_FT_INT32_NOT_SUPPORTED: + return SkipDCGMValue + case dcgm.DCGM_FT_INT32_NOT_PERMISSIONED: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_BLANK: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_NOT_FOUND: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_NOT_SUPPORTED: + return SkipDCGMValue + case dcgm.DCGM_FT_INT64_NOT_PERMISSIONED: + return SkipDCGMValue + } + switch v := value.Float64(); v { + case dcgm.DCGM_FT_FP64_BLANK: + return SkipDCGMValue + case dcgm.DCGM_FT_FP64_NOT_FOUND: + return SkipDCGMValue + case dcgm.DCGM_FT_FP64_NOT_SUPPORTED: + return SkipDCGMValue + case dcgm.DCGM_FT_FP64_NOT_PERMISSIONED: + return SkipDCGMValue + } + switch v := value.FieldType; v { + case dcgm.DCGM_FT_STRING: + return value.String() + case dcgm.DCGM_FT_DOUBLE: + return fmt.Sprintf("%f", value.Float64()) + case dcgm.DCGM_FT_INT64: + return fmt.Sprintf("%d", value.Int64()) + default: + return FailedToConvert + } + + return FailedToConvert +} diff --git a/pkg/sensors/accelerator/gpu/source/gpu_dummy.go b/pkg/sensors/accelerator/gpu/source/gpu_dummy.go index aa7237ee78..c9200c2b9d 100644 --- a/pkg/sensors/accelerator/gpu/source/gpu_dummy.go +++ b/pkg/sensors/accelerator/gpu/source/gpu_dummy.go @@ -21,8 +21,6 @@ package source import ( "time" - - "github.com/NVIDIA/go-nvml/pkg/nvml" ) type GPUDummy struct { @@ -47,13 +45,12 @@ func (d *GPUDummy) GetAbsEnergyFromGPU() []uint32 { return []uint32{} } -func (d *GPUDummy) GetGpus() 
[]interface{} { - var devices []interface{} - devices = append(devices, nvml.Device{}) +func (d *GPUDummy) GetGpus() map[string]interface{} { + var devices map[string]interface{} return devices } -func (n *GPUDummy) GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { +func (n *GPUDummy) GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { processAcceleratorMetrics := map[uint32]ProcessUtilizationSample{} processAcceleratorMetrics[0] = ProcessUtilizationSample{ Pid: 0, diff --git a/pkg/sensors/accelerator/gpu/source/gpu_nvml.go b/pkg/sensors/accelerator/gpu/source/gpu_nvml.go index b50dd0def9..a078c2d136 100644 --- a/pkg/sensors/accelerator/gpu/source/gpu_nvml.go +++ b/pkg/sensors/accelerator/gpu/source/gpu_nvml.go @@ -30,7 +30,9 @@ import ( var ( // List of GPU identifiers for the device - devices []interface{} + devices map[string]interface{} + // bool to check if the process utilization collection is supported + processUtilizationSupported bool = true ) type GPUNvml struct { @@ -64,7 +66,7 @@ func (n *GPUNvml) Init() (err error) { return err } klog.Infof("found %d gpu devices\n", count) - devices = make([]interface{}, count) + devices = make(map[string]interface{}, count) for i := 0; i < count; i++ { device, ret := nvml.DeviceGetHandleByIndex(i) if ret != nvml.SUCCESS { @@ -74,8 +76,9 @@ func (n *GPUNvml) Init() (err error) { return err } name, _ := device.GetName() - klog.Infoln("GPU", i, name) - devices[i] = device + uuid, _ := device.GetUUID() + klog.Infof("GPU %v %q %q", i, name, uuid) + devices[uuid] = device } n.collectionSupported = true return nil @@ -87,7 +90,7 @@ func (n *GPUNvml) Shutdown() bool { } // GetGpus returns a map with gpu device -func (n *GPUNvml) GetGpus() []interface{} { +func (n *GPUNvml) GetGpus() map[string]interface{} { return devices } @@ -112,33 +115,60 @@ func (n 
*GPUNvml) GetAbsEnergyFromGPU() []uint32 { // // ProcessUtilizationSample.SmUtil represents the process Streaming Multiprocessors - SM (3D/Compute) utilization in percentage. // ProcessUtilizationSample.MemUtil represents the process Frame Buffer Memory utilization Value. -func (n *GPUNvml) GetProcessResourceUtilizationPerDevice(device interface{}, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { +func (n *GPUNvml) GetProcessResourceUtilizationPerDevice(device interface{}, deviceName string, since time.Duration) (map[uint32]ProcessUtilizationSample, error) { processAcceleratorMetrics := map[uint32]ProcessUtilizationSample{} lastUtilizationTimestamp := uint64(time.Now().Add(-1*since).UnixNano() / 1000) - processUtilizationSample, ret := device.(nvml.Device).GetProcessUtilization(lastUtilizationTimestamp) - if ret != nvml.SUCCESS { - if ret == nvml.ERROR_NOT_FOUND { - // ignore the error if there is no process running in the GPU - return nil, nil + if processUtilizationSupported { + processUtilizationSample, ret := device.(nvml.Device).GetProcessUtilization(lastUtilizationTimestamp) + if ret != nvml.SUCCESS { + if ret == nvml.ERROR_NOT_FOUND { + // ignore the error if there is no process running in the GPU + return nil, nil + } + processUtilizationSupported = false + } else { + for _, pinfo := range processUtilizationSample { + // pid 0 means no data. + if pinfo.Pid != 0 { + processAcceleratorMetrics[pinfo.Pid] = ProcessUtilizationSample{ + Pid: pinfo.Pid, + TimeStamp: pinfo.TimeStamp, + SmUtil: pinfo.SmUtil, + MemUtil: pinfo.MemUtil, + EncUtil: pinfo.EncUtil, + DecUtil: pinfo.DecUtil, + } + } + } } - return nil, fmt.Errorf("failed to get processes' utilization on device %v: %v", device, nvml.ErrorString(ret)) } - - for _, pinfo := range processUtilizationSample { - // pid 0 means no data. 
- if pinfo.Pid != 0 { - processAcceleratorMetrics[pinfo.Pid] = ProcessUtilizationSample{ - Pid: pinfo.Pid, - TimeStamp: pinfo.TimeStamp, - SmUtil: pinfo.SmUtil, - MemUtil: pinfo.MemUtil, - EncUtil: pinfo.EncUtil, - DecUtil: pinfo.DecUtil, + if !processUtilizationSupported { // if processUtilizationSupported is false, try deviceGetMPSComputeRunningProcesses_v3 to use memory usage to ratio power usage + config.GpuUsageMetric = config.GPUMemUtilization + processInfo, ret := device.(nvml.Device).GetComputeRunningProcesses() + if ret != nvml.SUCCESS { + if ret == nvml.ERROR_NOT_FOUND { + // ignore the error if there is no process running in the GPU + return nil, nil + } + return nil, fmt.Errorf("failed to get processes' utilization on device %v: %v", device, nvml.ErrorString(ret)) + } + memoryInfo, ret := device.(nvml.Device).GetMemoryInfo() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get memory info on device %v: %v", device, nvml.ErrorString(ret)) + } + // convert processInfo to processUtilizationSample + for _, pinfo := range processInfo { + // pid 0 means no data. + if pinfo.Pid != 0 { + processAcceleratorMetrics[pinfo.Pid] = ProcessUtilizationSample{ + Pid: pinfo.Pid, + MemUtil: uint32(pinfo.UsedGpuMemory * 100 / memoryInfo.Total), + } + klog.V(1).Infof("pid: %d, memUtil: %d gpu instance %d compute instance %d\n", pinfo.Pid, processAcceleratorMetrics[pinfo.Pid].MemUtil, pinfo.GpuInstanceId, pinfo.ComputeInstanceId) } } } - return processAcceleratorMetrics, nil } diff --git a/pkg/sensors/accelerator/gpu/source/nvml_util.go b/pkg/sensors/accelerator/gpu/source/nvml_util.go new file mode 100644 index 0000000000..3c87a373b1 --- /dev/null +++ b/pkg/sensors/accelerator/gpu/source/nvml_util.go @@ -0,0 +1,116 @@ +//go:build gpu +// +build gpu + +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package source + +import ( + "encoding/xml" + "fmt" + "os/exec" + + "k8s.io/klog/v2" +) + +type NvidiaSmiLog struct { + XMLName xml.Name `xml:"nvidia_smi_log"` + Timestamp string `xml:"timestamp,omitempty"` + DriverVersion string `xml:"driver_version,omitempty"` + CudaVersion string `xml:"cuda_version,omitempty"` + AttachedGPUs int `xml:"attached_gpus,omitempty"` + GPU []GPU `xml:"gpu"` +} + +type GPU struct { + ID string `xml:"id,attr"` + MigMode MigMode `xml:"mig_mode,omitempty"` + MigDevices []MigDevice `xml:"mig_devices>mig_device,omitempty"` + UUID string `xml:"uuid,omitempty"` +} + +type MigMode struct { + CurrentMig string `xml:"current_mig,omitempty"` + PendingMig string `xml:"pending_mig,omitempty"` +} + +type MigDevice struct { + GPUInstanceID int `xml:"gpu_instance_id,omitempty"` + ComputeInstanceID int `xml:"compute_instance_id,omitempty"` + DeviceAttributes DeviceAttributes `xml:"device_attributes,omitempty"` + EntityName string // this is set later + MultiprocessorCountRatio float64 // this is set later +} + +type DeviceAttributes struct { + Shared SharedAttributes `xml:"shared,omitempty"` +} + +type SharedAttributes struct { + MultiprocessorCount int `xml:"multiprocessor_count,omitempty"` +} + +// RetriveFromNvidiaSMI retrives the MIG information from nvidia-smi +func RetriveFromNvidiaSMI(debug bool) (gpuMigArray [][]MigDevice, totalMultiProcessorCount map[string]int, err error) { + cmd := exec.Command("nvidia-smi", "-q", "-x") + output, err := cmd.Output() + if err != nil { + err = fmt.Errorf("Error running nvidia-smi command:", err) + return + } + + 
var nvidiaSmiLog NvidiaSmiLog + err = xml.Unmarshal(output, &nvidiaSmiLog) + if err != nil { + err = fmt.Errorf("Error unmarshaling XML:", err) + return + } + + gpuMigArray = make([][]MigDevice, len(nvidiaSmiLog.GPU)) + totalMultiProcessorCount = make(map[string]int, len(nvidiaSmiLog.GPU)) + for i, gpu := range nvidiaSmiLog.GPU { + // find the largest GPUInstanceID among the MIGDevices, to make sure we have enough space in the array + maxGPUInstanceID := 0 + for _, migDevice := range gpu.MigDevices { + if migDevice.GPUInstanceID > maxGPUInstanceID { + maxGPUInstanceID = migDevice.GPUInstanceID + } + } + gpuMigArray[i] = make([]MigDevice, maxGPUInstanceID+1) + totalMultiProcessorCount[gpu.UUID] = 0 + for _, migDevice := range gpu.MigDevices { + gpuMigArray[i][migDevice.GPUInstanceID] = migDevice + totalMultiProcessorCount[gpu.UUID] += migDevice.DeviceAttributes.Shared.MultiprocessorCount + } + // count MultiprocessorCountRatio for each device + for j, migDevice := range gpuMigArray[i] { + gpuMigArray[i][j].MultiprocessorCountRatio = float64(migDevice.DeviceAttributes.Shared.MultiprocessorCount) / float64(totalMultiProcessorCount[gpu.UUID]) + } + } + + if debug { + for i, gpu := range nvidiaSmiLog.GPU { + for _, device := range gpuMigArray[i] { + klog.Infof("GPU %d %q", i, gpu.UUID) + klog.Infof("\tGPUInstanceID: %d\n", device.GPUInstanceID) + klog.Infof("\tComputeInstanceID: %d\n", device.ComputeInstanceID) + klog.Infof("\tShared MultiprocessorCount: %d\n", device.DeviceAttributes.Shared.MultiprocessorCount) + klog.Infof("\tShared MultiprocessorCountRatio: %f\n", device.MultiprocessorCountRatio) + } + } + } + return +} diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index ffe7d38685..0166131d3c 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -49,6 +49,7 @@ const ( SystemProcessNamespace string = "system" EmptyString string = "" GenericSocketID string = "socket0" + GenericGPUID string = "gpu" ) func GetPathFromPID(searchPath string, pid uint64) 
(string, error) { diff --git a/vendor/github.com/Masterminds/semver/.travis.yml b/vendor/github.com/Masterminds/semver/.travis.yml new file mode 100644 index 0000000000..096369d44d --- /dev/null +++ b/vendor/github.com/Masterminds/semver/.travis.yml @@ -0,0 +1,29 @@ +language: go + +go: + - 1.6.x + - 1.7.x + - 1.8.x + - 1.9.x + - 1.10.x + - 1.11.x + - 1.12.x + - tip + +# Setting sudo access to false will let Travis CI use containers rather than +# VMs to run the tests. For more details see: +# - http://docs.travis-ci.com/user/workers/container-based-infrastructure/ +# - http://docs.travis-ci.com/user/workers/standard-infrastructure/ +sudo: false + +script: + - make setup + - make test + +notifications: + webhooks: + urls: + - https://webhooks.gitter.im/e/06e3328629952dabe3e0 + on_success: change # options: [always|never|change] default: always + on_failure: always # options: [always|never|change] default: always + on_start: never # options: [always|never|change] default: always diff --git a/vendor/github.com/Masterminds/semver/CHANGELOG.md b/vendor/github.com/Masterminds/semver/CHANGELOG.md new file mode 100644 index 0000000000..e405c9a84d --- /dev/null +++ b/vendor/github.com/Masterminds/semver/CHANGELOG.md @@ -0,0 +1,109 @@ +# 1.5.0 (2019-09-11) + +## Added + +- #103: Add basic fuzzing for `NewVersion()` (thanks @jesse-c) + +## Changed + +- #82: Clarify wildcard meaning in range constraints and update tests for it (thanks @greysteil) +- #83: Clarify caret operator range for pre-1.0.0 dependencies (thanks @greysteil) +- #72: Adding docs comment pointing to vert for a cli +- #71: Update the docs on pre-release comparator handling +- #89: Test with new go versions (thanks @thedevsaddam) +- #87: Added $ to ValidPrerelease for better validation (thanks @jeremycarroll) + +## Fixed + +- #78: Fix unchecked error in example code (thanks @ravron) +- #70: Fix the handling of pre-releases and the 0.0.0 release edge case +- #97: Fixed copyright file for proper display on 
GitHub +- #107: Fix handling prerelease when sorting alphanum and num +- #109: Fixed where Validate sometimes returns wrong message on error + +# 1.4.2 (2018-04-10) + +## Changed +- #72: Updated the docs to point to vert for a console appliaction +- #71: Update the docs on pre-release comparator handling + +## Fixed +- #70: Fix the handling of pre-releases and the 0.0.0 release edge case + +# 1.4.1 (2018-04-02) + +## Fixed +- Fixed #64: Fix pre-release precedence issue (thanks @uudashr) + +# 1.4.0 (2017-10-04) + +## Changed +- #61: Update NewVersion to parse ints with a 64bit int size (thanks @zknill) + +# 1.3.1 (2017-07-10) + +## Fixed +- Fixed #57: number comparisons in prerelease sometimes inaccurate + +# 1.3.0 (2017-05-02) + +## Added +- #45: Added json (un)marshaling support (thanks @mh-cbon) +- Stability marker. See https://masterminds.github.io/stability/ + +## Fixed +- #51: Fix handling of single digit tilde constraint (thanks @dgodd) + +## Changed +- #55: The godoc icon moved from png to svg + +# 1.2.3 (2017-04-03) + +## Fixed +- #46: Fixed 0.x.x and 0.0.x in constraints being treated as * + +# Release 1.2.2 (2016-12-13) + +## Fixed +- #34: Fixed issue where hyphen range was not working with pre-release parsing. + +# Release 1.2.1 (2016-11-28) + +## Fixed +- #24: Fixed edge case issue where constraint "> 0" does not handle "0.0.1-alpha" + properly. + +# Release 1.2.0 (2016-11-04) + +## Added +- #20: Added MustParse function for versions (thanks @adamreese) +- #15: Added increment methods on versions (thanks @mh-cbon) + +## Fixed +- Issue #21: Per the SemVer spec (section 9) a pre-release is unstable and + might not satisfy the intended compatibility. The change here ignores pre-releases + on constraint checks (e.g., ~ or ^) when a pre-release is not part of the + constraint. For example, `^1.2.3` will ignore pre-releases while + `^1.2.3-alpha` will include them. 
+ +# Release 1.1.1 (2016-06-30) + +## Changed +- Issue #9: Speed up version comparison performance (thanks @sdboyer) +- Issue #8: Added benchmarks (thanks @sdboyer) +- Updated Go Report Card URL to new location +- Updated Readme to add code snippet formatting (thanks @mh-cbon) +- Updating tagging to v[SemVer] structure for compatibility with other tools. + +# Release 1.1.0 (2016-03-11) + +- Issue #2: Implemented validation to provide reasons a versions failed a + constraint. + +# Release 1.0.1 (2015-12-31) + +- Fixed #1: * constraint failing on valid versions. + +# Release 1.0.0 (2015-10-20) + +- Initial release diff --git a/vendor/github.com/Masterminds/semver/LICENSE.txt b/vendor/github.com/Masterminds/semver/LICENSE.txt new file mode 100644 index 0000000000..9ff7da9c48 --- /dev/null +++ b/vendor/github.com/Masterminds/semver/LICENSE.txt @@ -0,0 +1,19 @@ +Copyright (C) 2014-2019, Matt Butcher and Matt Farina + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/vendor/github.com/Masterminds/semver/Makefile b/vendor/github.com/Masterminds/semver/Makefile new file mode 100644 index 0000000000..a7a1b4e36d --- /dev/null +++ b/vendor/github.com/Masterminds/semver/Makefile @@ -0,0 +1,36 @@ +.PHONY: setup +setup: + go get -u gopkg.in/alecthomas/gometalinter.v1 + gometalinter.v1 --install + +.PHONY: test +test: validate lint + @echo "==> Running tests" + go test -v + +.PHONY: validate +validate: + @echo "==> Running static validations" + @gometalinter.v1 \ + --disable-all \ + --enable deadcode \ + --severity deadcode:error \ + --enable gofmt \ + --enable gosimple \ + --enable ineffassign \ + --enable misspell \ + --enable vet \ + --tests \ + --vendor \ + --deadline 60s \ + ./... || exit_code=1 + +.PHONY: lint +lint: + @echo "==> Running linters" + @gometalinter.v1 \ + --disable-all \ + --enable golint \ + --vendor \ + --deadline 60s \ + ./... || : diff --git a/vendor/github.com/Masterminds/semver/README.md b/vendor/github.com/Masterminds/semver/README.md new file mode 100644 index 0000000000..1b52d2f436 --- /dev/null +++ b/vendor/github.com/Masterminds/semver/README.md @@ -0,0 +1,194 @@ +# SemVer + +The `semver` package provides the ability to work with [Semantic Versions](http://semver.org) in Go. 
Specifically it provides the ability to: + +* Parse semantic versions +* Sort semantic versions +* Check if a semantic version fits within a set of constraints +* Optionally work with a `v` prefix + +[![Stability: +Active](https://masterminds.github.io/stability/active.svg)](https://masterminds.github.io/stability/active.html) +[![Build Status](https://travis-ci.org/Masterminds/semver.svg)](https://travis-ci.org/Masterminds/semver) [![Build status](https://ci.appveyor.com/api/projects/status/jfk66lib7hb985k8/branch/master?svg=true&passingText=windows%20build%20passing&failingText=windows%20build%20failing)](https://ci.appveyor.com/project/mattfarina/semver/branch/master) [![GoDoc](https://godoc.org/github.com/Masterminds/semver?status.svg)](https://godoc.org/github.com/Masterminds/semver) [![Go Report Card](https://goreportcard.com/badge/github.com/Masterminds/semver)](https://goreportcard.com/report/github.com/Masterminds/semver) + +If you are looking for a command line tool for version comparisons please see +[vert](https://github.com/Masterminds/vert) which uses this library. + +## Parsing Semantic Versions + +To parse a semantic version use the `NewVersion` function. For example, + +```go + v, err := semver.NewVersion("1.2.3-beta.1+build345") +``` + +If there is an error the version wasn't parseable. The version object has methods +to get the parts of the version, compare it to other versions, convert the +version back into a string, and get the original string. For more details +please see the [documentation](https://godoc.org/github.com/Masterminds/semver). + +## Sorting Semantic Versions + +A set of versions can be sorted using the [`sort`](https://golang.org/pkg/sort/) +package from the standard library. 
For example, + +```go + raw := []string{"1.2.3", "1.0", "1.3", "2", "0.4.2",} + vs := make([]*semver.Version, len(raw)) + for i, r := range raw { + v, err := semver.NewVersion(r) + if err != nil { + t.Errorf("Error parsing version: %s", err) + } + + vs[i] = v + } + + sort.Sort(semver.Collection(vs)) +``` + +## Checking Version Constraints + +Checking a version against version constraints is one of the most featureful +parts of the package. + +```go + c, err := semver.NewConstraint(">= 1.2.3") + if err != nil { + // Handle constraint not being parseable. + } + + v, _ := semver.NewVersion("1.3") + if err != nil { + // Handle version not being parseable. + } + // Check if the version meets the constraints. The a variable will be true. + a := c.Check(v) +``` + +## Basic Comparisons + +There are two elements to the comparisons. First, a comparison string is a list +of comma separated and comparisons. These are then separated by || separated or +comparisons. For example, `">= 1.2, < 3.0.0 || >= 4.2.3"` is looking for a +comparison that's greater than or equal to 1.2 and less than 3.0.0 or is +greater than or equal to 4.2.3. + +The basic comparisons are: + +* `=`: equal (aliased to no operator) +* `!=`: not equal +* `>`: greater than +* `<`: less than +* `>=`: greater than or equal to +* `<=`: less than or equal to + +## Working With Pre-release Versions + +Pre-releases, for those not familiar with them, are used for software releases +prior to stable or generally available releases. Examples of pre-releases include +development, alpha, beta, and release candidate releases. A pre-release may be +a version such as `1.2.3-beta.1` while the stable release would be `1.2.3`. In the +order of precidence, pre-releases come before their associated releases. In this +example `1.2.3-beta.1 < 1.2.3`. + +According to the Semantic Version specification pre-releases may not be +API compliant with their release counterpart. 
It says, + +> A pre-release version indicates that the version is unstable and might not satisfy the intended compatibility requirements as denoted by its associated normal version. + +SemVer comparisons without a pre-release comparator will skip pre-release versions. +For example, `>=1.2.3` will skip pre-releases when looking at a list of releases +while `>=1.2.3-0` will evaluate and find pre-releases. + +The reason for the `0` as a pre-release version in the example comparison is +because pre-releases can only contain ASCII alphanumerics and hyphens (along with +`.` separators), per the spec. Sorting happens in ASCII sort order, again per the spec. The lowest character is a `0` in ASCII sort order (see an [ASCII Table](http://www.asciitable.com/)) + +Understanding ASCII sort ordering is important because A-Z comes before a-z. That +means `>=1.2.3-BETA` will return `1.2.3-alpha`. What you might expect from case +sensitivity doesn't apply here. This is due to ASCII sort ordering which is what +the spec specifies. + +## Hyphen Range Comparisons + +There are multiple methods to handle ranges and the first is hyphens ranges. +These look like: + +* `1.2 - 1.4.5` which is equivalent to `>= 1.2, <= 1.4.5` +* `2.3.4 - 4.5` which is equivalent to `>= 2.3.4, <= 4.5` + +## Wildcards In Comparisons + +The `x`, `X`, and `*` characters can be used as a wildcard character. This works +for all comparison operators. When used on the `=` operator it falls +back to the pack level comparison (see tilde below). For example, + +* `1.2.x` is equivalent to `>= 1.2.0, < 1.3.0` +* `>= 1.2.x` is equivalent to `>= 1.2.0` +* `<= 2.x` is equivalent to `< 3` +* `*` is equivalent to `>= 0.0.0` + +## Tilde Range Comparisons (Patch) + +The tilde (`~`) comparison operator is for patch level ranges when a minor +version is specified and major level changes when the minor number is missing. 
+For example, + +* `~1.2.3` is equivalent to `>= 1.2.3, < 1.3.0` +* `~1` is equivalent to `>= 1, < 2` +* `~2.3` is equivalent to `>= 2.3, < 2.4` +* `~1.2.x` is equivalent to `>= 1.2.0, < 1.3.0` +* `~1.x` is equivalent to `>= 1, < 2` + +## Caret Range Comparisons (Major) + +The caret (`^`) comparison operator is for major level changes. This is useful +when comparisons of API versions as a major change is API breaking. For example, + +* `^1.2.3` is equivalent to `>= 1.2.3, < 2.0.0` +* `^0.0.1` is equivalent to `>= 0.0.1, < 1.0.0` +* `^1.2.x` is equivalent to `>= 1.2.0, < 2.0.0` +* `^2.3` is equivalent to `>= 2.3, < 3` +* `^2.x` is equivalent to `>= 2.0.0, < 3` + +# Validation + +In addition to testing a version against a constraint, a version can be validated +against a constraint. When validation fails a slice of errors containing why a +version didn't meet the constraint is returned. For example, + +```go + c, err := semver.NewConstraint("<= 1.2.3, >= 1.4") + if err != nil { + // Handle constraint not being parseable. + } + + v, _ := semver.NewVersion("1.3") + if err != nil { + // Handle version not being parseable. + } + + // Validate a version against a constraint. + a, msgs := c.Validate(v) + // a is false + for _, m := range msgs { + fmt.Println(m) + + // Loops over the errors which would read + // "1.3 is greater than 1.2.3" + // "1.3 is less than 1.4" + } +``` + +# Fuzzing + + [dvyukov/go-fuzz](https://github.com/dvyukov/go-fuzz) is used for fuzzing. + +1. `go-fuzz-build` +2. `go-fuzz -workdir=fuzz` + +# Contribute + +If you find an issue or want to contribute please file an [issue](https://github.com/Masterminds/semver/issues) +or [create a pull request](https://github.com/Masterminds/semver/pulls). 
diff --git a/vendor/github.com/Masterminds/semver/appveyor.yml b/vendor/github.com/Masterminds/semver/appveyor.yml new file mode 100644 index 0000000000..b2778df15a --- /dev/null +++ b/vendor/github.com/Masterminds/semver/appveyor.yml @@ -0,0 +1,44 @@ +version: build-{build}.{branch} + +clone_folder: C:\gopath\src\github.com\Masterminds\semver +shallow_clone: true + +environment: + GOPATH: C:\gopath + +platform: + - x64 + +install: + - go version + - go env + - go get -u gopkg.in/alecthomas/gometalinter.v1 + - set PATH=%PATH%;%GOPATH%\bin + - gometalinter.v1.exe --install + +build_script: + - go install -v ./... + +test_script: + - "gometalinter.v1 \ + --disable-all \ + --enable deadcode \ + --severity deadcode:error \ + --enable gofmt \ + --enable gosimple \ + --enable ineffassign \ + --enable misspell \ + --enable vet \ + --tests \ + --vendor \ + --deadline 60s \ + ./... || exit_code=1" + - "gometalinter.v1 \ + --disable-all \ + --enable golint \ + --vendor \ + --deadline 60s \ + ./... || :" + - go test -v + +deploy: off diff --git a/vendor/github.com/Masterminds/semver/collection.go b/vendor/github.com/Masterminds/semver/collection.go new file mode 100644 index 0000000000..a78235895f --- /dev/null +++ b/vendor/github.com/Masterminds/semver/collection.go @@ -0,0 +1,24 @@ +package semver + +// Collection is a collection of Version instances and implements the sort +// interface. See the sort package for more details. +// https://golang.org/pkg/sort/ +type Collection []*Version + +// Len returns the length of a collection. The number of Version instances +// on the slice. +func (c Collection) Len() int { + return len(c) +} + +// Less is needed for the sort interface to compare two Version objects on the +// slice. If checks if one is less than the other. +func (c Collection) Less(i, j int) bool { + return c[i].LessThan(c[j]) +} + +// Swap is needed for the sort interface to replace the Version objects +// at two different positions in the slice. 
+func (c Collection) Swap(i, j int) { + c[i], c[j] = c[j], c[i] +} diff --git a/vendor/github.com/Masterminds/semver/constraints.go b/vendor/github.com/Masterminds/semver/constraints.go new file mode 100644 index 0000000000..b94b93413f --- /dev/null +++ b/vendor/github.com/Masterminds/semver/constraints.go @@ -0,0 +1,423 @@ +package semver + +import ( + "errors" + "fmt" + "regexp" + "strings" +) + +// Constraints is one or more constraint that a semantic version can be +// checked against. +type Constraints struct { + constraints [][]*constraint +} + +// NewConstraint returns a Constraints instance that a Version instance can +// be checked against. If there is a parse error it will be returned. +func NewConstraint(c string) (*Constraints, error) { + + // Rewrite - ranges into a comparison operation. + c = rewriteRange(c) + + ors := strings.Split(c, "||") + or := make([][]*constraint, len(ors)) + for k, v := range ors { + cs := strings.Split(v, ",") + result := make([]*constraint, len(cs)) + for i, s := range cs { + pc, err := parseConstraint(s) + if err != nil { + return nil, err + } + + result[i] = pc + } + or[k] = result + } + + o := &Constraints{constraints: or} + return o, nil +} + +// Check tests if a version satisfies the constraints. +func (cs Constraints) Check(v *Version) bool { + // loop over the ORs and check the inner ANDs + for _, o := range cs.constraints { + joy := true + for _, c := range o { + if !c.check(v) { + joy = false + break + } + } + + if joy { + return true + } + } + + return false +} + +// Validate checks if a version satisfies a constraint. If not a slice of +// reasons for the failure are returned in addition to a bool. +func (cs Constraints) Validate(v *Version) (bool, []error) { + // loop over the ORs and check the inner ANDs + var e []error + + // Capture the prerelease message only once. 
When it happens the first time + // this var is marked + var prerelesase bool + for _, o := range cs.constraints { + joy := true + for _, c := range o { + // Before running the check handle the case there the version is + // a prerelease and the check is not searching for prereleases. + if c.con.pre == "" && v.pre != "" { + if !prerelesase { + em := fmt.Errorf("%s is a prerelease version and the constraint is only looking for release versions", v) + e = append(e, em) + prerelesase = true + } + joy = false + + } else { + + if !c.check(v) { + em := fmt.Errorf(c.msg, v, c.orig) + e = append(e, em) + joy = false + } + } + } + + if joy { + return true, []error{} + } + } + + return false, e +} + +var constraintOps map[string]cfunc +var constraintMsg map[string]string +var constraintRegex *regexp.Regexp + +func init() { + constraintOps = map[string]cfunc{ + "": constraintTildeOrEqual, + "=": constraintTildeOrEqual, + "!=": constraintNotEqual, + ">": constraintGreaterThan, + "<": constraintLessThan, + ">=": constraintGreaterThanEqual, + "=>": constraintGreaterThanEqual, + "<=": constraintLessThanEqual, + "=<": constraintLessThanEqual, + "~": constraintTilde, + "~>": constraintTilde, + "^": constraintCaret, + } + + constraintMsg = map[string]string{ + "": "%s is not equal to %s", + "=": "%s is not equal to %s", + "!=": "%s is equal to %s", + ">": "%s is less than or equal to %s", + "<": "%s is greater than or equal to %s", + ">=": "%s is less than %s", + "=>": "%s is less than %s", + "<=": "%s is greater than %s", + "=<": "%s is greater than %s", + "~": "%s does not have same major and minor version as %s", + "~>": "%s does not have same major and minor version as %s", + "^": "%s does not have same major version as %s", + } + + ops := make([]string, 0, len(constraintOps)) + for k := range constraintOps { + ops = append(ops, regexp.QuoteMeta(k)) + } + + constraintRegex = regexp.MustCompile(fmt.Sprintf( + `^\s*(%s)\s*(%s)\s*$`, + strings.Join(ops, "|"), + cvRegex)) + + 
constraintRangeRegex = regexp.MustCompile(fmt.Sprintf( + `\s*(%s)\s+-\s+(%s)\s*`, + cvRegex, cvRegex)) +} + +// An individual constraint +type constraint struct { + // The callback function for the restraint. It performs the logic for + // the constraint. + function cfunc + + msg string + + // The version used in the constraint check. For example, if a constraint + // is '<= 2.0.0' the con a version instance representing 2.0.0. + con *Version + + // The original parsed version (e.g., 4.x from != 4.x) + orig string + + // When an x is used as part of the version (e.g., 1.x) + minorDirty bool + dirty bool + patchDirty bool +} + +// Check if a version meets the constraint +func (c *constraint) check(v *Version) bool { + return c.function(v, c) +} + +type cfunc func(v *Version, c *constraint) bool + +func parseConstraint(c string) (*constraint, error) { + m := constraintRegex.FindStringSubmatch(c) + if m == nil { + return nil, fmt.Errorf("improper constraint: %s", c) + } + + ver := m[2] + orig := ver + minorDirty := false + patchDirty := false + dirty := false + if isX(m[3]) { + ver = "0.0.0" + dirty = true + } else if isX(strings.TrimPrefix(m[4], ".")) || m[4] == "" { + minorDirty = true + dirty = true + ver = fmt.Sprintf("%s.0.0%s", m[3], m[6]) + } else if isX(strings.TrimPrefix(m[5], ".")) { + dirty = true + patchDirty = true + ver = fmt.Sprintf("%s%s.0%s", m[3], m[4], m[6]) + } + + con, err := NewVersion(ver) + if err != nil { + + // The constraintRegex should catch any regex parsing errors. So, + // we should never get here. 
+ return nil, errors.New("constraint Parser Error") + } + + cs := &constraint{ + function: constraintOps[m[1]], + msg: constraintMsg[m[1]], + con: con, + orig: orig, + minorDirty: minorDirty, + patchDirty: patchDirty, + dirty: dirty, + } + return cs, nil +} + +// Constraint functions +func constraintNotEqual(v *Version, c *constraint) bool { + if c.dirty { + + // If there is a pre-release on the version but the constraint isn't looking + // for them assume that pre-releases are not compatible. See issue 21 for + // more details. + if v.Prerelease() != "" && c.con.Prerelease() == "" { + return false + } + + if c.con.Major() != v.Major() { + return true + } + if c.con.Minor() != v.Minor() && !c.minorDirty { + return true + } else if c.minorDirty { + return false + } + + return false + } + + return !v.Equal(c.con) +} + +func constraintGreaterThan(v *Version, c *constraint) bool { + + // If there is a pre-release on the version but the constraint isn't looking + // for them assume that pre-releases are not compatible. See issue 21 for + // more details. + if v.Prerelease() != "" && c.con.Prerelease() == "" { + return false + } + + return v.Compare(c.con) == 1 +} + +func constraintLessThan(v *Version, c *constraint) bool { + // If there is a pre-release on the version but the constraint isn't looking + // for them assume that pre-releases are not compatible. See issue 21 for + // more details. + if v.Prerelease() != "" && c.con.Prerelease() == "" { + return false + } + + if !c.dirty { + return v.Compare(c.con) < 0 + } + + if v.Major() > c.con.Major() { + return false + } else if v.Minor() > c.con.Minor() && !c.minorDirty { + return false + } + + return true +} + +func constraintGreaterThanEqual(v *Version, c *constraint) bool { + + // If there is a pre-release on the version but the constraint isn't looking + // for them assume that pre-releases are not compatible. See issue 21 for + // more details. 
+ if v.Prerelease() != "" && c.con.Prerelease() == "" { + return false + } + + return v.Compare(c.con) >= 0 +} + +func constraintLessThanEqual(v *Version, c *constraint) bool { + // If there is a pre-release on the version but the constraint isn't looking + // for them assume that pre-releases are not compatible. See issue 21 for + // more details. + if v.Prerelease() != "" && c.con.Prerelease() == "" { + return false + } + + if !c.dirty { + return v.Compare(c.con) <= 0 + } + + if v.Major() > c.con.Major() { + return false + } else if v.Minor() > c.con.Minor() && !c.minorDirty { + return false + } + + return true +} + +// ~*, ~>* --> >= 0.0.0 (any) +// ~2, ~2.x, ~2.x.x, ~>2, ~>2.x ~>2.x.x --> >=2.0.0, <3.0.0 +// ~2.0, ~2.0.x, ~>2.0, ~>2.0.x --> >=2.0.0, <2.1.0 +// ~1.2, ~1.2.x, ~>1.2, ~>1.2.x --> >=1.2.0, <1.3.0 +// ~1.2.3, ~>1.2.3 --> >=1.2.3, <1.3.0 +// ~1.2.0, ~>1.2.0 --> >=1.2.0, <1.3.0 +func constraintTilde(v *Version, c *constraint) bool { + // If there is a pre-release on the version but the constraint isn't looking + // for them assume that pre-releases are not compatible. See issue 21 for + // more details. + if v.Prerelease() != "" && c.con.Prerelease() == "" { + return false + } + + if v.LessThan(c.con) { + return false + } + + // ~0.0.0 is a special case where all constraints are accepted. It's + // equivalent to >= 0.0.0. + if c.con.Major() == 0 && c.con.Minor() == 0 && c.con.Patch() == 0 && + !c.minorDirty && !c.patchDirty { + return true + } + + if v.Major() != c.con.Major() { + return false + } + + if v.Minor() != c.con.Minor() && !c.minorDirty { + return false + } + + return true +} + +// When there is a .x (dirty) status it automatically opts in to ~. Otherwise +// it's a straight = +func constraintTildeOrEqual(v *Version, c *constraint) bool { + // If there is a pre-release on the version but the constraint isn't looking + // for them assume that pre-releases are not compatible. See issue 21 for + // more details. 
+ if v.Prerelease() != "" && c.con.Prerelease() == "" { + return false + } + + if c.dirty { + c.msg = constraintMsg["~"] + return constraintTilde(v, c) + } + + return v.Equal(c.con) +} + +// ^* --> (any) +// ^2, ^2.x, ^2.x.x --> >=2.0.0, <3.0.0 +// ^2.0, ^2.0.x --> >=2.0.0, <3.0.0 +// ^1.2, ^1.2.x --> >=1.2.0, <2.0.0 +// ^1.2.3 --> >=1.2.3, <2.0.0 +// ^1.2.0 --> >=1.2.0, <2.0.0 +func constraintCaret(v *Version, c *constraint) bool { + // If there is a pre-release on the version but the constraint isn't looking + // for them assume that pre-releases are not compatible. See issue 21 for + // more details. + if v.Prerelease() != "" && c.con.Prerelease() == "" { + return false + } + + if v.LessThan(c.con) { + return false + } + + if v.Major() != c.con.Major() { + return false + } + + return true +} + +var constraintRangeRegex *regexp.Regexp + +const cvRegex string = `v?([0-9|x|X|\*]+)(\.[0-9|x|X|\*]+)?(\.[0-9|x|X|\*]+)?` + + `(-([0-9A-Za-z\-]+(\.[0-9A-Za-z\-]+)*))?` + + `(\+([0-9A-Za-z\-]+(\.[0-9A-Za-z\-]+)*))?` + +func isX(x string) bool { + switch x { + case "x", "*", "X": + return true + default: + return false + } +} + +func rewriteRange(i string) string { + m := constraintRangeRegex.FindAllStringSubmatch(i, -1) + if m == nil { + return i + } + o := i + for _, v := range m { + t := fmt.Sprintf(">= %s, <= %s", v[1], v[11]) + o = strings.Replace(o, v[0], t, 1) + } + + return o +} diff --git a/vendor/github.com/Masterminds/semver/doc.go b/vendor/github.com/Masterminds/semver/doc.go new file mode 100644 index 0000000000..6a6c24c6d6 --- /dev/null +++ b/vendor/github.com/Masterminds/semver/doc.go @@ -0,0 +1,115 @@ +/* +Package semver provides the ability to work with Semantic Versions (http://semver.org) in Go. 
+ +Specifically it provides the ability to: + + * Parse semantic versions + * Sort semantic versions + * Check if a semantic version fits within a set of constraints + * Optionally work with a `v` prefix + +Parsing Semantic Versions + +To parse a semantic version use the `NewVersion` function. For example, + + v, err := semver.NewVersion("1.2.3-beta.1+build345") + +If there is an error the version wasn't parseable. The version object has methods +to get the parts of the version, compare it to other versions, convert the +version back into a string, and get the original string. For more details +please see the documentation at https://godoc.org/github.com/Masterminds/semver. + +Sorting Semantic Versions + +A set of versions can be sorted using the `sort` package from the standard library. +For example, + + raw := []string{"1.2.3", "1.0", "1.3", "2", "0.4.2",} + vs := make([]*semver.Version, len(raw)) + for i, r := range raw { + v, err := semver.NewVersion(r) + if err != nil { + t.Errorf("Error parsing version: %s", err) + } + + vs[i] = v + } + + sort.Sort(semver.Collection(vs)) + +Checking Version Constraints + +Checking a version against version constraints is one of the most featureful +parts of the package. + + c, err := semver.NewConstraint(">= 1.2.3") + if err != nil { + // Handle constraint not being parseable. + } + + v, err := semver.NewVersion("1.3") + if err != nil { + // Handle version not being parseable. + } + // Check if the version meets the constraints. The a variable will be true. + a := c.Check(v) + +Basic Comparisons + +There are two elements to the comparisons. First, a comparison string is a list +of comma separated and comparisons. These are then separated by || separated or +comparisons. For example, `">= 1.2, < 3.0.0 || >= 4.2.3"` is looking for a +comparison that's greater than or equal to 1.2 and less than 3.0.0 or is +greater than or equal to 4.2.3. 
+ +The basic comparisons are: + + * `=`: equal (aliased to no operator) + * `!=`: not equal + * `>`: greater than + * `<`: less than + * `>=`: greater than or equal to + * `<=`: less than or equal to + +Hyphen Range Comparisons + +There are multiple methods to handle ranges and the first is hyphens ranges. +These look like: + + * `1.2 - 1.4.5` which is equivalent to `>= 1.2, <= 1.4.5` + * `2.3.4 - 4.5` which is equivalent to `>= 2.3.4, <= 4.5` + +Wildcards In Comparisons + +The `x`, `X`, and `*` characters can be used as a wildcard character. This works +for all comparison operators. When used on the `=` operator it falls +back to the pack level comparison (see tilde below). For example, + + * `1.2.x` is equivalent to `>= 1.2.0, < 1.3.0` + * `>= 1.2.x` is equivalent to `>= 1.2.0` + * `<= 2.x` is equivalent to `<= 3` + * `*` is equivalent to `>= 0.0.0` + +Tilde Range Comparisons (Patch) + +The tilde (`~`) comparison operator is for patch level ranges when a minor +version is specified and major level changes when the minor number is missing. +For example, + + * `~1.2.3` is equivalent to `>= 1.2.3, < 1.3.0` + * `~1` is equivalent to `>= 1, < 2` + * `~2.3` is equivalent to `>= 2.3, < 2.4` + * `~1.2.x` is equivalent to `>= 1.2.0, < 1.3.0` + * `~1.x` is equivalent to `>= 1, < 2` + +Caret Range Comparisons (Major) + +The caret (`^`) comparison operator is for major level changes. This is useful +when comparisons of API versions as a major change is API breaking. 
For example, + + * `^1.2.3` is equivalent to `>= 1.2.3, < 2.0.0` + * `^1.2.x` is equivalent to `>= 1.2.0, < 2.0.0` + * `^2.3` is equivalent to `>= 2.3, < 3` + * `^2.x` is equivalent to `>= 2.0.0, < 3` +*/ +package semver diff --git a/vendor/github.com/Masterminds/semver/version.go b/vendor/github.com/Masterminds/semver/version.go new file mode 100644 index 0000000000..400d4f9341 --- /dev/null +++ b/vendor/github.com/Masterminds/semver/version.go @@ -0,0 +1,425 @@ +package semver + +import ( + "bytes" + "encoding/json" + "errors" + "fmt" + "regexp" + "strconv" + "strings" +) + +// The compiled version of the regex created at init() is cached here so it +// only needs to be created once. +var versionRegex *regexp.Regexp +var validPrereleaseRegex *regexp.Regexp + +var ( + // ErrInvalidSemVer is returned a version is found to be invalid when + // being parsed. + ErrInvalidSemVer = errors.New("Invalid Semantic Version") + + // ErrInvalidMetadata is returned when the metadata is an invalid format + ErrInvalidMetadata = errors.New("Invalid Metadata string") + + // ErrInvalidPrerelease is returned when the pre-release is an invalid format + ErrInvalidPrerelease = errors.New("Invalid Prerelease string") +) + +// SemVerRegex is the regular expression used to parse a semantic version. +const SemVerRegex string = `v?([0-9]+)(\.[0-9]+)?(\.[0-9]+)?` + + `(-([0-9A-Za-z\-]+(\.[0-9A-Za-z\-]+)*))?` + + `(\+([0-9A-Za-z\-]+(\.[0-9A-Za-z\-]+)*))?` + +// ValidPrerelease is the regular expression which validates +// both prerelease and metadata values. +const ValidPrerelease string = `^([0-9A-Za-z\-]+(\.[0-9A-Za-z\-]+)*)$` + +// Version represents a single semantic version. 
+type Version struct { + major, minor, patch int64 + pre string + metadata string + original string +} + +func init() { + versionRegex = regexp.MustCompile("^" + SemVerRegex + "$") + validPrereleaseRegex = regexp.MustCompile(ValidPrerelease) +} + +// NewVersion parses a given version and returns an instance of Version or +// an error if unable to parse the version. +func NewVersion(v string) (*Version, error) { + m := versionRegex.FindStringSubmatch(v) + if m == nil { + return nil, ErrInvalidSemVer + } + + sv := &Version{ + metadata: m[8], + pre: m[5], + original: v, + } + + var temp int64 + temp, err := strconv.ParseInt(m[1], 10, 64) + if err != nil { + return nil, fmt.Errorf("Error parsing version segment: %s", err) + } + sv.major = temp + + if m[2] != "" { + temp, err = strconv.ParseInt(strings.TrimPrefix(m[2], "."), 10, 64) + if err != nil { + return nil, fmt.Errorf("Error parsing version segment: %s", err) + } + sv.minor = temp + } else { + sv.minor = 0 + } + + if m[3] != "" { + temp, err = strconv.ParseInt(strings.TrimPrefix(m[3], "."), 10, 64) + if err != nil { + return nil, fmt.Errorf("Error parsing version segment: %s", err) + } + sv.patch = temp + } else { + sv.patch = 0 + } + + return sv, nil +} + +// MustParse parses a given version and panics on error. +func MustParse(v string) *Version { + sv, err := NewVersion(v) + if err != nil { + panic(err) + } + return sv +} + +// String converts a Version object to a string. +// Note, if the original version contained a leading v this version will not. +// See the Original() method to retrieve the original value. Semantic Versions +// don't contain a leading v per the spec. Instead it's optional on +// implementation. 
+func (v *Version) String() string { + var buf bytes.Buffer + + fmt.Fprintf(&buf, "%d.%d.%d", v.major, v.minor, v.patch) + if v.pre != "" { + fmt.Fprintf(&buf, "-%s", v.pre) + } + if v.metadata != "" { + fmt.Fprintf(&buf, "+%s", v.metadata) + } + + return buf.String() +} + +// Original returns the original value passed in to be parsed. +func (v *Version) Original() string { + return v.original +} + +// Major returns the major version. +func (v *Version) Major() int64 { + return v.major +} + +// Minor returns the minor version. +func (v *Version) Minor() int64 { + return v.minor +} + +// Patch returns the patch version. +func (v *Version) Patch() int64 { + return v.patch +} + +// Prerelease returns the pre-release version. +func (v *Version) Prerelease() string { + return v.pre +} + +// Metadata returns the metadata on the version. +func (v *Version) Metadata() string { + return v.metadata +} + +// originalVPrefix returns the original 'v' prefix if any. +func (v *Version) originalVPrefix() string { + + // Note, only lowercase v is supported as a prefix by the parser. + if v.original != "" && v.original[:1] == "v" { + return v.original[:1] + } + return "" +} + +// IncPatch produces the next patch version. +// If the current version does not have prerelease/metadata information, +// it unsets metadata and prerelease values, increments patch number. +// If the current version has any of prerelease or metadata information, +// it unsets both values and keeps curent patch value +func (v Version) IncPatch() Version { + vNext := v + // according to http://semver.org/#spec-item-9 + // Pre-release versions have a lower precedence than the associated normal version. + // according to http://semver.org/#spec-item-10 + // Build metadata SHOULD be ignored when determining version precedence. 
+ if v.pre != "" { + vNext.metadata = "" + vNext.pre = "" + } else { + vNext.metadata = "" + vNext.pre = "" + vNext.patch = v.patch + 1 + } + vNext.original = v.originalVPrefix() + "" + vNext.String() + return vNext +} + +// IncMinor produces the next minor version. +// Sets patch to 0. +// Increments minor number. +// Unsets metadata. +// Unsets prerelease status. +func (v Version) IncMinor() Version { + vNext := v + vNext.metadata = "" + vNext.pre = "" + vNext.patch = 0 + vNext.minor = v.minor + 1 + vNext.original = v.originalVPrefix() + "" + vNext.String() + return vNext +} + +// IncMajor produces the next major version. +// Sets patch to 0. +// Sets minor to 0. +// Increments major number. +// Unsets metadata. +// Unsets prerelease status. +func (v Version) IncMajor() Version { + vNext := v + vNext.metadata = "" + vNext.pre = "" + vNext.patch = 0 + vNext.minor = 0 + vNext.major = v.major + 1 + vNext.original = v.originalVPrefix() + "" + vNext.String() + return vNext +} + +// SetPrerelease defines the prerelease value. +// Value must not include the required 'hypen' prefix. +func (v Version) SetPrerelease(prerelease string) (Version, error) { + vNext := v + if len(prerelease) > 0 && !validPrereleaseRegex.MatchString(prerelease) { + return vNext, ErrInvalidPrerelease + } + vNext.pre = prerelease + vNext.original = v.originalVPrefix() + "" + vNext.String() + return vNext, nil +} + +// SetMetadata defines metadata value. +// Value must not include the required 'plus' prefix. +func (v Version) SetMetadata(metadata string) (Version, error) { + vNext := v + if len(metadata) > 0 && !validPrereleaseRegex.MatchString(metadata) { + return vNext, ErrInvalidMetadata + } + vNext.metadata = metadata + vNext.original = v.originalVPrefix() + "" + vNext.String() + return vNext, nil +} + +// LessThan tests if one version is less than another one. 
+func (v *Version) LessThan(o *Version) bool { + return v.Compare(o) < 0 +} + +// GreaterThan tests if one version is greater than another one. +func (v *Version) GreaterThan(o *Version) bool { + return v.Compare(o) > 0 +} + +// Equal tests if two versions are equal to each other. +// Note, versions can be equal with different metadata since metadata +// is not considered part of the comparable version. +func (v *Version) Equal(o *Version) bool { + return v.Compare(o) == 0 +} + +// Compare compares this version to another one. It returns -1, 0, or 1 if +// the version smaller, equal, or larger than the other version. +// +// Versions are compared by X.Y.Z. Build metadata is ignored. Prerelease is +// lower than the version without a prerelease. +func (v *Version) Compare(o *Version) int { + // Compare the major, minor, and patch version for differences. If a + // difference is found return the comparison. + if d := compareSegment(v.Major(), o.Major()); d != 0 { + return d + } + if d := compareSegment(v.Minor(), o.Minor()); d != 0 { + return d + } + if d := compareSegment(v.Patch(), o.Patch()); d != 0 { + return d + } + + // At this point the major, minor, and patch versions are the same. + ps := v.pre + po := o.Prerelease() + + if ps == "" && po == "" { + return 0 + } + if ps == "" { + return 1 + } + if po == "" { + return -1 + } + + return comparePrerelease(ps, po) +} + +// UnmarshalJSON implements JSON.Unmarshaler interface. +func (v *Version) UnmarshalJSON(b []byte) error { + var s string + if err := json.Unmarshal(b, &s); err != nil { + return err + } + temp, err := NewVersion(s) + if err != nil { + return err + } + v.major = temp.major + v.minor = temp.minor + v.patch = temp.patch + v.pre = temp.pre + v.metadata = temp.metadata + v.original = temp.original + temp = nil + return nil +} + +// MarshalJSON implements JSON.Marshaler interface. 
+func (v *Version) MarshalJSON() ([]byte, error) { + return json.Marshal(v.String()) +} + +func compareSegment(v, o int64) int { + if v < o { + return -1 + } + if v > o { + return 1 + } + + return 0 +} + +func comparePrerelease(v, o string) int { + + // split the prelease versions by their part. The separator, per the spec, + // is a . + sparts := strings.Split(v, ".") + oparts := strings.Split(o, ".") + + // Find the longer length of the parts to know how many loop iterations to + // go through. + slen := len(sparts) + olen := len(oparts) + + l := slen + if olen > slen { + l = olen + } + + // Iterate over each part of the prereleases to compare the differences. + for i := 0; i < l; i++ { + // Since the lentgh of the parts can be different we need to create + // a placeholder. This is to avoid out of bounds issues. + stemp := "" + if i < slen { + stemp = sparts[i] + } + + otemp := "" + if i < olen { + otemp = oparts[i] + } + + d := comparePrePart(stemp, otemp) + if d != 0 { + return d + } + } + + // Reaching here means two versions are of equal value but have different + // metadata (the part following a +). They are not identical in string form + // but the version comparison finds them to be equal. + return 0 +} + +func comparePrePart(s, o string) int { + // Fastpath if they are equal + if s == o { + return 0 + } + + // When s or o are empty we can use the other in an attempt to determine + // the response. + if s == "" { + if o != "" { + return -1 + } + return 1 + } + + if o == "" { + if s != "" { + return 1 + } + return -1 + } + + // When comparing strings "99" is greater than "103". To handle + // cases like this we need to detect numbers and compare them. According + // to the semver spec, numbers are always positive. If there is a - at the + // start like -99 this is to be evaluated as an alphanum. numbers always + // have precedence over alphanum. Parsing as Uints because negative numbers + // are ignored. 
+ + oi, n1 := strconv.ParseUint(o, 10, 64) + si, n2 := strconv.ParseUint(s, 10, 64) + + // The case where both are strings compare the strings + if n1 != nil && n2 != nil { + if s > o { + return 1 + } + return -1 + } else if n1 != nil { + // o is a string and s is a number + return -1 + } else if n2 != nil { + // s is a string and o is a number + return 1 + } + // Both are numbers + if si > oi { + return 1 + } + return -1 + +} diff --git a/vendor/github.com/Masterminds/semver/version_fuzz.go b/vendor/github.com/Masterminds/semver/version_fuzz.go new file mode 100644 index 0000000000..b42bcd62b9 --- /dev/null +++ b/vendor/github.com/Masterminds/semver/version_fuzz.go @@ -0,0 +1,10 @@ +// +build gofuzz + +package semver + +func Fuzz(data []byte) int { + if _, err := NewVersion(string(data)); err != nil { + return 0 + } + return 1 +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/LICENSE b/vendor/github.com/NVIDIA/go-dcgm/LICENSE new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/admin.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/admin.go new file mode 100644 index 0000000000..2c223219cd --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/admin.go @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package dcgm + +/* +#cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files +#cgo darwin LDFLAGS: -ldl -Wl,--export-dynamic -Wl,-undefined,dynamic_lookup + +#include +#include "dcgm_agent.h" +#include "dcgm_structs.h" + +*/ +import "C" +import ( + "fmt" + "io/ioutil" + "log" + "os" + "os/exec" + "strconv" + "strings" + "syscall" + "unsafe" + + "github.com/Masterminds/semver" +) + +type mode int + +// const for DCGM hostengine running modes: Embedded, Standalone or StartHostengine +const ( + Embedded mode = iota + Standalone + StartHostengine +) + +type dcgmHandle struct{ handle C.dcgmHandle_t } + +var ( + dcgmLibHandle unsafe.Pointer + stopMode mode + handle dcgmHandle + hostengineAsChildPid int +) + +func initDcgm(m mode, args ...string) (err error) { + const ( + dcgmLib = "libdcgm.so" + ) + lib := C.CString(dcgmLib) + defer freeCString(lib) + + dcgmLibHandle = C.dlopen(lib, C.RTLD_LAZY|C.RTLD_GLOBAL) + if dcgmLibHandle == nil { + return fmt.Errorf("%s not Found", dcgmLib) + } + + // set the stopMode for shutdown() + stopMode = m + + switch m { + case Embedded: + return startEmbedded() + case Standalone: + return connectStandalone(args...) 
+ case StartHostengine: + return startHostengine() + } + + return nil +} + +func shutdown() (err error) { + switch stopMode { + case Embedded: + err = stopEmbedded() + case Standalone: + err = disconnectStandalone() + case StartHostengine: + err = stopHostengine() + } + + C.dlclose(dcgmLibHandle) + return +} + +func startEmbedded() (err error) { + result := C.dcgmInit() + if err = errorString(result); err != nil { + return fmt.Errorf("Error initializing DCGM: %s", err) + } + + var cHandle C.dcgmHandle_t + result = C.dcgmStartEmbedded(C.DCGM_OPERATION_MODE_AUTO, &cHandle) + if err = errorString(result); err != nil { + return fmt.Errorf("Error starting nv-hostengine: %s", err) + } + handle = dcgmHandle{cHandle} + return +} + +func stopEmbedded() (err error) { + result := C.dcgmStopEmbedded(handle.handle) + if err = errorString(result); err != nil { + return fmt.Errorf("Error stopping nv-hostengine: %s", err) + } + + result = C.dcgmShutdown() + if err = errorString(result); err != nil { + return fmt.Errorf("Error shutting down DCGM: %s", err) + } + return +} + +func connectStandalone(args ...string) (err error) { + if len(args) < 2 { + return fmt.Errorf("Missing dcgm address and / or port") + } + + result := C.dcgmInit() + if err = errorString(result); err != nil { + return fmt.Errorf("Error initializing DCGM: %s", err) + } + + var cHandle C.dcgmHandle_t + addr := C.CString(args[0]) + defer freeCString(addr) + var connectParams C.dcgmConnectV2Params_v2 + connectParams.version = makeVersion2(unsafe.Sizeof(connectParams)) + + sck, err := strconv.ParseUint(args[1], 10, 32) + if err != nil { + return fmt.Errorf("Error parsing %s: %v\n", args[1], err) + } + connectParams.addressIsUnixSocket = C.uint(sck) + + result = C.dcgmConnect_v2(addr, &connectParams, &cHandle) + if err = errorString(result); err != nil { + return fmt.Errorf("Error connecting to nv-hostengine: %s", err) + } + + handle = dcgmHandle{cHandle} + + // This check is disabled for now + /* + err = 
checkHostengineVersion() + if err != nil { + return fmt.Errorf("Error connecting to remote nv-hostengine: %s", err) + } + */ + + return +} + +func disconnectStandalone() (err error) { + result := C.dcgmDisconnect(handle.handle) + if err = errorString(result); err != nil { + return fmt.Errorf("Error disconnecting from nv-hostengine: %s", err) + } + + result = C.dcgmShutdown() + if err = errorString(result); err != nil { + return fmt.Errorf("Error shutting down DCGM: %s", err) + } + return +} + +func startHostengine() (err error) { + bin, err := exec.LookPath("nv-hostengine") + if err != nil { + return fmt.Errorf("Error finding nv-hostengine: %s", err) + } + var procAttr syscall.ProcAttr + procAttr.Files = []uintptr{ + uintptr(syscall.Stdin), + uintptr(syscall.Stdout), + uintptr(syscall.Stderr)} + procAttr.Sys = &syscall.SysProcAttr{Setpgid: true} + + dir := "/tmp" + tmpfile, err := ioutil.TempFile(dir, "dcgm") + if err != nil { + return fmt.Errorf("Error creating temporary file in %s directory: %s", dir, err) + } + socketPath := tmpfile.Name() + defer os.Remove(socketPath) + + connectArg := "--domain-socket" + hostengineAsChildPid, err = syscall.ForkExec(bin, []string{bin, connectArg, socketPath}, &procAttr) + if err != nil { + return fmt.Errorf("Error fork-execing nv-hostengine: %s", err) + } + + result := C.dcgmInit() + if err = errorString(result); err != nil { + return fmt.Errorf("Error initializing DCGM: %s", err) + } + + var cHandle C.dcgmHandle_t + var connectParams C.dcgmConnectV2Params_v2 + connectParams.version = makeVersion2(unsafe.Sizeof(connectParams)) + isSocket := C.uint(1) + connectParams.addressIsUnixSocket = isSocket + cSockPath := C.CString(socketPath) + defer freeCString(cSockPath) + result = C.dcgmConnect_v2(cSockPath, &connectParams, &cHandle) + if err = errorString(result); err != nil { + return fmt.Errorf("Error connecting to nv-hostengine: %s", err) + } + + handle = dcgmHandle{cHandle} + return +} + +func stopHostengine() (err error) { + if 
err = disconnectStandalone(); err != nil { + return + } + + // terminate nv-hostengine + cmd := exec.Command("nv-hostengine", "--term") + if err = cmd.Run(); err != nil { + return fmt.Errorf("Error terminating nv-hostengine: %s", err) + } + log.Println("Successfully terminated nv-hostengine.") + + return syscall.Kill(hostengineAsChildPid, syscall.SIGKILL) +} + +func checkHostengineVersion() (err error) { + var hostEngineVersionInfo C.dcgmVersionInfo_t + hostEngineVersionInfo.version = makeVersion2(unsafe.Sizeof(hostEngineVersionInfo)) + result := C.dcgmHostengineVersionInfo(handle.handle, &hostEngineVersionInfo) + if err = errorString(result); err != nil { + return fmt.Errorf("Could not retrieve running hostengine version: %s", err) + } + + var versionInfo C.dcgmVersionInfo_t + versionInfo.version = makeVersion2(unsafe.Sizeof(versionInfo)) + result = C.dcgmVersionInfo(&versionInfo) + if err = errorString(result); err != nil { + return fmt.Errorf("Could not retrieve dcgm version: %s", err) + } + + /* Version string looks like: "version:2.1.2;arch:x86_64;buildtype:Debug; + * buildid:;builddate:2021-03-03;commit:v2.1.1-5-gc27ab30f;branch:master; + * buildplatform:Linux 5.4.0-66-generic #74~18.04.2-Ubuntu SMP Fri Feb 5 + * 11:17:31 UTC 2021 x86_64;;crc:bd60aadd63245021163ef008d0907ae7" + */ + heVersionStr := C.GoString(&hostEngineVersionInfo.rawBuildInfoString[0]) + myVersionStr := C.GoString(&versionInfo.rawBuildInfoString[0]) + var foundVersion = false + + he := strings.Split(heVersionStr, ";") + + // Find version pair within build information + for _, line := range he { + if strings.HasPrefix(line, "version:") { + heVersionStr = line + foundVersion = true + } + } + + if foundVersion == false { + return fmt.Errorf("Could not determine remote version") + } + + foundVersion = false + my := strings.Split(myVersionStr, ";") + + for _, line := range my { + if strings.HasPrefix(line, "version:") { + myVersionStr = line + foundVersion = true + } + } + + if foundVersion == 
false { + return fmt.Errorf("Could not determine local version") + } + + // Parse out version and compare + he = strings.Split(heVersionStr, ":") + my = strings.Split(myVersionStr, ":") + + if (len(he) != 2) && (len(my) != 2) { + return fmt.Errorf("Could not parse versions") + } + + heVersion, err := semver.NewVersion(he[1]) + if err != nil { + return fmt.Errorf("Could not determine remote version: %s", err) + } + myVersion, err := semver.NewVersion(my[1]) + if err != nil { + return fmt.Errorf("Could not determine local version: %s", err) + } + if heVersion.Major() != myVersion.Major() { + return fmt.Errorf("remote %v != local %v", he[1], my[1]) + } + + return +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/api.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/api.go new file mode 100644 index 0000000000..6dcdf6b604 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/api.go @@ -0,0 +1,125 @@ +package dcgm + +import ( + "context" + "fmt" + "os" + "sync" + "time" +) + +var ( + dcgmInitCounter int + mux sync.Mutex +) + +// Init starts DCGM, based on the user selected mode +// DCGM can be started in 3 differengt modes: +// 1. Embedded: Start hostengine within this process +// 2. Standalone: Connect to an already running nv-hostengine at the specified address +// Connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket" +// 3. StartHostengine: Open an Unix socket to start and connect to the nv-hostengine and terminate before exiting +func Init(m mode, args ...string) (cleanup func(), err error) { + mux.Lock() + if dcgmInitCounter < 0 { + count := fmt.Sprintf("%d", dcgmInitCounter) + err = fmt.Errorf("Shutdown() is called %s times, before Init()", count[1:]) + } + if dcgmInitCounter == 0 { + err = initDcgm(m, args...) 
+ + if err != nil { + mux.Unlock() + + return nil, err + } + } + dcgmInitCounter += 1 + mux.Unlock() + + return func() { + if err := Shutdown(); err != nil { + fmt.Fprintf(os.Stderr, "Failed to shutdown DCGM with error: `%v`", err) + } + }, err +} + +// Shutdown stops DCGM and destroy all connections +func Shutdown() (err error) { + mux.Lock() + if dcgmInitCounter <= 0 { + err = fmt.Errorf("Init() needs to be called before Shutdown()") + } + if dcgmInitCounter == 1 { + err = shutdown() + } + dcgmInitCounter -= 1 + mux.Unlock() + + return +} + +// GetAllDeviceCount counts all GPUs on the system +func GetAllDeviceCount() (uint, error) { + return getAllDeviceCount() +} + +func GetEntityGroupEntities(entityGroup Field_Entity_Group) ([]uint, error) { + return getEntityGroupEntities(entityGroup) +} + +// GetSupportedDevices returns only DCGM supported GPUs +func GetSupportedDevices() ([]uint, error) { + return getSupportedDevices() +} + +// GetDeviceInfo describes the given device +func GetDeviceInfo(gpuId uint) (Device, error) { + return getDeviceInfo(gpuId) +} + +// GetDeviceStatus monitors GPU status including its power, memory and GPU utilization +func GetDeviceStatus(gpuId uint) (DeviceStatus, error) { + return latestValuesForDevice(gpuId) +} + +// GetDeviceTopology returns device topology corresponding to the gpuId +func GetDeviceTopology(gpuId uint) ([]P2PLink, error) { + return getDeviceTopology(gpuId) +} + +// WatchPidFields lets DCGM start recording stats for GPU process +// It needs to be called before calling GetProcessInfo +func WatchPidFields() (GroupHandle, error) { + return watchPidFields(time.Microsecond*time.Duration(defaultUpdateFreq), time.Second*time.Duration(defaultMaxKeepAge), defaultMaxKeepSamples) +} + +// GetProcessInfo provides detailed per GPU stats for this process +func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error) { + return getProcessInfo(group, pid) +} + +// HealthCheckByGpuId monitors GPU health for any 
errors/failures/warnings +func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error) { + return healthCheckByGpuId(gpuId) +} + +// ListenForPolicyViolations sets GPU usage and error policies and notifies in case of any violations +func ListenForPolicyViolations(ctx context.Context, typ ...policyCondition) (<-chan PolicyViolation, error) { + groupId := GroupAllGPUs() + return registerPolicy(ctx, groupId, typ...) +} + +// Introspect returns DCGM hostengine memory and CPU usage +func Introspect() (DcgmStatus, error) { + return introspect() +} + +// Get all of the profiling metric groups for a given GPU group. +func GetSupportedMetricGroups(gpuId uint) ([]MetricGroup, error) { + return getSupportedMetricGroups(gpuId) +} + +func GetNvLinkLinkStatus() ([]NvLinkStatus, error) { + return getNvLinkLinkStatus() +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/callback.c b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/callback.c new file mode 100644 index 0000000000..5bc2fc2b5f --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/callback.c @@ -0,0 +1,4 @@ +int violationNotify(void* p) { + int ViolationRegistration(void*); + return ViolationRegistration(p); +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/const.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/const.go new file mode 100644 index 0000000000..96b63f544a --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/const.go @@ -0,0 +1,1004 @@ +package dcgm + +import "C" + +type Short C.ushort + +type FieldValue_v1 struct { + Version uint + FieldId uint + FieldType uint + Status int + Ts int64 + Value [4096]byte +} + +type FieldValue_v2 struct { + Version uint + EntityGroupId Field_Entity_Group + EntityId uint + FieldId uint + FieldType uint + Status int + Ts int64 + Value [4096]byte + StringValue *string +} + +const ( + DCGM_FT_BINARY = uint('b') + DCGM_FT_DOUBLE = uint('d') + DCGM_FT_INT64 = uint('i') + DCGM_FT_STRING = uint('s') + DCGM_FT_TIMESTAMP = uint('t') + DCGM_FT_INT32_BLANK = 
int64(2147483632) + DCGM_FT_INT32_NOT_FOUND = int64(DCGM_FT_INT32_BLANK + 1) + DCGM_FT_INT32_NOT_SUPPORTED = int64(DCGM_FT_INT32_BLANK + 2) + DCGM_FT_INT32_NOT_PERMISSIONED = int64(DCGM_FT_INT32_BLANK + 3) + DCGM_FT_INT64_BLANK = int64(9223372036854775792) + DCGM_FT_INT64_NOT_FOUND = int64(DCGM_FT_INT64_BLANK + 1) + DCGM_FT_INT64_NOT_SUPPORTED = int64(DCGM_FT_INT64_BLANK + 2) + DCGM_FT_INT64_NOT_PERMISSIONED = int64(DCGM_FT_INT64_BLANK + 3) + DCGM_FT_FP64_BLANK = 140737488355328.0 + DCGM_FT_FP64_NOT_FOUND = float64(DCGM_FT_FP64_BLANK + 1.0) + DCGM_FT_FP64_NOT_SUPPORTED = float64(DCGM_FT_FP64_BLANK + 2.0) + DCGM_FT_FP64_NOT_PERMISSIONED = float64(DCGM_FT_FP64_BLANK + 3.0) + DCGM_FT_STR_BLANK = "<<>>" + DCGM_FT_STR_NOT_FOUND = "<<>>" + DCGM_FT_STR_NOT_SUPPORTED = "<<>>" + DCGM_FT_STR_NOT_PERMISSIONED = "<<>>" + + DCGM_FI_UNKNOWN = 0 + DCGM_FI_DRIVER_VERSION = 1 + DCGM_FI_NVML_VERSION = 2 + DCGM_FI_PROCESS_NAME = 3 + DCGM_FI_DEV_COUNT = 4 + DCGM_FI_CUDA_DRIVER_VERSION = 5 + DCGM_FI_DEV_NAME = 50 + DCGM_FI_DEV_BRAND = 51 + DCGM_FI_DEV_NVML_INDEX = 52 + DCGM_FI_DEV_SERIAL = 53 + DCGM_FI_DEV_UUID = 54 + DCGM_FI_DEV_MINOR_NUMBER = 55 + DCGM_FI_DEV_OEM_INFOROM_VER = 56 + DCGM_FI_DEV_PCI_BUSID = 57 + DCGM_FI_DEV_PCI_COMBINED_ID = 58 + DCGM_FI_DEV_PCI_SUBSYS_ID = 59 + DCGM_FI_GPU_TOPOLOGY_PCI = 60 + DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 + DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 + DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 + DCGM_FI_DEV_COMPUTE_MODE = 65 + DCGM_FI_DEV_PERSISTENCE_MODE = 66 + DCGM_FI_DEV_MIG_MODE = 67 + DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR = 68 + DCGM_FI_DEV_MIG_MAX_SLICES = 69 + DCGM_FI_DEV_CPU_AFFINITY_0 = 70 + DCGM_FI_DEV_CPU_AFFINITY_1 = 71 + DCGM_FI_DEV_CPU_AFFINITY_2 = 72 + DCGM_FI_DEV_CPU_AFFINITY_3 = 73 + DCGM_FI_DEV_CC_MODE = 74 + DCGM_FI_DEV_MIG_ATTRIBUTES = 75 + DCGM_FI_DEV_MIG_GI_INFO = 76 + DCGM_FI_DEV_MIG_CI_INFO = 77 + DCGM_FI_DEV_ECC_INFOROM_VER = 80 + DCGM_FI_DEV_POWER_INFOROM_VER = 81 + DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 + 
DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 + DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 + DCGM_FI_DEV_VBIOS_VERSION = 85 + DCGM_FI_DEV_MEM_AFFINITY_0 = 86 + DCGM_FI_DEV_MEM_AFFINITY_1 = 87 + DCGM_FI_DEV_MEM_AFFINITY_2 = 88 + DCGM_FI_DEV_MEM_AFFINITY_3 = 89 + DCGM_FI_DEV_BAR1_TOTAL = 90 + DCGM_FI_SYNC_BOOST = 91 + DCGM_FI_DEV_BAR1_USED = 92 + DCGM_FI_DEV_BAR1_FREE = 93 + DCGM_FI_DEV_SM_CLOCK = 100 + DCGM_FI_DEV_MEM_CLOCK = 101 + DCGM_FI_DEV_VIDEO_CLOCK = 102 + DCGM_FI_DEV_APP_SM_CLOCK = 110 + DCGM_FI_DEV_APP_MEM_CLOCK = 111 + DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112 + DCGM_FI_DEV_MAX_SM_CLOCK = 113 + DCGM_FI_DEV_MAX_MEM_CLOCK = 114 + DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 + DCGM_FI_DEV_AUTOBOOST = 120 + DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 + DCGM_FI_DEV_MEMORY_TEMP = 140 + DCGM_FI_DEV_GPU_TEMP = 150 + DCGM_FI_DEV_MEM_MAX_OP_TEMP = 151 + DCGM_FI_DEV_GPU_MAX_OP_TEMP = 152 + DCGM_FI_DEV_POWER_USAGE = 155 + DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 + DCGM_FI_DEV_POWER_USAGE_INSTANT = 157 + DCGM_FI_DEV_SLOWDOWN_TEMP = 158 + DCGM_FI_DEV_SHUTDOWN_TEMP = 159 + DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 + DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 + DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 + DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 + DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 + DCGM_FI_DEV_PSTATE = 190 + DCGM_FI_DEV_FAN_SPEED = 191 + DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 + DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 + DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 + DCGM_FI_DEV_GPU_UTIL = 203 + DCGM_FI_DEV_MEM_COPY_UTIL = 204 + DCGM_FI_DEV_ACCOUNTING_DATA = 205 + DCGM_FI_DEV_ENC_UTIL = 206 + DCGM_FI_DEV_DEC_UTIL = 207 + DCGM_FI_DEV_XID_ERRORS = 230 + DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 + DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 + DCGM_FI_DEV_PCIE_LINK_GEN = 237 + DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 + DCGM_FI_DEV_POWER_VIOLATION = 240 + DCGM_FI_DEV_THERMAL_VIOLATION = 241 + DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 + DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 + DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 + DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 
+ DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 + DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 + DCGM_FI_DEV_FB_TOTAL = 250 + DCGM_FI_DEV_FB_FREE = 251 + DCGM_FI_DEV_FB_USED = 252 + DCGM_FI_DEV_FB_RESERVED = 253 + DCGM_FI_DEV_FB_USED_PERCENT = 254 + DCGM_FI_DEV_C2C_LINK_COUNT = 285 + DCGM_FI_DEV_C2C_LINK_STATUS = 286 + DCGM_FI_DEV_C2C_MAX_BANDWIDTH = 287 + DCGM_FI_DEV_ECC_CURRENT = 300 + DCGM_FI_DEV_ECC_PENDING = 301 + DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 + DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 + DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 + DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 + DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 + DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 + DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 + DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 + DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 + DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 + DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 + DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 + DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 + DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 + DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 + DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 + DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 + DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 + DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 + DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 + DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 + DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 + DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 + DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX = 385 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH = 386 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL = 387 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW = 388 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE = 389 + DCGM_FI_DEV_RETIRED_SBE = 390 + DCGM_FI_DEV_RETIRED_DBE = 391 + DCGM_FI_DEV_RETIRED_PENDING = 392 + DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 + DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 + DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 + DCGM_FI_DEV_ROW_REMAP_PENDING = 396 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 + DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 + DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 = 451 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 = 452 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 = 453 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 = 454 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 = 455 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 = 456 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 = 457 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 = 458 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 = 
459 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 = 460 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 = 461 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 = 462 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 = 463 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 = 464 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 = 465 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 = 466 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 = 467 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 = 468 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 = 469 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 = 470 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 = 471 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 = 472 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 = 473 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 = 474 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 = 475 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 = 476 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 = 477 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 = 478 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 = 479 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 = 480 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 = 406 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 = 407 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 = 408 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 = 481 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 = 482 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 = 483 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 = 416 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 = 417 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 = 418 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 = 484 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 = 485 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 = 486 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 = 426 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 = 427 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 = 428 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 = 487 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 = 488 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 = 489 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 = 436 + 
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 = 437 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 = 438 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 = 491 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 = 492 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 = 493 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 = 446 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 = 447 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 = 448 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 = 494 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 = 495 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 = 496 + DCGM_FI_DEV_VIRTUAL_MODE = 500 + DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 + DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 + DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 + DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 + DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 + DCGM_FI_DEV_ENC_STATS = 506 + DCGM_FI_DEV_FBC_STATS = 507 + DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 + DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS = 509 + DCGM_FI_DEV_VGPU_TYPE_INFO = 510 + DCGM_FI_DEV_VGPU_TYPE_NAME = 511 + DCGM_FI_DEV_VGPU_TYPE_CLASS = 512 + DCGM_FI_DEV_VGPU_TYPE_LICENSE = 513 + DCGM_FI_DEV_VGPU_VM_ID = 520 + DCGM_FI_DEV_VGPU_VM_NAME = 521 + DCGM_FI_DEV_VGPU_TYPE = 522 + DCGM_FI_DEV_VGPU_UUID = 523 + DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 + DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 + DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 + DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527 + DCGM_FI_DEV_VGPU_ENC_STATS = 528 + DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 + DCGM_FI_DEV_VGPU_FBC_STATS = 530 + DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 + DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE = 532 + DCGM_FI_DEV_VGPU_PCI_ID = 533 + DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID = 534 + DCGM_FI_INTERNAL_FIELDS_0_START = 600 + DCGM_FI_INTERNAL_FIELDS_0_END = 699 + DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT = 701 + DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ = 702 + DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV = 703 + DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD = 704 + DCGM_FI_DEV_NVSWITCH_POWER_VDD = 705 + DCGM_FI_DEV_NVSWITCH_POWER_DVDD = 706 + DCGM_FI_DEV_NVSWITCH_POWER_HVDD = 707 + 
DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX = 780 + DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX = 781 + DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS = 782 + DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS = 783 + DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS = 784 + DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS = 785 + DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS = 786 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS = 787 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS = 788 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 = 789 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 = 790 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 = 791 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 = 792 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 = 793 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 = 794 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 = 795 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 = 796 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 = 797 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 = 798 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 = 799 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 = 800 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 = 801 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 = 802 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 = 803 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 = 804 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 = 805 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 = 806 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 = 807 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 = 808 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 = 809 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 = 810 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 = 811 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 = 812 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 = 813 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 = 814 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 = 815 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 = 816 + DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856 + DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857 + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT = 
858 + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN = 859 + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN = 860 + DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX = 861 + DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX = 862 + DCGM_FI_DEV_NVSWITCH_PHYS_ID = 863 + DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED = 864 + DCGM_FI_DEV_NVSWITCH_LINK_ID = 865 + DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN = 866 + DCGM_FI_DEV_NVSWITCH_PCIE_BUS = 867 + DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE = 868 + DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION = 869 + DCGM_FI_DEV_NVSWITCH_LINK_STATUS = 870 + DCGM_FI_DEV_NVSWITCH_LINK_TYPE = 871 + DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN = 872 + DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS = 873 + DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE = 874 + DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION = 875 + DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID = 876 + DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID = 877 + DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID = 878 + DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 + DCGM_FI_PROF_SM_ACTIVE = 1002 + DCGM_FI_PROF_SM_OCCUPANCY = 1003 + DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 + DCGM_FI_PROF_DRAM_ACTIVE = 1005 + DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 + DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 + DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 + DCGM_FI_PROF_PCIE_TX_BYTES = 1009 + DCGM_FI_PROF_PCIE_RX_BYTES = 1010 + DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 + DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 + DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE = 1013 + DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE = 1014 + DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE = 1015 + DCGM_FI_PROF_PIPE_INT_ACTIVE = 1016 + DCGM_FI_PROF_NVDEC0_ACTIVE = 1017 + DCGM_FI_PROF_NVDEC1_ACTIVE = 1018 + DCGM_FI_PROF_NVDEC2_ACTIVE = 1019 + DCGM_FI_PROF_NVDEC3_ACTIVE = 1020 + DCGM_FI_PROF_NVDEC4_ACTIVE = 1021 + DCGM_FI_PROF_NVDEC5_ACTIVE = 1022 + DCGM_FI_PROF_NVDEC6_ACTIVE = 1023 + DCGM_FI_PROF_NVDEC7_ACTIVE = 1024 + DCGM_FI_PROF_NVJPG0_ACTIVE = 1025 + DCGM_FI_PROF_NVJPG1_ACTIVE = 1026 + DCGM_FI_PROF_NVJPG2_ACTIVE = 1027 + DCGM_FI_PROF_NVJPG3_ACTIVE = 1028 + 
DCGM_FI_PROF_NVJPG4_ACTIVE = 1029 + DCGM_FI_PROF_NVJPG5_ACTIVE = 1030 + DCGM_FI_PROF_NVJPG6_ACTIVE = 1031 + DCGM_FI_PROF_NVJPG7_ACTIVE = 1032 + DCGM_FI_PROF_NVOFA0_ACTIVE = 1033 + DCGM_FI_PROF_NVLINK_L0_TX_BYTES = 1040 + DCGM_FI_PROF_NVLINK_L0_RX_BYTES = 1041 + DCGM_FI_PROF_NVLINK_L1_TX_BYTES = 1042 + DCGM_FI_PROF_NVLINK_L1_RX_BYTES = 1043 + DCGM_FI_PROF_NVLINK_L2_TX_BYTES = 1044 + DCGM_FI_PROF_NVLINK_L2_RX_BYTES = 1045 + DCGM_FI_PROF_NVLINK_L3_TX_BYTES = 1046 + DCGM_FI_PROF_NVLINK_L3_RX_BYTES = 1047 + DCGM_FI_PROF_NVLINK_L4_TX_BYTES = 1048 + DCGM_FI_PROF_NVLINK_L4_RX_BYTES = 1049 + DCGM_FI_PROF_NVLINK_L5_TX_BYTES = 1050 + DCGM_FI_PROF_NVLINK_L5_RX_BYTES = 1051 + DCGM_FI_PROF_NVLINK_L6_TX_BYTES = 1052 + DCGM_FI_PROF_NVLINK_L6_RX_BYTES = 1053 + DCGM_FI_PROF_NVLINK_L7_TX_BYTES = 1054 + DCGM_FI_PROF_NVLINK_L7_RX_BYTES = 1055 + DCGM_FI_PROF_NVLINK_L8_TX_BYTES = 1056 + DCGM_FI_PROF_NVLINK_L8_RX_BYTES = 1057 + DCGM_FI_PROF_NVLINK_L9_TX_BYTES = 1058 + DCGM_FI_PROF_NVLINK_L9_RX_BYTES = 1059 + DCGM_FI_PROF_NVLINK_L10_TX_BYTES = 1060 + DCGM_FI_PROF_NVLINK_L10_RX_BYTES = 1061 + DCGM_FI_PROF_NVLINK_L11_TX_BYTES = 1062 + DCGM_FI_PROF_NVLINK_L11_RX_BYTES = 1063 + DCGM_FI_PROF_NVLINK_L12_TX_BYTES = 1064 + DCGM_FI_PROF_NVLINK_L12_RX_BYTES = 1065 + DCGM_FI_PROF_NVLINK_L13_TX_BYTES = 1066 + DCGM_FI_PROF_NVLINK_L13_RX_BYTES = 1067 + DCGM_FI_PROF_NVLINK_L14_TX_BYTES = 1068 + DCGM_FI_PROF_NVLINK_L14_RX_BYTES = 1069 + DCGM_FI_PROF_NVLINK_L15_TX_BYTES = 1070 + DCGM_FI_PROF_NVLINK_L15_RX_BYTES = 1071 + DCGM_FI_PROF_NVLINK_L16_TX_BYTES = 1072 + DCGM_FI_PROF_NVLINK_L16_RX_BYTES = 1073 + DCGM_FI_PROF_NVLINK_L17_TX_BYTES = 1074 + DCGM_FI_PROF_NVLINK_L17_RX_BYTES = 1075 + DCGM_FI_DEV_CPU_UTIL_TOTAL = 1100 + DCGM_FI_DEV_CPU_UTIL_USER = 1101 + DCGM_FI_DEV_CPU_UTIL_NICE = 1102 + DCGM_FI_DEV_CPU_UTIL_SYS = 1103 + DCGM_FI_DEV_CPU_UTIL_IRQ = 1104 + DCGM_FI_DEV_CPU_TEMP_CURRENT = 1110 + DCGM_FI_DEV_CPU_TEMP_WARNING = 1111 + DCGM_FI_DEV_CPU_TEMP_CRITICAL = 1112 + DCGM_FI_DEV_CPU_CLOCK_CURRENT = 1120 + 
DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT = 1130 + DCGM_FI_DEV_CPU_POWER_LIMIT = 1131 + DCGM_FI_DEV_CPU_VENDOR = 1140 + DCGM_FI_DEV_CPU_MODEL = 1141 + DCGM_FI_MAX_FIELDS = 1142 + + DCGM_ST_OK = 0 + DCGM_ST_BADPARAM = -1 + DCGM_ST_GENERIC_ERROR = -3 + DCGM_ST_MEMORY = -4 + DCGM_ST_NOT_CONFIGURED = -5 + DCGM_ST_NOT_SUPPORTED = -6 + DCGM_ST_INIT_ERROR = -7 + DCGM_ST_NVML_ERROR = -8 + DCGM_ST_PENDING = -9 + DCGM_ST_UNINITIALIZED = -10 + DCGM_ST_TIMEOUT = -11 + DCGM_ST_VER_MISMATCH = -12 + DCGM_ST_UNKNOWN_FIELD = -13 + DCGM_ST_NO_DATA = -14 + DCGM_ST_STALE_DATA = -15 + DCGM_ST_NOT_WATCHED = -16 + DCGM_ST_NO_PERMISSION = -17 + DCGM_ST_GPU_IS_LOST = -18 + DCGM_ST_RESET_REQUIRED = -19 + DCGM_ST_FUNCTION_NOT_FOUND = -20 + DCGM_ST_CONNECTION_NOT_VALID = -21 + DCGM_ST_GPU_NOT_SUPPORTED = -22 + DCGM_ST_GROUP_INCOMPATIBLE = -23 + DCGM_ST_MAX_LIMIT = -24 + DCGM_ST_LIBRARY_NOT_FOUND = -25 + DCGM_ST_DUPLICATE_KEY = -26 + DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27 + DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28 + DCGM_ST_REQUIRES_ROOT = -29 + DCGM_ST_NVVS_ERROR = -30 + DCGM_ST_INSUFFICIENT_SIZE = -31 + DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32 + DCGM_ST_MODULE_NOT_LOADED = -33 + DCGM_ST_IN_USE = -34 + DCGM_ST_GROUP_IS_EMPTY = -35 + DCGM_ST_PROFILING_NOT_SUPPORTED = -36 + DCGM_ST_PROFILING_LIBRARY_ERROR = -37 + DCGM_ST_PROFILING_MULTI_PASS = -38 + DCGM_ST_DIAG_ALREADY_RUNNING = -39 + DCGM_ST_DIAG_BAD_JSON = -40 + DCGM_ST_DIAG_BAD_LAUNCH = -41 + DCGM_ST_DIAG_UNUSED = -42 + DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43 + DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44 + DCGM_ST_INSTANCE_NOT_FOUND = -45 + DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46 + DCGM_ST_CHILD_NOT_KILLED = -47 + DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48 + DCGM_ST_INSUFFICIENT_RESOURCES = -49 + DCGM_ST_PLUGIN_EXCEPTION = -50 + DCGM_ST_NVVS_ISOLATE_ERROR = -51 + DCGM_ST_NVVS_BINARY_NOT_FOUND = -52 + DCGM_ST_NVVS_KILLED = -53 + DCGM_ST_PAUSED = -54 + DCGM_ST_ALREADY_INITIALIZED = -55 +) + +var ( + DCGM_FI = map[string]Short{ + "DCGM_FT_BINARY": Short('b'), 
+ "DCGM_FT_DOUBLE": Short('d'), + "DCGM_FT_INT64": Short('i'), + "DCGM_FT_STRING": Short('s'), + "DCGM_FT_TIMESTAMP": Short('t'), + + "DCGM_FI_UNKNOWN": 0, + "DCGM_FI_DRIVER_VERSION": 1, + "DCGM_FI_NVML_VERSION": 2, + "DCGM_FI_PROCESS_NAME": 3, + "DCGM_FI_DEV_COUNT": 4, + "DCGM_FI_CUDA_DRIVER_VERSION": 5, + "DCGM_FI_DEV_NAME": 50, + "DCGM_FI_DEV_BRAND": 51, + "DCGM_FI_DEV_NVML_INDEX": 52, + "DCGM_FI_DEV_SERIAL": 53, + "DCGM_FI_DEV_UUID": 54, + "DCGM_FI_DEV_MINOR_NUMBER": 55, + "DCGM_FI_DEV_OEM_INFOROM_VER": 56, + "DCGM_FI_DEV_PCI_BUSID": 57, + "DCGM_FI_DEV_PCI_COMBINED_ID": 58, + "DCGM_FI_DEV_PCI_SUBSYS_ID": 59, + "DCGM_FI_GPU_TOPOLOGY_PCI": 60, + "DCGM_FI_GPU_TOPOLOGY_NVLINK": 61, + "DCGM_FI_GPU_TOPOLOGY_AFFINITY": 62, + "DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY": 63, + "DCGM_FI_DEV_COMPUTE_MODE": 65, + "DCGM_FI_DEV_PERSISTENCE_MODE": 66, + "DCGM_FI_DEV_MIG_MODE": 67, + "DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR": 68, + "DCGM_FI_DEV_MIG_MAX_SLICES": 69, + "DCGM_FI_DEV_CPU_AFFINITY_0": 70, + "DCGM_FI_DEV_CPU_AFFINITY_1": 71, + "DCGM_FI_DEV_CPU_AFFINITY_2": 72, + "DCGM_FI_DEV_CPU_AFFINITY_3": 73, + "DCGM_FI_DEV_CC_MODE": 74, + "DCGM_FI_DEV_MIG_ATTRIBUTES": 75, + "DCGM_FI_DEV_MIG_GI_INFO": 76, + "DCGM_FI_DEV_MIG_CI_INFO": 77, + "DCGM_FI_DEV_ECC_INFOROM_VER": 80, + "DCGM_FI_DEV_POWER_INFOROM_VER": 81, + "DCGM_FI_DEV_INFOROM_IMAGE_VER": 82, + "DCGM_FI_DEV_INFOROM_CONFIG_CHECK": 83, + "DCGM_FI_DEV_INFOROM_CONFIG_VALID": 84, + "DCGM_FI_DEV_VBIOS_VERSION": 85, + "DCGM_FI_DEV_MEM_AFFINITY_0": 86, + "DCGM_FI_DEV_MEM_AFFINITY_1": 87, + "DCGM_FI_DEV_MEM_AFFINITY_2": 88, + "DCGM_FI_DEV_MEM_AFFINITY_3": 89, + "DCGM_FI_DEV_BAR1_TOTAL": 90, + "DCGM_FI_SYNC_BOOST": 91, + "DCGM_FI_DEV_BAR1_USED": 92, + "DCGM_FI_DEV_BAR1_FREE": 93, + "DCGM_FI_DEV_SM_CLOCK": 100, + "DCGM_FI_DEV_MEM_CLOCK": 101, + "DCGM_FI_DEV_VIDEO_CLOCK": 102, + "DCGM_FI_DEV_APP_SM_CLOCK": 110, + "DCGM_FI_DEV_APP_MEM_CLOCK": 111, + "DCGM_FI_DEV_CLOCK_THROTTLE_REASONS": 112, + "DCGM_FI_DEV_MAX_SM_CLOCK": 113, + 
"DCGM_FI_DEV_MAX_MEM_CLOCK": 114, + "DCGM_FI_DEV_MAX_VIDEO_CLOCK": 115, + "DCGM_FI_DEV_AUTOBOOST": 120, + "DCGM_FI_DEV_SUPPORTED_CLOCKS": 130, + "DCGM_FI_DEV_MEMORY_TEMP": 140, + "DCGM_FI_DEV_GPU_TEMP": 150, + "DCGM_FI_DEV_MEM_MAX_OP_TEMP": 151, + "DCGM_FI_DEV_GPU_MAX_OP_TEMP": 152, + "DCGM_FI_DEV_POWER_USAGE": 155, + "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": 156, + "DCGM_FI_DEV_POWER_USAGE_INSTANT": 157, + "DCGM_FI_DEV_SLOWDOWN_TEMP": 158, + "DCGM_FI_DEV_SHUTDOWN_TEMP": 159, + "DCGM_FI_DEV_POWER_MGMT_LIMIT": 160, + "DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN": 161, + "DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX": 162, + "DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF": 163, + "DCGM_FI_DEV_ENFORCED_POWER_LIMIT": 164, + "DCGM_FI_DEV_PSTATE": 190, + "DCGM_FI_DEV_FAN_SPEED": 191, + "DCGM_FI_DEV_PCIE_TX_THROUGHPUT": 200, + "DCGM_FI_DEV_PCIE_RX_THROUGHPUT": 201, + "DCGM_FI_DEV_PCIE_REPLAY_COUNTER": 202, + "DCGM_FI_DEV_GPU_UTIL": 203, + "DCGM_FI_DEV_MEM_COPY_UTIL": 204, + "DCGM_FI_DEV_ACCOUNTING_DATA": 205, + "DCGM_FI_DEV_ENC_UTIL": 206, + "DCGM_FI_DEV_DEC_UTIL": 207, + "DCGM_FI_DEV_XID_ERRORS": 230, + "DCGM_FI_DEV_PCIE_MAX_LINK_GEN": 235, + "DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH": 236, + "DCGM_FI_DEV_PCIE_LINK_GEN": 237, + "DCGM_FI_DEV_PCIE_LINK_WIDTH": 238, + "DCGM_FI_DEV_POWER_VIOLATION": 240, + "DCGM_FI_DEV_THERMAL_VIOLATION": 241, + "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": 242, + "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": 243, + "DCGM_FI_DEV_LOW_UTIL_VIOLATION": 244, + "DCGM_FI_DEV_RELIABILITY_VIOLATION": 245, + "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": 246, + "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": 247, + "DCGM_FI_DEV_FB_TOTAL": 250, + "DCGM_FI_DEV_FB_FREE": 251, + "DCGM_FI_DEV_FB_USED": 252, + "DCGM_FI_DEV_FB_RESERVED": 253, + "DCGM_FI_DEV_FB_USED_PERCENT": 254, + "DCGM_FI_DEV_C2C_LINK_COUNT": 285, + "DCGM_FI_DEV_C2C_LINK_STATUS": 286, + "DCGM_FI_DEV_C2C_MAX_BANDWIDTH": 287, + "DCGM_FI_DEV_ECC_CURRENT": 300, + "DCGM_FI_DEV_ECC_PENDING": 301, + "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": 310, + 
"DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": 311, + "DCGM_FI_DEV_ECC_SBE_AGG_TOTAL": 312, + "DCGM_FI_DEV_ECC_DBE_AGG_TOTAL": 313, + "DCGM_FI_DEV_ECC_SBE_VOL_L1": 314, + "DCGM_FI_DEV_ECC_DBE_VOL_L1": 315, + "DCGM_FI_DEV_ECC_SBE_VOL_L2": 316, + "DCGM_FI_DEV_ECC_DBE_VOL_L2": 317, + "DCGM_FI_DEV_ECC_SBE_VOL_DEV": 318, + "DCGM_FI_DEV_ECC_DBE_VOL_DEV": 319, + "DCGM_FI_DEV_ECC_SBE_VOL_REG": 320, + "DCGM_FI_DEV_ECC_DBE_VOL_REG": 321, + "DCGM_FI_DEV_ECC_SBE_VOL_TEX": 322, + "DCGM_FI_DEV_ECC_DBE_VOL_TEX": 323, + "DCGM_FI_DEV_ECC_SBE_AGG_L1": 324, + "DCGM_FI_DEV_ECC_DBE_AGG_L1": 325, + "DCGM_FI_DEV_ECC_SBE_AGG_L2": 326, + "DCGM_FI_DEV_ECC_DBE_AGG_L2": 327, + "DCGM_FI_DEV_ECC_SBE_AGG_DEV": 328, + "DCGM_FI_DEV_ECC_DBE_AGG_DEV": 329, + "DCGM_FI_DEV_ECC_SBE_AGG_REG": 330, + "DCGM_FI_DEV_ECC_DBE_AGG_REG": 331, + "DCGM_FI_DEV_ECC_SBE_AGG_TEX": 332, + "DCGM_FI_DEV_ECC_DBE_AGG_TEX": 333, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX": 385, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH": 386, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL": 387, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW": 388, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE": 389, + "DCGM_FI_DEV_RETIRED_SBE": 390, + "DCGM_FI_DEV_RETIRED_DBE": 391, + "DCGM_FI_DEV_RETIRED_PENDING": 392, + "DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS": 393, + "DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS": 394, + "DCGM_FI_DEV_ROW_REMAP_FAILURE": 395, + "DCGM_FI_DEV_ROW_REMAP_PENDING": 396, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0": 400, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1": 401, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2": 402, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3": 403, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4": 404, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5": 405, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL": 409, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0": 410, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1": 411, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2": 412, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3": 413, + 
"DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4": 414, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5": 415, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL": 419, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0": 420, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1": 421, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2": 422, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3": 423, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4": 424, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5": 425, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL": 429, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0": 430, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1": 431, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2": 432, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3": 433, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4": 434, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5": 435, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL": 439, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L0": 440, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L1": 441, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L2": 442, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L3": 443, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L4": 444, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L5": 445, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL": 449, + "DCGM_FI_DEV_GPU_NVLINK_ERRORS": 450, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6": 451, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7": 452, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8": 453, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9": 454, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10": 455, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11": 456, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6": 457, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7": 458, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8": 459, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9": 460, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10": 461, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11": 462, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6": 463, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7": 464, + 
"DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8": 465, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9": 466, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10": 467, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11": 468, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6": 469, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7": 470, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8": 471, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9": 472, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10": 473, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11": 474, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L6": 475, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L7": 476, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L8": 477, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L9": 478, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L10": 479, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L11": 480, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12": 406, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13": 407, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14": 408, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15": 481, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16": 482, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17": 483, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12": 416, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13": 417, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14": 418, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15": 484, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16": 485, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17": 486, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12": 426, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13": 427, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14": 428, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15": 487, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16": 488, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17": 489, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12": 436, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13": 437, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14": 438, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15": 491, + 
"DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16": 492, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17": 493, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L12": 446, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L13": 447, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L14": 448, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L15": 494, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L16": 495, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L17": 496, + "DCGM_FI_DEV_VIRTUAL_MODE": 500, + "DCGM_FI_DEV_SUPPORTED_TYPE_INFO": 501, + "DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS": 502, + "DCGM_FI_DEV_VGPU_INSTANCE_IDS": 503, + "DCGM_FI_DEV_VGPU_UTILIZATIONS": 504, + "DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION": 505, + "DCGM_FI_DEV_ENC_STATS": 506, + "DCGM_FI_DEV_FBC_STATS": 507, + "DCGM_FI_DEV_FBC_SESSIONS_INFO": 508, + "DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS": 509, + "DCGM_FI_DEV_VGPU_TYPE_INFO": 510, + "DCGM_FI_DEV_VGPU_TYPE_NAME": 511, + "DCGM_FI_DEV_VGPU_TYPE_CLASS": 512, + "DCGM_FI_DEV_VGPU_TYPE_LICENSE": 513, + "DCGM_FI_DEV_VGPU_VM_ID": 520, + "DCGM_FI_DEV_VGPU_VM_NAME": 521, + "DCGM_FI_DEV_VGPU_TYPE": 522, + "DCGM_FI_DEV_VGPU_UUID": 523, + "DCGM_FI_DEV_VGPU_DRIVER_VERSION": 524, + "DCGM_FI_DEV_VGPU_MEMORY_USAGE": 525, + "DCGM_FI_DEV_VGPU_LICENSE_STATUS": 526, + "DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT": 527, + "DCGM_FI_DEV_VGPU_ENC_STATS": 528, + "DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO": 529, + "DCGM_FI_DEV_VGPU_FBC_STATS": 530, + "DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO": 531, + "DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE": 532, + "DCGM_FI_DEV_VGPU_PCI_ID": 533, + "DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID": 534, + "DCGM_FI_INTERNAL_FIELDS_0_START": 600, + "DCGM_FI_INTERNAL_FIELDS_0_END": 699, + "DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT": 701, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ": 702, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV": 703, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD": 704, + "DCGM_FI_DEV_NVSWITCH_POWER_VDD": 705, + "DCGM_FI_DEV_NVSWITCH_POWER_DVDD": 706, + "DCGM_FI_DEV_NVSWITCH_POWER_HVDD": 707, + "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX": 780, + 
"DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX": 781, + "DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS": 782, + "DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS": 783, + "DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS": 784, + "DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS": 785, + "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS": 786, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS": 787, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS": 788, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0": 789, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1": 790, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2": 791, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3": 792, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0": 793, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1": 794, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2": 795, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3": 796, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0": 797, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1": 798, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2": 799, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3": 800, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0": 801, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1": 802, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2": 803, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3": 804, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0": 805, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1": 806, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2": 807, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3": 808, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0": 809, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1": 810, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2": 811, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3": 812, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0": 813, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1": 814, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2": 815, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3": 816, + "DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS": 856, + "DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS": 857, + 
"DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT": 858, + "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN": 859, + "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN": 860, + "DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX": 861, + "DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX": 862, + "DCGM_FI_DEV_NVSWITCH_PHYS_ID": 863, + "DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED": 864, + "DCGM_FI_DEV_NVSWITCH_LINK_ID": 865, + "DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN": 866, + "DCGM_FI_DEV_NVSWITCH_PCIE_BUS": 867, + "DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE": 868, + "DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION": 869, + "DCGM_FI_DEV_NVSWITCH_LINK_STATUS": 870, + "DCGM_FI_DEV_NVSWITCH_LINK_TYPE": 871, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN": 872, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS": 873, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE": 874, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION": 875, + "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID": 876, + "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID": 877, + "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID": 878, + "DCGM_FI_PROF_GR_ENGINE_ACTIVE": 1001, + "DCGM_FI_PROF_SM_ACTIVE": 1002, + "DCGM_FI_PROF_SM_OCCUPANCY": 1003, + "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": 1004, + "DCGM_FI_PROF_DRAM_ACTIVE": 1005, + "DCGM_FI_PROF_PIPE_FP64_ACTIVE": 1006, + "DCGM_FI_PROF_PIPE_FP32_ACTIVE": 1007, + "DCGM_FI_PROF_PIPE_FP16_ACTIVE": 1008, + "DCGM_FI_PROF_PCIE_TX_BYTES": 1009, + "DCGM_FI_PROF_PCIE_RX_BYTES": 1010, + "DCGM_FI_PROF_NVLINK_TX_BYTES": 1011, + "DCGM_FI_PROF_NVLINK_RX_BYTES": 1012, + "DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE": 1013, + "DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE": 1014, + "DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE": 1015, + "DCGM_FI_PROF_PIPE_INT_ACTIVE": 1016, + "DCGM_FI_PROF_NVDEC0_ACTIVE": 1017, + "DCGM_FI_PROF_NVDEC1_ACTIVE": 1018, + "DCGM_FI_PROF_NVDEC2_ACTIVE": 1019, + "DCGM_FI_PROF_NVDEC3_ACTIVE": 1020, + "DCGM_FI_PROF_NVDEC4_ACTIVE": 1021, + "DCGM_FI_PROF_NVDEC5_ACTIVE": 1022, + "DCGM_FI_PROF_NVDEC6_ACTIVE": 1023, + "DCGM_FI_PROF_NVDEC7_ACTIVE": 1024, + "DCGM_FI_PROF_NVJPG0_ACTIVE": 1025, 
+ "DCGM_FI_PROF_NVJPG1_ACTIVE": 1026, + "DCGM_FI_PROF_NVJPG2_ACTIVE": 1027, + "DCGM_FI_PROF_NVJPG3_ACTIVE": 1028, + "DCGM_FI_PROF_NVJPG4_ACTIVE": 1029, + "DCGM_FI_PROF_NVJPG5_ACTIVE": 1030, + "DCGM_FI_PROF_NVJPG6_ACTIVE": 1031, + "DCGM_FI_PROF_NVJPG7_ACTIVE": 1032, + "DCGM_FI_PROF_NVOFA0_ACTIVE": 1033, + "DCGM_FI_PROF_NVLINK_L0_TX_BYTES": 1040, + "DCGM_FI_PROF_NVLINK_L0_RX_BYTES": 1041, + "DCGM_FI_PROF_NVLINK_L1_TX_BYTES": 1042, + "DCGM_FI_PROF_NVLINK_L1_RX_BYTES": 1043, + "DCGM_FI_PROF_NVLINK_L2_TX_BYTES": 1044, + "DCGM_FI_PROF_NVLINK_L2_RX_BYTES": 1045, + "DCGM_FI_PROF_NVLINK_L3_TX_BYTES": 1046, + "DCGM_FI_PROF_NVLINK_L3_RX_BYTES": 1047, + "DCGM_FI_PROF_NVLINK_L4_TX_BYTES": 1048, + "DCGM_FI_PROF_NVLINK_L4_RX_BYTES": 1049, + "DCGM_FI_PROF_NVLINK_L5_TX_BYTES": 1050, + "DCGM_FI_PROF_NVLINK_L5_RX_BYTES": 1051, + "DCGM_FI_PROF_NVLINK_L6_TX_BYTES": 1052, + "DCGM_FI_PROF_NVLINK_L6_RX_BYTES": 1053, + "DCGM_FI_PROF_NVLINK_L7_TX_BYTES": 1054, + "DCGM_FI_PROF_NVLINK_L7_RX_BYTES": 1055, + "DCGM_FI_PROF_NVLINK_L8_TX_BYTES": 1056, + "DCGM_FI_PROF_NVLINK_L8_RX_BYTES": 1057, + "DCGM_FI_PROF_NVLINK_L9_TX_BYTES": 1058, + "DCGM_FI_PROF_NVLINK_L9_RX_BYTES": 1059, + "DCGM_FI_PROF_NVLINK_L10_TX_BYTES": 1060, + "DCGM_FI_PROF_NVLINK_L10_RX_BYTES": 1061, + "DCGM_FI_PROF_NVLINK_L11_TX_BYTES": 1062, + "DCGM_FI_PROF_NVLINK_L11_RX_BYTES": 1063, + "DCGM_FI_PROF_NVLINK_L12_TX_BYTES": 1064, + "DCGM_FI_PROF_NVLINK_L12_RX_BYTES": 1065, + "DCGM_FI_PROF_NVLINK_L13_TX_BYTES": 1066, + "DCGM_FI_PROF_NVLINK_L13_RX_BYTES": 1067, + "DCGM_FI_PROF_NVLINK_L14_TX_BYTES": 1068, + "DCGM_FI_PROF_NVLINK_L14_RX_BYTES": 1069, + "DCGM_FI_PROF_NVLINK_L15_TX_BYTES": 1070, + "DCGM_FI_PROF_NVLINK_L15_RX_BYTES": 1071, + "DCGM_FI_PROF_NVLINK_L16_TX_BYTES": 1072, + "DCGM_FI_PROF_NVLINK_L16_RX_BYTES": 1073, + "DCGM_FI_PROF_NVLINK_L17_TX_BYTES": 1074, + "DCGM_FI_PROF_NVLINK_L17_RX_BYTES": 1075, + "DCGM_FI_DEV_CPU_UTIL_TOTAL": 1100, + "DCGM_FI_DEV_CPU_UTIL_USER": 1101, + "DCGM_FI_DEV_CPU_UTIL_NICE": 1102, + 
"DCGM_FI_DEV_CPU_UTIL_SYS": 1103, + "DCGM_FI_DEV_CPU_UTIL_IRQ": 1104, + "DCGM_FI_DEV_CPU_TEMP_CURRENT": 1110, + "DCGM_FI_DEV_CPU_TEMP_WARNING": 1111, + "DCGM_FI_DEV_CPU_TEMP_CRITICAL": 1112, + "DCGM_FI_DEV_CPU_CLOCK_CURRENT": 1120, + "DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT": 1130, + "DCGM_FI_DEV_CPU_POWER_LIMIT": 1131, + "DCGM_FI_DEV_CPU_VENDOR": 1140, + "DCGM_FI_DEV_CPU_MODEL": 1141, + "DCGM_FI_MAX_FIELDS": 1142, + } +) + +var ( + OLD_DCGM_FI = map[string]Short{ + "dcgm_sm_clock": 100, + "dcgm_memory_clock": 101, + "dcgm_memory_temp": 140, + "dcgm_gpu_temp": 150, + "dcgm_power_usage": 155, + "dcgm_total_energy_consumption": 156, + "dcgm_pcie_tx_throughput": 200, + "dcgm_pcie_rx_throughput": 201, + "dcgm_pcie_replay_counter": 202, + "dcgm_gpu_utilization": 203, + "dcgm_mem_copy_utilization": 204, + "dcgm_enc_utilization": 206, + "dcgm_dec_utilization": 207, + "dcgm_xid_errors": 230, + "dcgm_power_violation": 240, + "dcgm_thermal_violation": 241, + "dcgm_sync_boost_violation": 242, + "dcgm_board_limit_violation": 243, + "dcgm_low_util_violation": 244, + "dcgm_reliability_violation": 245, + "dcgm_fb_free": 251, + "dcgm_fb_used": 252, + "dcgm_ecc_sbe_volatile_total": 310, + "dcgm_ecc_dbe_volatile_total": 311, + "dcgm_ecc_sbe_aggregate_total": 312, + "dcgm_ecc_dbe_aggregate_total": 313, + "dcgm_retired_pages_sbe": 390, + "dcgm_retired_pages_dbe": 391, + "dcgm_retired_pages_pending": 392, + "dcgm_nvlink_flit_crc_error_count_total": 409, + "dcgm_nvlink_data_crc_error_count_total": 419, + "dcgm_nvlink_replay_error_count_total": 429, + "dcgm_nvlink_recovery_error_count_total": 439, + "dcgm_nvlink_bandwidth_total": 449, + "dcgm_fi_prof_gr_engine_active": 1001, + "dcgm_fi_prof_sm_active": 1002, + "dcgm_fi_prof_sm_occupancy": 1003, + "dcgm_fi_prof_pipe_tensor_active": 1004, + "dcgm_fi_prof_dram_active": 1005, + "dcgm_fi_prof_pcie_tx_bytes": 1009, + "dcgm_fi_prof_pcie_rx_bytes": 1010, + } +) + +const ( + DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001) +) diff --git 
a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/cpu.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/cpu.go new file mode 100644 index 0000000000..02c1310062 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/cpu.go @@ -0,0 +1,69 @@ +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "fmt" + "unsafe" +) + +/* + *See dcgm_structs.h + * DCGM_CPU_CORE_BITMASK_COUNT_V1 (DCGM_MAX_NUM_CPU_CORES / sizeof(uint64_t) / CHAR_BIT) + * or + * 1024 / 8 / 8 + */ + +const ( + MAX_NUM_CPU_CORES uint = C.DCGM_MAX_NUM_CPU_CORES + MAX_NUM_CPUS uint = C.DCGM_MAX_NUM_CPUS + CHAR_BIT uint = C.CHAR_BIT + MAX_CPU_CORE_BITMASK_COUNT uint = 1024 / 8 / 8 +) + +type CpuHierarchyCpu_v1 struct { + CpuId uint + OwnedCores []uint64 +} + +type CpuHierarchy_v1 struct { + Version uint + NumCpus uint + Cpus [MAX_NUM_CPUS]CpuHierarchyCpu_v1 +} + +func GetCpuHierarchy() (hierarchy CpuHierarchy_v1, err error) { + var c_hierarchy C.dcgmCpuHierarchy_v1 + c_hierarchy.version = C.dcgmCpuHierarchy_version1 + ptr_hierarchy := (*C.dcgmCpuHierarchy_v1)(unsafe.Pointer(&c_hierarchy)) + result := C.dcgmGetCpuHierarchy(handle.handle, ptr_hierarchy) + + if err = errorString(result); err != nil { + return toCpuHierarchy(c_hierarchy), fmt.Errorf("Error retrieving DCGM MIG hierarchy: %s", err) + } + + return toCpuHierarchy(c_hierarchy), nil +} + +func toCpuHierarchy(c_hierarchy C.dcgmCpuHierarchy_v1) CpuHierarchy_v1 { + var hierarchy CpuHierarchy_v1 + hierarchy.Version = uint(c_hierarchy.version) + hierarchy.NumCpus = uint(c_hierarchy.numCpus) + for i := uint(0); i < hierarchy.NumCpus; i++ { + bits := make([]uint64, MAX_CPU_CORE_BITMASK_COUNT) + + for j := uint(0); j < MAX_CPU_CORE_BITMASK_COUNT; j++ { + bits[j] = uint64(c_hierarchy.cpus[i].ownedCores.bitmask[j]) + } + + hierarchy.Cpus[i] = CpuHierarchyCpu_v1{ + CpuId: uint(c_hierarchy.cpus[i].cpuId), + OwnedCores: bits, + } + } + + return hierarchy +} diff --git 
a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_agent.h b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_agent.h new file mode 100644 index 0000000000..d1eba629ce --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_agent.h @@ -0,0 +1,2013 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DCGM_AGENT_H +#define DCGM_AGENT_H + +#define DCGM_PUBLIC_API +#include "dcgm_structs.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_Admin Administrative + * + * This chapter describes the administration interfaces for DCGM. + * It is the user's responsibility to call \ref dcgmInit() before calling any other methods, + * and \ref dcgmShutdown() once DCGM is no longer being used. The APIs in Administrative module + * can be broken down into following categories: + * @{ + */ +/***************************************************************************************************/ + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_Admin_InitShut Init and Shutdown + * + * Describes APIs to Initialize and Shutdown the DCGM Engine. 
+ * @{ + */ +/***************************************************************************************************/ + +/** + * This method is used to initialize DCGM within this process. This must be called before + * dcgmStartEmbedded() or dcgmConnect() + * + * * @return + * - \ref DCGM_ST_OK if DCGM has been properly initialized + * - \ref DCGM_ST_INIT_ERROR if there was an error initializing the library + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmInit(void); + +/** + * This method is used to shut down DCGM. Any embedded host engines or remote connections will automatically + * be shut down as well. + * + * @return + * - \ref DCGM_ST_OK if DCGM has been properly shut down + * - \ref DCGM_ST_UNINITIALIZED if the library was not shut down properly + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmShutdown(void); + +/** + * Start an embedded host engine agent within this process. + * + * The agent is loaded as a shared library. This mode is provided to avoid any + * extra jitter associated with an additional autonomous agent needs to be managed. In + * this mode, the user has to periodically call APIs such as \ref dcgmPolicyTrigger and + * \ref dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and + * operations needed for policy management. + * + * @param opMode IN: Collect data automatically or manually when asked by the user. + * @param pDcgmHandle OUT: DCGM Handle to use for API calls + * + * @return + * - \ref DCGM_ST_OK if DCGM was started successfully within our process + * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit yet + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmStartEmbedded(dcgmOperationMode_t opMode, dcgmHandle_t *pDcgmHandle); + +/** + * Start an embedded host engine agent within this process. + * + * The agent is loaded as a shared library. This mode is provided to avoid any + * extra jitter associated with an additional autonomous agent needs to be managed. 
In + * this mode, the user has to periodically call APIs such as \c dcgmPolicyTrigger and + * \c dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and + * operations needed for policy management. + * + * @param[in,out] params A pointer to either \c dcgmStartEmbeddedV2Params_v1 or \c dcgmStartEmbeddedV2Params_v2. + * + * @return \c DCGM_ST_OK if DCGM was started successfully within our process + * @return \c DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \c dcgmInit yet + * @note This function has a versioned argument that can be actually called with two different types. The behavior will + * depend on the params->version value. + * @see dcgmStartEmbeddedV2Params_v1 + * @see dcgmStartEmbeddedV2Params_v2 + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmStartEmbedded_v2(dcgmStartEmbeddedV2Params_v1 *params); + +/** + * Stop the embedded host engine within this process that was started with dcgmStartEmbedded + * + * @param pDcgmHandle IN : DCGM Handle of the embedded host engine that came from dcgmStartEmbedded + * + * @return + * - \ref DCGM_ST_OK if DCGM was stopped successfully within our process + * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit or + * the embedded host engine was not running. + * - \ref DCGM_ST_BADPARAM if an invalid parameter was provided + * - \ref DCGM_ST_INIT_ERROR if an error occurred while trying to start the host engine. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmStopEmbedded(dcgmHandle_t pDcgmHandle); + +/** + * This method is used to connect to a stand-alone host engine process. Remote host engines are started + * by running the nv-hostengine command. + * + * NOTE: dcgmConnect_v2 provides additional connection options. + * + * @param ipAddress IN: Valid IP address for the remote host engine to connect to. 
+ * If ipAddress is specified as x.x.x.x it will attempt to connect to the default + * port specified by DCGM_HE_PORT_NUMBER + * If ipAddress is specified as x.x.x.x:yyyy it will attempt to connect to the + * port specified by yyyy + * @param pDcgmHandle OUT: DCGM Handle of the remote host engine + * + * @return + * - \ref DCGM_ST_OK if we successfully connected to the remote host engine + * - \ref DCGM_ST_CONNECTION_NOT_VALID if the remote host engine could not be reached + * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit. + * - \ref DCGM_ST_BADPARAM if pDcgmHandle is NULL or ipAddress is invalid + * - \ref DCGM_ST_INIT_ERROR if DCGM encountered an error while initializing the remote client library + * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmConnect(const char *ipAddress, dcgmHandle_t *pDcgmHandle); + +/** + * This method is used to connect to a stand-alone host engine process. Remote host engines are started + * by running the nv-hostengine command. + * + * @param ipAddress IN: Valid IP address for the remote host engine to connect to. + * If ipAddress is specified as x.x.x.x it will attempt to connect to the default port + * specified by DCGM_HE_PORT_NUMBER. + * If ipAddress is specified as x.x.x.x:yyyy it will attempt to connect to the port + * specified by yyyy + * @param connectParams IN: Additional connection parameters. See \ref dcgmConnectV2Params_t for details. + * @param pDcgmHandle OUT: DCGM Handle of the remote host engine + * + * @return + * - \ref DCGM_ST_OK if we successfully connected to the remote host engine + * - \ref DCGM_ST_CONNECTION_NOT_VALID if the remote host engine could not be reached + * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit. 
+ * - \ref DCGM_ST_BADPARAM if pDcgmHandle is NULL or ipAddress is invalid + * - \ref DCGM_ST_INIT_ERROR if DCGM encountered an error while initializing the remote client library + * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmConnect_v2(const char *ipAddress, + dcgmConnectV2Params_t *connectParams, + dcgmHandle_t *pDcgmHandle); + +/** + * This method is used to disconnect from a stand-alone host engine process. + * + * @param pDcgmHandle IN: DCGM Handle that came from dcgmConnect + * + * @return + * - \ref DCGM_ST_OK if we successfully disconnected from the host engine + * - \ref DCGM_ST_UNINITIALIZED if DCGM has not been initialized with \ref dcgmInit + * - \ref DCGM_ST_BADPARAM if pDcgmHandle is not a valid DCGM handle + * - \ref DCGM_ST_GENERIC_ERROR if an unspecified internal error occurred + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmDisconnect(dcgmHandle_t pDcgmHandle); + + +/** @} */ // Closing for DCGMAPI_Admin_InitShut + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_Admin_Info Auxilary information about DCGM engine. + * + * Describes APIs to get generic information about the DCGM Engine. + * @{ + */ +/***************************************************************************************************/ + +/** + * This method is used to return information about the build environment where DCGM was built. + * + * @param pVersionInfo OUT: Build environment information + * + * @return + * - \ref DCGM_ST_OK if build information is sucessfully obtained + * - \ref DCGM_ST_BADPARAM if pVersionInfo is null + * - \ref DCGM_ST_VER_MISMATCH if the expected and provided versions of dcgmVersionInfo_t do not match + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmVersionInfo(dcgmVersionInfo_t *pVersionInfo); + +/** + * This method is used to return information about the build environment of the hostengine. 
+ * + * @param pDcgmHandle IN: DCGM Handle that came from dcgmConnect + * @param pVersionInfo OUT: Build environment information + * + * @return + * - \ref DCGM_ST_OK if build information is sucessfully obtained + * - \ref DCGM_ST_BADPARAM if pVersionInfo is null + * - \ref DCGM_ST_VER_MISMATCH if the expected and provided versions of dcgmVersionInfo_t do not match + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmHostengineVersionInfo(dcgmHandle_t pDcgmHandle, dcgmVersionInfo_t *pVersionInfo); + + +/** + * This method is used to set the logging severity on HostEngine for the specified logger + * + * @param pDcgmHandle IN: DCGM Handle + * @param logging IN: dcgmSettingsSetLoggingSeverity_t struct containing the target logger and severity + * + * @return + * - \ref DCGM_ST_OK Severity successfuly set + * - \ref DCGM_ST_BADPARAM Bad logger/severity string + * - \ref DCGM_ST_VER_MISMATCH if the expected and provided versions of dcgmSettingsSetLoggingSeverity_t + * do not match + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmHostengineSetLoggingSeverity(dcgmHandle_t pDcgmHandle, + dcgmSettingsSetLoggingSeverity_t *logging); + +/** + * This function is used to return whether or not the host engine considers itself healthy + * + * @param[in] pDcgmHandle - the handle to DCGM + * @param[out] heHealth - struct describing the health of the hostengine. if heHealth.hostengineHealth is 0, + * then the hostengine is healthy. Non-zero indicates not healthy with error codes + * determining the cause. + * + * @return + * - \ref DCGM_ST_OK Able to gauge health + * - \ref DCGM_ST_BADPARAM isHealthy is not a valid pointer + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmHostengineIsHealthy(dcgmHandle_t pDcgmHandle, dcgmHostengineHealth_t *heHealth); + + +/** + * This function describes DCGM error codes in human readable form + * + * @param[in] result - DCGM return code to describe + * + * @return + * - Human readable string with the DCGM error code description if the code is valid. 
+ * - nullptr if there is not such error code + */ +DCGM_PUBLIC_API const char *errorString(dcgmReturn_t result); + +/** + * This function describes DCGM Module by given Module ID + * + * @param id[in] - Module ID to name. + * @param name[out] - Module name will be provided via this argument. + * @return + * - \ref DCGM_ST_OK Module name has valid value + * - \ref DCGM_ST_BADPARAM There is no module with specified ID. Name value is not changed. + */ +DCGM_PUBLIC_API dcgmReturn_t dcgmModuleIdToName(dcgmModuleId_t id, char const **name); + +/** @} */ // Closing DCGMAPI_Admin_Info + +/** @} */ // Closing for DCGMAPI_Admin + + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_SYS System + * @{ + * This chapter describes the APIs used to identify entities on the node, grouping functions to + * provide mechanism to operate on a group of entities, and status management APIs in + * order to get individual statuses for each operation. The APIs in System module can be + * broken down into following categories: + */ +/***************************************************************************************************/ + +/***************************************************************************************************/ +/** @defgroup DCGM_DISCOVERY Discovery + * The following APIs are used to discover GPUs and their attributes on a Node. + * @{ + */ +/***************************************************************************************************/ + +/** + * This method is used to get identifiers corresponding to all the devices on the system. The + * identifier represents DCGM GPU Id corresponding to each GPU on the system and is immutable during + * the lifespan of the engine. The list should be queried again if the engine is restarted. + * + * The GPUs returned from this function include gpuIds of GPUs that are not supported by DCGM. 
+ * To only get gpuIds of GPUs that are supported by DCGM, use dcgmGetAllSupportedDevices(). + * + * @param pDcgmHandle IN: DCGM Handle + * @param gpuIdList OUT: Array reference to fill GPU Ids present on the system. + * @param count OUT: Number of GPUs returned in \a gpuIdList. + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_BADPARAM if \a gpuIdList or \a count were not valid. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetAllDevices(dcgmHandle_t pDcgmHandle, + unsigned int gpuIdList[DCGM_MAX_NUM_DEVICES], + int *count); + +/** + * This method is used to get identifiers corresponding to all the DCGM-supported devices on the system. The + * identifier represents DCGM GPU Id corresponding to each GPU on the system and is immutable during + * the lifespan of the engine. The list should be queried again if the engine is restarted. + * + * The GPUs returned from this function ONLY includes gpuIds of GPUs that are supported by DCGM. + * To get gpuIds of all GPUs in the system, use dcgmGetAllDevices(). + * + * + * @param pDcgmHandle IN: DCGM Handle + * @param gpuIdList OUT: Array reference to fill GPU Ids present on the system. + * @param count OUT: Number of GPUs returned in \a gpuIdList. + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_BADPARAM if \a gpuIdList or \a count were not valid. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetAllSupportedDevices(dcgmHandle_t pDcgmHandle, + unsigned int gpuIdList[DCGM_MAX_NUM_DEVICES], + int *count); + +/** + * Gets device attributes corresponding to the \a gpuId. If operation is not successful for any of + * the requested fields then the field is populated with one of DCGM_BLANK_VALUES defined in + * dcgm_structs.h. + * + * @param pDcgmHandle IN: DCGM Handle + * @param gpuId IN: GPU Id corresponding to which the attributes should be fetched + * @param pDcgmAttr IN/OUT: Device attributes corresponding to \a gpuId.
pDcgmAttr->version should be set to + * \ref dcgmDeviceAttributes_version before this call. + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_VER_MISMATCH if pDcgmAttr->version is not set or is invalid. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetDeviceAttributes(dcgmHandle_t pDcgmHandle, + unsigned int gpuId, + dcgmDeviceAttributes_t *pDcgmAttr); + +/** + * Gets the list of entities that exist for a given entity group. This API can be used in place of + * \ref dcgmGetAllDevices. + * + * @param dcgmHandle IN: DCGM Handle + * @param entityGroup IN: Entity group to list entities of + * @param entities OUT: Array of entities for entityGroup + * @param numEntities IN/OUT: Upon calling, this should be the number of entities that entityList[] can hold. Upon + * return, this will contain the number of entities actually saved to entityList. + * @param flags IN: Flags to modify the behavior of this request. + * See DCGM_GEGE_FLAG_* #defines in dcgm_structs.h + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_INSUFFICIENT_SIZE if numEntities was not large enough to hold the number of entities in the + * entityGroup. numEntities will contain the capacity needed to complete this + * request successfully. + * - \ref DCGM_ST_NOT_SUPPORTED if the given entityGroup does not support enumeration. + * - \ref DCGM_ST_BADPARAM if any parameter is invalid + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetEntityGroupEntities(dcgmHandle_t dcgmHandle, + dcgm_field_entity_group_t entityGroup, + dcgm_field_eid_t *entities, + int *numEntities, + unsigned int flags); + +/** + * Gets the hierarchy of GPUs, GPU Instances, and Compute Instances by populating a list of each entity with + * a reference to their parent + * + * @param dcgmHandle IN: DCGM Handle + * @param entities OUT: array of entities in the hierarchy + * @param numEntities IN/OUT: Upon calling, this should be the capacity of entities. 
+ * Upon return, this will contain the number of entities actually saved to entities. + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_VER_MISMATCH if the struct version is incorrect + * - \ref DCGM_ST_BADPARAM if any parameter is invalid + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetGpuInstanceHierarchy(dcgmHandle_t dcgmHandle, dcgmMigHierarchy_v2 *hierarchy); + +/** + * Get the NvLink link status for every NvLink in this system. This includes the NvLinks of both GPUs and + * NvSwitches. Note that only NvSwitches and GPUs that are visible to the current environment will be + * returned in this structure. + * + * @param dcgmHandle IN: DCGM Handle + * @param linkStatus OUT: Structure in which to store NvLink link statuses. .version should be set to + * dcgmNvLinkStatus_version1 before calling this. + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_NOT_SUPPORTED if the given entityGroup does not support enumeration. + * - \ref DCGM_ST_BADPARAM if any parameter is invalid + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetNvLinkLinkStatus(dcgmHandle_t dcgmHandle, dcgmNvLinkStatus_v3 *linkStatus); + + +/** + * List supported CPUs and their cores present on the system + * + * This and other CPU APIs only support datacenter NVIDIA CPUs + * + * @param dcgmHandle IN: DCGM Handle + * @param cpuHierarchy OUT: Structure where the CPUs and their associated cores will be enumerated + * + * @return + * - \ref DCGM_ST_OK if the call was successful. 
+ * - \ref DCGM_ST_NOT_SUPPORTED if the device is unsupported + * - \ref DCGM_ST_MODULE_NOT_LOADED if the sysmon module could not be loaded + * - \ref DCGM_ST_BADPARAM if any parameter is invalid + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetCpuHierarchy(dcgmHandle_t dcgmHandle, dcgmCpuHierarchy_v1 *cpuHierarchy); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup DCGM_GROUPING Grouping + * The following APIs are used for group management. The user can create a group of entities and + * perform an operation on a group of entities. If grouping is not needed and the user wishes + * to run commands on all GPUs seen by DCGM then the user can use DCGM_GROUP_ALL_GPUS or + * DCGM_GROUP_ALL_NVSWITCHES in place of group IDs when needed. + * @{ + */ +/***************************************************************************************************/ + +/** + * Used to create a entity group handle which can store one or more entity Ids as an opaque handle + * returned in \a pDcgmGrpId. Instead of executing an operation separately for each entity, the + * DCGM group enables the user to execute same operation on all the entities present in the group as a + * single API call. + * + * To create the group with all the entities present on the system, the \a type field should be + * specified as \a DCGM_GROUP_DEFAULT or \a DCGM_GROUP_ALL_NVSWITCHES. To create an empty group, + * the \a type field should be specified as \a DCGM_GROUP_EMPTY. The empty group can be updated + * with the desired set of entities using the APIs \ref dcgmGroupAddDevice, \ref dcgmGroupAddEntity, + * \ref dcgmGroupRemoveDevice, and \ref dcgmGroupRemoveEntity. 
+ * + * @param pDcgmHandle IN: DCGM Handle + * @param type IN: Type of Entity Group to be formed + * @param groupName IN: Desired name of the GPU group specified as NULL terminated C string + * @param pDcgmGrpId OUT: Reference to group ID + * + * @return + * - \ref DCGM_ST_OK if the group has been created + * - \ref DCGM_ST_BADPARAM if any of \a type, \a groupName, \a length or \a pDcgmGrpId is invalid + * - \ref DCGM_ST_MAX_LIMIT if number of groups on the system has reached the max limit \a DCGM_MAX_NUM_GROUPS + * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGroupCreate(dcgmHandle_t pDcgmHandle, + dcgmGroupType_t type, + const char *groupName, + dcgmGpuGrp_t *pDcgmGrpId); + +/** + * Used to destroy a group represented by \a groupId. + * Since DCGM group is a logical grouping of entities, the properties applied on the group stay intact + * for the individual entities even after the group is destroyed. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID + * + * @return + * - \ref DCGM_ST_OK if the group has been destroyed + * - \ref DCGM_ST_BADPARAM if \a groupId is invalid + * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized + * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group does not exists + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGroupDestroy(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId); + +/** + * Used to add specified GPU Id to the group represented by \a groupId. 
+ * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group Id to which device should be added + * @param gpuId IN: DCGM GPU Id + * + * @return + * - \ref DCGM_ST_OK if the GPU Id has been successfully added to the group + * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized + * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists + * - \ref DCGM_ST_BADPARAM if \a gpuId is invalid or already part of the specified group + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGroupAddDevice(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, unsigned int gpuId); + +/** + * Used to add specified entity to the group represented by \a groupId. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group Id to which device should be added + * @param entityGroupId IN: Entity group that entityId belongs to + * @param entityId IN: DCGM entityId + * + * @return + * - \ref DCGM_ST_OK if the entity has been successfully added to the group + * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized + * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists + * - \ref DCGM_ST_BADPARAM if \a entityId is invalid or already part of the specified group + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGroupAddEntity(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgm_field_entity_group_t entityGroupId, + dcgm_field_eid_t entityId); + +/** + * Used to remove specified GPU Id from the group represented by \a groupId. 
+ * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID from which device should be removed + * @param gpuId IN: DCGM GPU Id + * + * @return + * - \ref DCGM_ST_OK if the GPU Id has been successfully removed from the group + * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized + * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists + * - \ref DCGM_ST_BADPARAM if \a gpuId is invalid or not part of the specified group + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGroupRemoveDevice(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, unsigned int gpuId); + +/** + * Used to remove specified entity from the group represented by \a groupId. + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID from which device should be removed + * @param entityGroupId IN: Entity group that entityId belongs to + * @param entityId IN: DCGM entityId + * + * @return + * - \ref DCGM_ST_OK if the entity has been successfully removed from the group + * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized + * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists + * - \ref DCGM_ST_BADPARAM if \a entityId is invalid or not part of the specified group + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGroupRemoveEntity(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgm_field_entity_group_t entityGroupId, + dcgm_field_eid_t entityId); + +/** + * Used to get information corresponding to the group represented by \a groupId. The information + * returned in \a pDcgmGroupInfo consists of group name, and the list of entities present in the + * group. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID for which information to be fetched + * @param pDcgmGroupInfo OUT: Group Information + * + * @return + * - \ref DCGM_ST_OK if the group info is successfully received. 
+ * - \ref DCGM_ST_BADPARAM if any of \a groupId or \a pDcgmGroupInfo is invalid. + * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. + * - \ref DCGM_ST_MAX_LIMIT if the group does not contain the GPU + * - \ref DCGM_ST_NOT_CONFIGURED if entry corresponding to the group (\a groupId) does not exists + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGroupGetInfo(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmGroupInfo_t *pDcgmGroupInfo); + +/** + * Used to get the Ids of all groups of entities. The information returned is a list of group ids + * in \a groupIdList as well as a count of how many ids there are in \a count. Please allocate enough + * memory for \a groupIdList. Memory of size MAX_NUM_GROUPS should be allocated for \a groupIdList. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupIdList OUT: List of Group Ids + * @param count OUT: The number of Group ids in the list + * + * @return + * - \ref DCGM_ST_OK if the ids of the groups were successfully retrieved + * - \ref DCGM_ST_BADPARAM if either of the \a groupIdList or \a count is null + * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGroupGetAllIds(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupIdList[], + unsigned int *count); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup DCGM_FIELD_GROUPING Field Grouping + * The following APIs are used for field group management. The user can create a group of fields and + * perform an operation on a group of fields at once. + * @{ + */ + +/** + * Used to create a group of fields and return the handle in dcgmFieldGroupId + * + * @param dcgmHandle IN: DCGM handle + * @param numFieldIds IN: Number of field IDs that are being provided in fieldIds[]. Must be between 1 and + * DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP. 
+ * @param fieldIds IN: Field IDs to be added to the newly-created field group + * @param fieldGroupName IN: Unique name for this group of fields. This must not be the same as any existing field + * groups. + * @param dcgmFieldGroupId OUT: Handle to the newly-created field group + * + * @return + * - \ref DCGM_ST_OK if the field group was successfully created. + * - \ref DCGM_ST_BADPARAM if any parameters were bad + * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. + * - \ref DCGM_ST_MAX_LIMIT if too many field groups already exist + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmFieldGroupCreate(dcgmHandle_t dcgmHandle, + int numFieldIds, + unsigned short *fieldIds, + const char *fieldGroupName, + dcgmFieldGrp_t *dcgmFieldGroupId); + +/** + * Used to remove a field group that was created with \ref dcgmFieldGroupCreate + * + * @param dcgmHandle IN: DCGM handle + * @param dcgmFieldGroupId IN: Field group to remove + * + * @return + * - \ref DCGM_ST_OK if the field group was successfully removed + * - \ref DCGM_ST_BADPARAM if any parameters were bad + * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmFieldGroupDestroy(dcgmHandle_t dcgmHandle, dcgmFieldGrp_t dcgmFieldGroupId); + + +/** + * Used to get information about a field group that was created with \ref dcgmFieldGroupCreate. + * + * @param dcgmHandle IN: DCGM handle + * @param fieldGroupInfo IN/OUT: Info about all of the field groups that exist.
+ * .version should be set to \ref dcgmFieldGroupInfo_version before this call
+ * .fieldGroupId should contain the fieldGroupId you are interested in querying + * information for. + * + * @return + * - \ref DCGM_ST_OK if the field group info was returned successfully + * - \ref DCGM_ST_BADPARAM if any parameters were bad + * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. + * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmFieldGroupGetInfo(dcgmHandle_t dcgmHandle, dcgmFieldGroupInfo_t *fieldGroupInfo); + +/** + * Used to get information about all field groups in the system. + * + * @param dcgmHandle IN: DCGM handle + * @param allGroupInfo IN/OUT: Info about all of the field groups that exist.
+ * .version should be set to \ref dcgmAllFieldGroup_version before this call. + * + * @return + * - \ref DCGM_ST_OK if the field group info was successfully returned + * - \ref DCGM_ST_BADPARAM if any parameters were bad + * - \ref DCGM_ST_INIT_ERROR if the library has not been successfully initialized. + * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmFieldGroupGetAll(dcgmHandle_t dcgmHandle, dcgmAllFieldGroup_t *allGroupInfo); + +/** @} */ + + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_ST Status handling + * The following APIs are used to manage statuses for multiple operations on one or more GPUs. + * @{ + */ +/***************************************************************************************************/ + +/** + * Creates reference to DCGM status handler which can be used to get the statuses for multiple + * operations on one or more devices. + * + * The multiple statuses are useful when the operations are performed at group level. The status + * handle provides a mechanism to access error attributes for the failed operations. + * + * The number of errors stored behind the opaque handle can be accessed using the the API + * \ref dcgmStatusGetCount. The errors are accessed from the opaque handle \a statusHandle + * using the API \ref dcgmStatusPopError. The user can invoke \ref dcgmStatusPopError + * for the number of errors or until all the errors are fetched. + * + * When the status handle is not required any further then it should be deleted using the API + * \ref dcgmStatusDestroy. 
+ * @param statusHandle OUT: Reference to handle for list of statuses
+ *
+ * @return
+ * - \ref DCGM_ST_OK if the status handle is successfully created
+ * - \ref DCGM_ST_BADPARAM if \a statusHandle is invalid
+ *
+ */
+dcgmReturn_t DCGM_PUBLIC_API dcgmStatusCreate(dcgmStatus_t *statusHandle);
+
+/**
+ * Used to destroy status handle created using \ref dcgmStatusCreate.
+ * @param statusHandle IN: Handle to list of statuses
+ *
+ * @return
+ * - \ref DCGM_ST_OK if the status handle is successfully destroyed
+ * - \ref DCGM_ST_BADPARAM if \a statusHandle is invalid
+ *
+ */
+dcgmReturn_t DCGM_PUBLIC_API dcgmStatusDestroy(dcgmStatus_t statusHandle);
+
+/**
+ * Used to get count of error entries stored inside the opaque handle \a statusHandle.
+ * @param statusHandle IN: Handle to list of statuses
+ * @param count OUT: Number of error entries present in the list of statuses
+ *
+ * @return
+ * - \ref DCGM_ST_OK if the error count is successfully received
+ * - \ref DCGM_ST_BADPARAM if any of \a statusHandle or \a count is invalid
+ *
+ */
+dcgmReturn_t DCGM_PUBLIC_API dcgmStatusGetCount(dcgmStatus_t statusHandle, unsigned int *count);
+
+/**
+ * Used to iterate through the list of errors maintained behind \a statusHandle. The method pops the
+ * first error from the list of DCGM statuses. In order to iterate through all the errors, the user
+ * can invoke this API for the number of errors or until all the errors are fetched.
+ * @param statusHandle IN: Handle to list of statuses + * @param pDcgmErrorInfo OUT: First error from the list of statuses + * + * @return + * - \ref DCGM_ST_OK if the error entry is successfully fetched + * - \ref DCGM_ST_BADPARAM if any of \a statusHandle or \a pDcgmErrorInfo is invalid + * - \ref DCGM_ST_NO_DATA if the status handle list is empty + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmStatusPopError(dcgmStatus_t statusHandle, dcgmErrorInfo_t *pDcgmErrorInfo); + +/** + * Used to clear all the errors in the status handle created by the API + * \ref dcgmStatusCreate. After one set of operation, the \a statusHandle + * can be cleared and reused for the next set of operation. + * @param statusHandle IN: Handle to list of statuses + * + * @return + * - \ref DCGM_ST_OK if the errors are successfully cleared + * - \ref DCGM_ST_BADPARAM if \a statusHandle is invalid + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmStatusClear(dcgmStatus_t statusHandle); + +/** @} */ // Closing for DCGMAPI_ST + + +/** @} */ // Closing for DCGMAPI_SYS + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_DC Configuration + * This chapter describes the methods that handle device configuration retrieval and + * default settings. The APIs in Configuration module can be broken down into following + * categories: + * @{ + */ +/***************************************************************************************************/ + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_DC_Setup Setup and management + * Describes APIs to Get/Set configuration on the group of GPUs. + * @{ + */ +/***************************************************************************************************/ + +/** +* Used to set configuration for the group of one or more GPUs identified by \a groupId. 
+* +* The configuration settings specified in \a pDeviceConfig are applied to all the GPUs in the +* group. Since DCGM group is a logical grouping of GPUs, the configuration settings stays intact +* for the individual GPUs even after the group is destroyed. +* +* If the user wishes to ignore the configuration of one or more properties in the input +* \a pDeviceConfig then the property should be specified as one of \a DCGM_INT32_BLANK, +* \a DCGM_INT64_BLANK, \a DCGM_FP64_BLANK or \a DCGM_STR_BLANK based on the data type of the +* property to be ignored. +* +* If any of the properties fail to be configured for any of the GPUs in the group then the API +* returns an error. The status handle \a statusHandle should be further evaluated to access error +* attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST +* to access the error attributes. +* +* To find out valid supported clock values that can be passed to dcgmConfigSet, look at the device +* attributes of a GPU in the group using the API dcgmGetDeviceAttributes. + +* @param pDcgmHandle IN: DCGM Handle +* @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate +* for details on creating the group. +* @param pDeviceConfig IN: Pointer to memory to hold desired configuration to be applied for all the GPU in the +* group represented by \a groupId. +* The caller must populate the version field of \a pDeviceConfig. +* @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as NULL if the detailed +* error information is not needed. +* Look at \ref dcgmStatusCreate for details on creating status handle. + +* @return +* - \ref DCGM_ST_OK if the configuration has been successfully set. +* - \ref DCGM_ST_BADPARAM if any of \a groupId or \a pDeviceConfig is invalid. +* - \ref DCGM_ST_VER_MISMATCH if \a pDeviceConfig has the incorrect version. +* - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. 
+* +*/ +dcgmReturn_t DCGM_PUBLIC_API dcgmConfigSet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmConfig_t *pDeviceConfig, + dcgmStatus_t statusHandle); + +/** +* Used to get configuration for all the GPUs present in the group. +* +* This API can get the most recent target or desired configuration set by \ref dcgmConfigSet. +* Set type as \a DCGM_CONFIG_TARGET_STATE to get target configuration. The target configuration +* properties are maintained by DCGM and are automatically enforced after a GPU reset or +* reinitialization is completed. +* +* The method can also be used to get the actual configuration state for the GPUs in the group. +* Set type as \a DCGM_CONFIG_CURRENT_STATE to get the actually configuration state. Ideally, the +* actual configuration state will be exact same as the target configuration state. +* +* If any of the property in the target configuration is unknown then the property value in the +* output is populated as one of DCGM_INT32_BLANK, DCGM_INT64_BLANK, DCGM_FP64_BLANK or +* DCGM_STR_BLANK based on the data type of the property. +* +* If any of the property in the current configuration state is not supported then the property +* value in the output is populated as one of DCGM_INT32_NOT_SUPPORTED, DCGM_INT64_NOT_SUPPORTED, +* DCGM_FP64_NOT_SUPPORTED or DCGM_STR_NOT_SUPPORTED based on the data type of the property. +* +* If any of the properties can't be fetched for any of the GPUs in the group then the API returns +* an error. The status handle \a statusHandle should be further evaluated to access error +* attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST +* to access the error attributes. +* +* @param pDcgmHandle IN: DCGM Handle +* @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate +* for details on creating the group. +* @param type IN: Type of configuration values to be fetched. 
+* @param count IN: The number of entries that \a deviceConfigList array can store. +* @param deviceConfigList OUT: Pointer to memory to hold requested configuration corresponding to all the GPUs in +* the group (\a groupId). The size of the memory must be greater than or equal to hold +* output information for the number of GPUs present in the group (\a groupId). +* @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as NULL if the detailed +* error information is not needed. +* Look at \ref dcgmStatusCreate for details on creating status handle. + +* @return +* - \ref DCGM_ST_OK if the configuration has been successfully fetched. +* - \ref DCGM_ST_BADPARAM if any of \a groupId, \a type, \a count, or \a deviceConfigList is invalid. +* - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. +* - \ref DCGM_ST_VER_MISMATCH if \a deviceConfigList has the incorrect version. +* - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. +* +*/ +dcgmReturn_t DCGM_PUBLIC_API dcgmConfigGet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmConfigType_t type, + int count, + dcgmConfig_t deviceConfigList[], + dcgmStatus_t statusHandle); + +/** @} */ // Closing for DCGMAPI_DC_Setup + + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_DC_MI Manual Invocation + * Describes APIs used to manually enforce the desired configuration on a group of GPUs. + * @{ + */ +/***************************************************************************************************/ + +/** + * Used to enforce previously set configuration for all the GPUs present in the group. + * + * This API provides a mechanism to the users to manually enforce the configuration at any point of + * time. The configuration can only be enforced if it's already configured using the API \ref + * dcgmConfigSet. 
+ * + * If any of the properties can't be enforced for any of the GPUs in the group then the API returns + * an error. The status handle \a statusHandle should be further evaluated to access error + * attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST + * to access the error attributes. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as NULL if the detailed + * error information is not needed. Look at \ref dcgmStatusCreate for details on + * creating status handle. + * + * @return + * - \ref DCGM_ST_OK if the configuration has been successfully enforced. + * - \ref DCGM_ST_BADPARAM if \a groupId is invalid. + * - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. + * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmConfigEnforce(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmStatus_t statusHandle); + +/** @} */ // Closing for DCGMAPI_DC_MI + +/** @} */ // Closing for DCGMAPI_DC + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_FI Field APIs + * + * These APIs are responsible for watching, unwatching, and updating specific fields as defined + * by DCGM_FI_* + * + * @{ + */ +/***************************************************************************************************/ + +/** + * Request that DCGM start recording updates for a given field collection. + * + * Note that the first update of the field will not occur until the next field update cycle. + * To force a field update cycle, call dcgmUpdateAllFields(1). 
+ *
+ * @param pDcgmHandle IN: DCGM Handle
+ * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate
+ * for details on creating the group. Alternatively, pass in the group id as
+ * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or
+ * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches.
+ * @param fieldGroupId IN: Fields to watch.
+ * @param updateFreq IN: How often to update this field in usec
+ * @param maxKeepAge IN: How long to keep data for this field in seconds
+ * @param maxKeepSamples IN: Maximum number of samples to keep. 0=no limit
+ *
+ * @return
+ * - \ref DCGM_ST_OK if the call was successful
+ * - \ref DCGM_ST_BADPARAM if a parameter is invalid
+ *
+ */
+
+dcgmReturn_t DCGM_PUBLIC_API dcgmWatchFields(dcgmHandle_t pDcgmHandle,
+ dcgmGpuGrp_t groupId,
+ dcgmFieldGrp_t fieldGroupId,
+ long long updateFreq,
+ double maxKeepAge,
+ int maxKeepSamples);
+
+/**
+ * Request that DCGM stop recording updates for a given field collection.
+ *
+ * @param pDcgmHandle IN: DCGM Handle
+ * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate
+ * for details on creating the group. Alternatively, pass in the group id as
+ * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or
+ * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches.
+ * @param fieldGroupId IN: Fields to unwatch.
+ *
+ * @return
+ * - \ref DCGM_ST_OK if the call was successful
+ * - \ref DCGM_ST_BADPARAM if a parameter is invalid
+ *
+ */
+dcgmReturn_t DCGM_PUBLIC_API dcgmUnwatchFields(dcgmHandle_t pDcgmHandle,
+ dcgmGpuGrp_t groupId,
+ dcgmFieldGrp_t fieldGroupId);
+
+/**
+ * Request updates for all field values that have updated since a given timestamp
+ *
+ * This version only works with GPU entities. Use \ref dcgmGetValuesSince_v2 for entity groups
+ * containing NvSwitches.
+ * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param fieldGroupId IN: Fields to return data for + * @param sinceTimestamp IN: Timestamp to request values since in usec since 1970. This will be returned in + * nextSinceTimestamp for subsequent calls 0 = request all data + * @param nextSinceTimestamp OUT: Timestamp to use for sinceTimestamp on next call to this function + * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be + * returned in each invocation + * @param userData IN: User data pointer to pass to the userData field of enumCB. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_NOT_SUPPORTED if one of the entities was from a non-GPU type + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetValuesSince(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmFieldGrp_t fieldGroupId, + long long sinceTimestamp, + long long *nextSinceTimestamp, + dcgmFieldValueEnumeration_f enumCB, + void *userData); + +/** + * Request updates for all field values that have updated since a given timestamp + * + * This version works with non-GPU entities like NvSwitches + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or + * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. + * @param fieldGroupId IN: Fields to return data for + * @param sinceTimestamp IN: Timestamp to request values since in usec since 1970. 
This will be returned in + * nextSinceTimestamp for subsequent calls 0 = request all data + * @param nextSinceTimestamp OUT: Timestamp to use for sinceTimestamp on next call to this function + * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be + * returned in each invocation + * @param userData IN: User data pointer to pass to the userData field of enumCB. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetValuesSince_v2(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmFieldGrp_t fieldGroupId, + long long sinceTimestamp, + long long *nextSinceTimestamp, + dcgmFieldValueEntityEnumeration_f enumCB, + void *userData); + +/** + * Request latest cached field value for a field value collection + * + * This version only works with GPU entities. Use \ref dcgmGetLatestValues_v2 for entity groups + * containing NvSwitches. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param fieldGroupId IN: Fields to return data for. + * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be + * returned in each invocation + * @param userData IN: User data pointer to pass to the userData field of enumCB. 
+ * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_NOT_SUPPORTED if one of the entities was from a non-GPU type + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetLatestValues(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmFieldGrp_t fieldGroupId, + dcgmFieldValueEnumeration_f enumCB, + void *userData); + +/** + * Request latest cached field value for a field value collection + * + * This version works with non-GPU entities like NvSwitches + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or + * \a DCGM_GROUP_ALL_NVSWITCHES to perform the operation on all NvSwitches. + * @param fieldGroupId IN: Fields to return data for. + * @param enumCB IN: Callback to invoke for every field value update. Note that multiple updates can be + * returned in each invocation + * @param userData IN: User data pointer to pass to the userData field of enumCB. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_NOT_SUPPORTED if one of the entities was from a non-GPU type + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetLatestValues_v2(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmFieldGrp_t fieldGroupId, + dcgmFieldValueEntityEnumeration_f enumCB, + void *userData); + +/** + * Request latest cached field value for a GPU + * + * @param pDcgmHandle IN: DCGM Handle + * @param gpuId IN: Gpu ID representing the GPU for which the fields are being requested. + * @param fields IN: Field IDs to return data for. See the definitions in dcgm_fields.h that start with DCGM_FI_. + * @param count IN: Number of field IDs in fields[] array. 
+ * @param values             OUT: Latest field values for the fields in fields[]. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetLatestValuesForFields(dcgmHandle_t pDcgmHandle, +                                                          int gpuId, +                                                          unsigned short fields[], +                                                          unsigned int count, +                                                          dcgmFieldValue_v1 values[]); +/** + * Request latest cached field value for a group of fields for a specific entity + * + * @param pDcgmHandle        IN: DCGM Handle + * @param entityGroup        IN: entity_group_t (e.g. switch) + * @param entityId           IN: entity ID representing the entity for which the fields are being requested. + * @param fields             IN: Field IDs to return data for. See the definitions in dcgm_fields.h that start with DCGM_FI_. + * @param count              IN: Number of field IDs in fields[] array. + * @param values             OUT: Latest field values for the fields in fields[]. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmEntityGetLatestValues(dcgmHandle_t pDcgmHandle, +                                                       dcgm_field_entity_group_t entityGroup, +                                                       int entityId, +                                                       unsigned short fields[], +                                                       unsigned int count, +                                                       dcgmFieldValue_v1 values[]); + +/** + * Request the latest cached or live field value for a list of fields for a group of entities + * + * Note: The returned entities are not guaranteed to be in any order. Reordering can occur internally + *       in order to optimize calls to the NVIDIA driver. + * + * @param pDcgmHandle IN: DCGM Handle + * @param entities    IN: List of entities to get values for + * @param entityCount IN: Number of entries in entities[] + * @param fields      IN: Field IDs to return data for. See the definitions in dcgm_fields.h that start with DCGM_FI_. + * @param fieldCount  IN: Number of field IDs in fields[] array. + * @param flags       IN: Optional flags that affect how this request is processed. Pass \ref DCGM_FV_FLAG_LIVE_DATA + *                        here to retrieve a live driver value rather than a cached value. See that flag's + *                        documentation for caveats. + * @param values      OUT: Latest field values for the fields requested. This must be able to hold entityCount * + *                         fieldCount field value records. 
+ * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmEntitiesGetLatestValues(dcgmHandle_t pDcgmHandle, +                                                         dcgmGroupEntityPair_t entities[], +                                                         unsigned int entityCount, +                                                         unsigned short fields[], +                                                         unsigned int fieldCount, +                                                         unsigned int flags, +                                                         dcgmFieldValue_v2 values[]); + +/*************************************************************************/ +/** + * Get a summary of the values for a field id over a period of time. + * + * @param pDcgmHandle   IN: DCGM Handle + * @param request       IN/OUT: a pointer to the struct detailing the request and containing the response + * + * @return + *        - \ref DCGM_ST_OK if the call was successful + *        - \ref DCGM_ST_FIELD_UNSUPPORTED_BY_API if the field is not int64 or double type + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetFieldSummary(dcgmHandle_t pDcgmHandle, dcgmFieldSummaryRequest_t *request); + +/** @} */ + +/***************************************************************************************************/ +/** @addtogroup DCGMAPI_Admin_ExecCtrl + *  @{ + */ +/***************************************************************************************************/ + +/** + * This method is used to tell the DCGM module to update all the fields being watched. + * + * Note: If the operation mode was set to manual mode (DCGM_OPERATION_MODE_MANUAL) during + * initialization (\ref dcgmInit), this method must be called periodically to allow field value watches + * the opportunity to gather samples. + * + * @param pDcgmHandle   IN: DCGM Handle + * @param waitForUpdate IN: Whether or not to wait for the update loop to complete before returning to the + *                          caller 1=wait. 0=do not wait. 
+ * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if \a waitForUpdate is invalid + * - \ref DCGM_ST_GENERIC_ERROR if an unspecified DCGM error occurs + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmUpdateAllFields(dcgmHandle_t pDcgmHandle, int waitForUpdate); + +/** @} */ // Closing for DCGMAPI_Admin_ExecCtrl + + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_PROCESS_STATS Process Statistics + * Describes APIs to investigate statistics such as accounting, performance and errors during the + * lifetime of a GPU process + * @{ + */ +/***************************************************************************************************/ + +/** + * Request that DCGM start recording stats for fields that can be queried with dcgmGetPidInfo(). + * + * Note that the first update of the field will not occur until the next field update cycle. + * To force a field update cycle, call dcgmUpdateAllFields(1). + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param updateFreq IN: How often to update this field in usec + * @param maxKeepAge IN: How long to keep data for this field in seconds + * @param maxKeepSamples IN: Maximum number of samples to keep. 0=no limit + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * - \ref DCGM_ST_REQUIRES_ROOT if the host engine is being run as non-root, and accounting mode could not + * be enabled (requires root). Run "nvidia-smi -am 1" as root on the node + * before starting DCGM to fix this. 
+ * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmWatchPidFields(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + long long updateFreq, + double maxKeepAge, + int maxKeepSamples); + +/** + * + * Get information about all GPUs while the provided pid was running + * + * In order for this request to work, you must first call dcgmWatchPidFields() to + * make sure that DCGM is watching the appropriate field IDs that will be + * populated in pidInfo + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param pidInfo IN/OUT: Structure to return information about pid in. pidInfo->pid must be set to the pid in question. + * pidInfo->version should be set to dcgmPidInfo_version. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_NO_DATA if the PID did not run on any GPU + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetPidInfo(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmPidInfo_t *pidInfo); + +/** @} */ // Closing for DCGMAPI_PROCESS_STATS + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_JOB_STATS Job Statistics + * The client can invoke DCGM APIs to start and stop collecting the stats at the process boundaries + * (during prologue and epilogue). This will enable DCGM to monitor all the PIDs while the job is + * in progress, and provide a summary of active processes and resource usage during the window of + * interest. 
+ * @{ + */ +/***************************************************************************************************/ + +/** + * Request that DCGM start recording stats for fields that are queried with dcgmJobGetStats() + * + * Note that the first update of the field will not occur until the next field update cycle. + * To force a field update cycle, call dcgmUpdateAllFields(1). + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param updateFreq IN: How often to update this field in usec + * @param maxKeepAge IN: How long to keep data for this field in seconds + * @param maxKeepSamples IN: Maximum number of samples to keep. 0=no limit + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * - \ref DCGM_ST_REQUIRES_ROOT if the host engine is being run as non-root, and + * accounting mode could not be enabled (requires root). + * Run "nvidia-smi -am 1" as root on the node before starting + * DCGM to fix this. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmWatchJobFields(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + long long updateFreq, + double maxKeepAge, + int maxKeepSamples); + +/** + * This API is used by the client to notify DCGM about the job to be started. Should be invoked as + * part of job prologue + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. 
+ * @param jobId IN: User provided string to represent the job + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * - \ref DCGM_ST_DUPLICATE_KEY if the specified \a jobId is already in use + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmJobStartStats(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, char jobId[64]); + +/** + * This API is used by the clients to notify DCGM to stop collecting stats for the job represented + * by job id. Should be invoked as part of job epilogue. + * The job Id remains available to view the stats at any point but cannot be used to start a new job. + * You must call dcgmWatchJobFields() before this call to enable watching of job + * + * @param pDcgmHandle IN: DCGM Handle + * @param jobId IN: User provided string to represent the job + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * - \ref DCGM_ST_NO_DATA if \a jobId is not a valid job identifier. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmJobStopStats(dcgmHandle_t pDcgmHandle, char jobId[64]); + +/** + * Get stats for the job identified by DCGM generated job id. The stats can be retrieved at any + * point when the job is in process. + * If you want to reuse this jobId, call \ref dcgmJobRemove after this call. + * + * @param pDcgmHandle IN: DCGM Handle + * @param jobId IN: User provided string to represent the job + * @param pJobInfo IN/OUT: Structure to return information about the job.
.version should be set to + *                        \ref dcgmJobInfo_version before this call. + * + * @return + *        - \ref DCGM_ST_OK if the call was successful + *        - \ref DCGM_ST_BADPARAM if a parameter is invalid + *        - \ref DCGM_ST_NO_DATA if \a jobId is not a valid job identifier. + *        - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmJobGetStats(dcgmHandle_t pDcgmHandle, char jobId[64], dcgmJobInfo_t *pJobInfo); + +/** + * This API tells DCGM to stop tracking the job given by jobId. After this call, you will no longer + * be able to call dcgmJobGetStats() on this jobId. However, you will be able to reuse jobId after + * this call. + * + * @param pDcgmHandle   IN: DCGM Handle + * @param jobId         IN: User provided string to represent the job + * + * @return + *        - \ref DCGM_ST_OK if the call was successful + *        - \ref DCGM_ST_BADPARAM if a parameter is invalid + *        - \ref DCGM_ST_NO_DATA if \a jobId is not a valid job identifier. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmJobRemove(dcgmHandle_t pDcgmHandle, char jobId[64]); + +/** + * This API tells DCGM to stop tracking all jobs. After this call, you will no longer + * be able to call dcgmJobGetStats() on any jobs until you call dcgmJobStartStats again. + * You will be able to reuse any previously-used jobIds after this call. + * + * @param pDcgmHandle   IN: DCGM Handle + * + * @return + *        - \ref DCGM_ST_OK if the call was successful + *        - \ref DCGM_ST_BADPARAM if a parameter is invalid + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmJobRemoveAll(dcgmHandle_t pDcgmHandle); + +/** @} */ // Closing for DCGMAPI_JOB_STATS + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_HM Health Monitor + * + * This chapter describes the methods that handle the GPU health monitor. 
+ * + * @{ + */ +/***************************************************************************************************/ + +/** + * Enable the DCGM health check system for the given systems defined in \ref dcgmHealthSystems_t + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate + * for details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or + * \a DCGM_GROUP_ALL_NVSWITCHES to perform operation on all the NvSwitches. + * @param systems IN: An enum representing systems that should be enabled for health checks logically OR'd + * together. Refer to \ref dcgmHealthSystems_t for details. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmHealthSet(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmHealthSystems_t systems); + +/** + * Enable the DCGM health check system for the given systems defined in \ref dcgmHealthSystems_t + * + * Since DCGM 2.0 + * + * @param pDcgmHandle IN: DCGM Handle + * @param healthSet IN: Parameters to use when setting health watches. See + * \ref dcgmHealthSetParams_v2 for the description of each parameter. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + */ + +dcgmReturn_t DCGM_PUBLIC_API dcgmHealthSet_v2(dcgmHandle_t pDcgmHandle, dcgmHealthSetParams_v2 *params); + +/** + * Retrieve the current state of the DCGM health check system + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more entities. Look at \ref dcgmGroupCreate + * for details on creating the group. 
Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs or + * \a DCGM_GROUP_ALL_NVSWITCHES to perform operation on all the NvSwitches. + * @param systems OUT: An integer representing the enabled systems for the given group Refer to + * \ref dcgmHealthSystems_t for details. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmHealthGet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmHealthSystems_t *systems); + + +/** + * Check the configured watches for any errors/failures/warnings that have occurred + * since the last time this check was invoked. On the first call, stateful information + * about all of the enabled watches within a group is created but no error results are + * provided. On subsequent calls, any error information will be returned. + * + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing a collection of one or more entities. + * Refer to \ref dcgmGroupCreate for details on creating a group + * @param results OUT: A reference to the dcgmHealthResponse_t structure to populate. + * results->version must be set to dcgmHealthResponse_version. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * - \ref DCGM_ST_VER_MISMATCH if results->version is not dcgmHealthResponse_version + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmHealthCheck(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmHealthResponse_t *results); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_PO Policies + * + * This chapter describes the methods that handle system policy management and violation settings. 
+ * The APIs in Policies module can be broken down into following categories: + * + * @{ + */ +/***************************************************************************************************/ + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_PO_Setup Setup and Management + * Describes APIs for setting up policies and registering callbacks to receive notification in + * case specific policy condition has been violated. + * @{ + */ +/***************************************************************************************************/ + +/** + * Set the current violation policy inside the policy manager. Given the conditions within the + * \ref dcgmPolicy_t structure, if a violation has occurred, subsequent action(s) may be performed to + * either report or contain the failure. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param policy IN: A reference to \ref dcgmPolicy_t that will be applied to all GPUs in the group. + * @param statusHandle IN/OUT: Resulting status for the operation. Pass it as NULL if the detailed error information + * is not needed. Refer to \ref dcgmStatusCreate for details on creating a status handle. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if \a groupId or \a policy is invalid + * - \ref DCGM_ST_NOT_SUPPORTED if any unsupported GPUs are part of the GPU group specified in groupId + * - DCGM_ST_* a different error has occurred and is stored in \a statusHandle. 
+ * Refer to \ref dcgmReturn_t + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmPolicySet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmPolicy_t *policy, + dcgmStatus_t statusHandle); + +/** + * Get the current violation policy inside the policy manager. Given a groupId, a number of + * policy structures are retrieved. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param count IN: The size of the policy array. This is the maximum number of policies that will be + * retrieved and ultimately should correspond to the number of GPUs specified in the + * group. + * @param policy OUT: A reference to \ref dcgmPolicy_t that will used as storage for the current policies + * applied to each GPU in the group. + * @param statusHandle IN/OUT: Resulting status for the operation. Pass it as NULL if the detailed error information + * for the operation is not needed. Refer to \ref dcgmStatusCreate for details on + * creating a status handle. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if \a groupId or \a policy is invalid + * - DCGM_ST_* a different error has occurred and is stored in \a statusHandle. + * Refer to \ref dcgmReturn_t + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyGet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + int count, + dcgmPolicy_t *policy, + dcgmStatus_t statusHandle); + +/** + * Register a function to be called when a specific policy condition (see \ref dcgmPolicyCondition_t) has been + * violated. This callback(s) will be called automatically when in DCGM_OPERATION_MODE_AUTO mode and only after + * dcgmPolicyTrigger when in DCGM_OPERATION_MODE_MANUAL mode. All callbacks are made within a separate thread. 
+ * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param condition IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) for + * which to register a callback function + * @param beginCallback IN: A reference to a function that should be called should a violation occur. + * This function will be called prior to any actions specified by the policy are taken. + * @param finishCallback IN: A reference to a function that should be called should a violation occur. + * This function will be called after any action specified by the policy are completed. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid, \a beginCallback, or + * \a finishCallback is NULL + * - \ref DCGM_ST_NOT_SUPPORTED if any unsupported GPUs are part of the GPU group specified in groupId + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyRegister(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmPolicyCondition_t condition, + fpRecvUpdates beginCallback, + fpRecvUpdates finishCallback); + +/** + * Unregister a function to be called for a specific policy condition (see \ref dcgmPolicyCondition_t). + * This function will unregister all callbacks for a given condition and handle. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. 
+ * @param condition IN: The set of conditions specified as an OR'd list (see \ref dcgmPolicyCondition_t) for + * which to unregister a callback function + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if \a groupId, \a condition, is invalid or \a callback is NULL + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyUnregister(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmPolicyCondition_t condition); + +/** @} */ // Closing for DCGMAPI_PO_Setup + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_PO_MI Manual Invocation + * Describes APIs which can be used to perform direct actions (e.g. Perform GPU Reset, Run Health + * Diagnostics) on a group of GPUs. + * @{ + */ +/***************************************************************************************************/ + +/** + * Inform the action manager to perform a manual validation of a group of GPUs on the system + * + * *************************************** DEPRECATED *************************************** + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate for + * details on creating the group. Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param validate IN: The validation to perform after the action. + * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_NOT_SUPPORTED if running the specified \a validate is not supported. This is usually due + * to the Tesla recommended driver not being installed on the system. 
+ * - \ref DCGM_ST_BADPARAM if \a groupId, \a validate, or \a statusHandle is invalid + * - \ref DCGM_ST_GENERIC_ERROR an internal error has occurred + * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is currently + * not allowed. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmPolicyValidation_t validate, + dcgmDiagResponse_t *response); + +/** + * Inform the action manager to perform a manual validation of a group of GPUs on the system + * + * @param pDcgmHandle IN: DCGM Handle + * @param drd IN: Contains the group id, test names, test parameters, struct version, and the validation + * that should be performed. Look at \ref dcgmGroupCreate for details on creating the + * group. Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS to perform + * operation on all the GPUs. + * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_NOT_SUPPORTED if running the specified \a validate is not supported. This is usually + * due to the Tesla recommended driver not being installed on the system. + * - \ref DCGM_ST_BADPARAM if \a groupId, \a validate, or \a statusHandle is invalid + * - \ref DCGM_ST_GENERIC_ERROR an internal error has occurred + * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is + * currently not allowed. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle, + dcgmRunDiag_v7 *drd, + dcgmDiagResponse_t *response); + +/** + * Run a diagnostic on a group of GPUs + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate + * for details on creating the group. 
Alternatively, pass in the group id as + * \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. + * @param diagLevel IN: Diagnostic level to run + * @param diagResponse IN/OUT: Result of running the DCGM diagnostic.
+ * .version should be set to \ref dcgmDiagResponse_version before this call. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_NOT_SUPPORTED if running the diagnostic is not supported. This is usually due to the + * Tesla recommended driver not being installed on the system. + * - \ref DCGM_ST_BADPARAM if a provided parameter is invalid or missing + * - \ref DCGM_ST_GENERIC_ERROR an internal error has occurred + * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is + * currently not allowed. + * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmRunDiagnostic(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmDiagnosticLevel_t diagLevel, + dcgmDiagResponse_t *diagResponse); + +/** @} */ // Closing for DCGMAPI_PO_MI + +/** @} */ // Closing for DCGMAPI_PO + +/***************************************************************************************************/ +/** @addtogroup DCGMAPI_Admin_ExecCtrl + * @{ + */ +/***************************************************************************************************/ + +/** + * Inform the policy manager loop to perform an iteration and trigger the callbacks of any + * registered functions. Callback functions will be called from a separate thread as the calling function. + * + * Note: The GPU monitoring and management agent must call this method periodically if the operation + * mode is set to manual mode (DCGM_OPERATION_MODE_MANUAL) during initialization + * (\ref dcgmInit). + * + * @param pDcgmHandle IN: DCGM Handle + * + * @return + * - \ref DCGM_ST_OK If the call was successful + * - DCGM_ST_GENERIC_ERROR The policy manager was unable to perform another iteration. 
+ */ +dcgmReturn_t DCGM_PUBLIC_API dcgmPolicyTrigger(dcgmHandle_t pDcgmHandle); + +/** @} */ // Closing for DCGMAPI_Admin_ExecCtrl + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_Topo Topology + * @{ + */ +/***************************************************************************************************/ + +/** + * Gets device topology corresponding to the \a gpuId. + * + * @param pDcgmHandle IN: DCGM Handle + * @param gpuId IN: GPU Id corresponding to which topology information should be fetched + * @param pDcgmDeviceTopology IN/OUT: Topology information corresponding to \a gpuId. pDcgmDeviceTopology->version must + * be set to dcgmDeviceTopology_version before this call. + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_BADPARAM if \a gpuId or \a pDcgmDeviceTopology were not valid. + * - \ref DCGM_ST_VER_MISMATCH if pDcgmDeviceTopology->version was not set to dcgmDeviceTopology_version. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetDeviceTopology(dcgmHandle_t pDcgmHandle, + unsigned int gpuId, + dcgmDeviceTopology_t *pDcgmDeviceTopology); + +/** + * Gets group topology corresponding to the \a groupId. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: GroupId corresponding to which topology information should be fetched + * @param pDcgmGroupTopology IN/OUT: Topology information corresponding to \a groupId. pDcgmgroupTopology->version must + * be set to dcgmGroupTopology_version. + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_BADPARAM if \a groupId or \a pDcgmGroupTopology were not valid. + * - \ref DCGM_ST_VER_MISMATCH if pDcgmgroupTopology->version was not set to dcgmGroupTopology_version. 
+ * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetGroupTopology(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmGroupTopology_t *pDcgmGroupTopology); + +/** @} */ // Closing for DCGMAPI_Topo + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_METADATA Metadata + * @{ + * This chapter describes the methods that query for DCGM metadata. + */ +/***************************************************************************************************/ + +/*************************************************************************/ +/** + * Retrieve the total amount of memory that the hostengine process is currently using. + * This measurement represents both the resident set size (what is currently in RAM) and + * the swapped memory that belongs to the process. + * + * @param pDcgmHandle IN: DCGM Handle + * @param memoryInfo IN/OUT: see \ref dcgmIntrospectMemory_t. memoryInfo->version must be set to + * dcgmIntrospectMemory_version prior to this call. + * @param waitIfNoData IN: if no metadata is gathered wait till this occurs (!0) or return DCGM_ST_NO_DATA (0) + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED + * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet + * - \ref DCGM_ST_VER_MISMATCH if memoryInfo->version is 0 or invalid. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmIntrospectGetHostengineMemoryUsage(dcgmHandle_t pDcgmHandle, + dcgmIntrospectMemory_t *memoryInfo, + int waitIfNoData); + +/*************************************************************************/ +/** + * Retrieve the CPU utilization of the DCGM hostengine process. + * + * @param pDcgmHandle IN: DCGM Handle + * @param cpuUtil IN/OUT: see \ref dcgmIntrospectCpuUtil_t. cpuUtil->version must be set to + * dcgmIntrospectCpuUtil_version prior to this call. 
+ * @param waitIfNoData IN: if no metadata is gathered wait till this occurs (!0) or return DCGM_ST_NO_DATA (0) + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_NOT_CONFIGURED if metadata gathering state is \a DCGM_INTROSPECT_STATE_DISABLED + * - \ref DCGM_ST_NO_DATA if \a waitIfNoData is false and metadata has not been gathered yet + * - \ref DCGM_ST_VER_MISMATCH if cpuUtil->version or execTime->version is 0 or invalid. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmIntrospectGetHostengineCpuUtilization(dcgmHandle_t pDcgmHandle, + dcgmIntrospectCpuUtil_t *cpuUtil, + int waitIfNoData); + +/** @} */ // Closing for DCGMAPI_METADATA + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_TOPOLOGY Topology + * @{ + * This chapter describes the methods that query for DCGM topology information. + */ +/***************************************************************************************************/ + +/*************************************************************************/ +/** + * Get the best group of gpus from the specified bitmask according to topological proximity: cpuAffinity, NUMA + * node, and NVLink. + * + * @param pDcgmHandle IN: DCGM Handle + * @param inputGpuIds IN: a bitmask of which GPUs DCGM should consider. If some of the GPUs on the system are + * already in use, they shouldn't be included in the bitmask. 0 means that all of the GPUs + * in the system should be considered. + * @param numGpus IN: the number of GPUs that are desired from inputGpuIds. If this number is greater than + * the number of healthy GPUs in inputGpuIds, then less than numGpus gpus will be + * specified in outputGpuIds. + * @param outputGpuIds OUT: a bitmask of numGpus or fewer GPUs from inputGpuIds that represent the best placement + * available from inputGpuIds. 
+ * @param hintFlags IN: a bitmask of DCGM_TOPO_HINT_F_ #defines of hints that should be taken into account when + * assigning outputGpuIds. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmSelectGpusByTopology(dcgmHandle_t pDcgmHandle, + uint64_t inputGpuIds, + uint32_t numGpus, + uint64_t *outputGpuIds, + uint64_t hintFlags); + +/** @} */ // Closing for DCGMAPI_TOPOLOGY + +/***************************************************************************************************/ +/** @defgroup DCGMAPI_MODULES Modules + * @{ + * This chapter describes the methods that query and configure DCGM modules. + */ +/***************************************************************************************************/ + +/*************************************************************************/ +/** + * Add a module to the denylist. This module will be prevented from being loaded + * if it hasn't been loaded already. Modules are lazy-loaded as they are used by + * DCGM APIs, so it's important to call this API soon after the host engine has been started. + * You can also pass --denylist-modules to the nv-hostengine binary to make sure modules + * get add to the denylist immediately after the host engine starts up. + * + * @param pDcgmHandle IN: DCGM Handle + * @param moduleId IN: ID of the module to denylist. Use \ref dcgmModuleGetStatuses to get a list of valid + * module IDs. + * + * @return + * - \ref DCGM_ST_OK if the module has been add to the denylist. + * - \ref DCGM_ST_IN_USE if the module has already been loaded and cannot add to the denylist. + * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmModuleDenylist(dcgmHandle_t pDcgmHandle, dcgmModuleId_t moduleId); + +/*************************************************************************/ +/** + * Get the status of all of the DCGM modules. 
+ * + * @param pDcgmHandle IN: DCGM Handle + * @param moduleStatuses OUT: Module statuses.
+ * .version should be set to dcgmModuleStatuses_version upon calling. + * + * @return + * - \ref DCGM_ST_OK if the request succeeds. + * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmModuleGetStatuses(dcgmHandle_t pDcgmHandle, dcgmModuleGetStatuses_t *moduleStatuses); + +/** @} */ // Closing for DCGMAPI_MODULES + +/*************************************************************************/ +/** @defgroup DCGMAPI_PROFILING Profiling + * @{ + * This chapter describes the methods that watch profiling fields from within DCGM. + */ +/*************************************************************************/ + +/*************************************************************************/ +/** + * Get all of the profiling metric groups for a given GPU group. + * + * Profiling metrics are watched in groups of fields that are all watched together. For instance, if you want + * to watch DCGM_FI_PROF_GR_ENGINE_ACTIVITY, this might also be in the same group as DCGM_FI_PROF_SM_EFFICIENCY. + * Watching this group would result in DCGM storing values for both of these metrics. + * + * Some groups cannot be watched concurrently as others as they utilize the same hardware resource. For instance, + * you may not be able to watch DCGM_FI_PROF_TENSOR_OP_UTIL at the same time as DCGM_FI_PROF_GR_ENGINE_ACTIVITY + * on your hardware. At the same time, you may be able to watch DCGM_FI_PROF_TENSOR_OP_UTIL at the same time as + * DCGM_FI_PROF_NVLINK_TX_DATA. + * + * Metrics that can be watched concurrently will have different .majorId fields in their dcgmProfMetricGroupInfo_t + * + * See \ref dcgmGroupCreate for details on creating a GPU group + * See \ref dcgmProfWatchFields to actually watch a metric group + * + * @param pDcgmHandle IN: DCGM Handle + * @param metricGroups IN/OUT: Metric groups supported for metricGroups->groupId.
+ * metricGroups->version should be set to dcgmProfGetMetricGroups_version upon calling. + * + * @return + * - \ref DCGM_ST_OK if the request succeeds. + * - \ref DCGM_ST_BADPARAM if a parameter is missing or bad. + * - \ref DCGM_ST_GROUP_INCOMPATIBLE if metricGroups->groupId's GPUs are not identical GPUs. + * - \ref DCGM_ST_NOT_SUPPORTED if profiling metrics are not supported for the given GPU group. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmProfGetSupportedMetricGroups(dcgmHandle_t pDcgmHandle, + dcgmProfGetMetricGroups_t *metricGroups); + +/** + * Request that DCGM start recording updates for a given list of profiling field IDs. + * + * Once metrics have been watched by this API, any of the normal DCGM field-value retrieval APIs can be used on + * the underlying fieldIds of this metric group. See \ref dcgmGetLatestValues_v2, \ref dcgmGetLatestValuesForFields, + * \ref dcgmEntityGetLatestValues, and \ref dcgmEntitiesGetLatestValues. + * + * @param pDcgmHandle IN: DCGM Handle + * @param watchFields IN: Details of which metric groups to watch for which GPUs. See \ref dcgmProfWatchFields_v1 + * for details of what should be put in each struct member. watchFields->version should be + * set to dcgmProfWatchFields_version upon calling. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * - \ref DCGM_ST_NOT_SUPPORTED if profiling metric group metricGroupTag is not supported for the given + * GPU group. + * - \ref DCGM_ST_GROUP_INCOMPATIBLE if groupId's GPUs are not identical GPUs. Profiling metrics are only + * support for homogenous groups of GPUs. 
+ * - \ref DCGM_ST_PROFILING_MULTI_PASS if any of the metric groups could not be watched concurrently due to + * requiring the hardware to gather them with multiple passes + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmProfWatchFields(dcgmHandle_t pDcgmHandle, dcgmProfWatchFields_t *watchFields); + +/** + * Request that DCGM stop recording updates for all profiling field IDs for all GPUs + * + * @param pDcgmHandle IN: DCGM Handle + * @param unwatchFields IN: Details of which metric groups to unwatch for which GPUs. See \ref + * dcgmProfUnwatchFields_v1 for details of what should be put in each struct member. + * unwatchFields->version should be set to dcgmProfUnwatchFields_version upon calling. + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a parameter is invalid + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmProfUnwatchFields(dcgmHandle_t pDcgmHandle, dcgmProfUnwatchFields_t *unwatchFields); + +/** + * Pause profiling activities in DCGM. This should be used when you are monitoring profiling fields + * from DCGM but want to be able to still run developer tools like nvprof, nsight systems, and nsight compute. + * Profiling fields start with DCGM_PROF_ and are in the field ID range 1001-1012. + * + * Call this API before you launch one of those tools and dcgmProfResume() after the tool has completed. + * + * DCGM will save BLANK values while profiling is paused. + * + * Calling this while profiling activities are already paused is fine and will be treated as a no-op. + * + * @param pDcgmHandle IN: DCGM Handle + * + * @return + * - \ref DCGM_ST_OK If the call was successful. + * - \ref DCGM_ST_BADPARAM if a parameter is invalid. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmProfPause(dcgmHandle_t pDcgmHandle); + +/** + * Resume profiling activities in DCGM that were previously paused with dcgmProfPause(). + * + * Call this API after you have completed running other NVIDIA developer tools to reenable DCGM + * profiling metrics. 
+ * + * DCGM will save BLANK values while profiling is paused. + * + * Calling this while profiling activities have already been resumed is fine and will be treated as a no-op. + * + * @param pDcgmHandle IN: DCGM Handle + * + * @return + * - \ref DCGM_ST_OK If the call was successful. + * - \ref DCGM_ST_BADPARAM if a parameter is invalid. + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmProfResume(dcgmHandle_t pDcgmHandle); + +/** @} */ // Closing for DCGMAPI_PROFILING + +/** + * Adds fake GPU instances and or compute instances for testing purposes. The entity IDs specified for + * the GPU instances and compute instances are only guaranteed to be used by DCGM if MIG mode is not active. + * + * NOTE: this API will not work on a real system reading actual values from NVML, and it may even cause + * the real instances to malfunction. This API is for testing purposes only. + * + * @param pDcgmHandle IN: DCGM Handle + * @param hierarchy + * + * @return + * - \ref DCGM_ST_OK + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmAddFakeInstances(dcgmHandle_t pDcgmHandle, dcgmMigHierarchy_v2 *hierarchy); + +#ifdef __cplusplus +} +#endif + +#endif /* DCGM_AGENT_H */ diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_api_export.h b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_api_export.h new file mode 100644 index 0000000000..52f9e0de39 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_api_export.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DCGM_DCGM_API_EXPORT_H +#define DCGM_DCGM_API_EXPORT_H + +#undef DCGM_PUBLIC_API +#undef DCGM_PRIVATE_API + +#if defined(DCGM_API_EXPORT) +#define DCGM_PUBLIC_API __attribute((visibility("default"))) +#else +#define DCGM_PUBLIC_API +#if defined(ERROR_IF_NOT_PUBLIC) +#error(Should be public) +#endif +#endif + +#define DCGM_PRIVATE_API __attribute((visibility("hidden"))) + + +#endif // DCGM_DCGM_API_EXPORT_H diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_errors.h b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_errors.h new file mode 100644 index 0000000000..02d15ab0c3 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_errors.h @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DCGM_ERRORS_H +#define DCGM_ERRORS_H + +#define DCGM_PUBLIC_API +#include "dcgm_structs.h" + +/***************************************************************************************************/ +/** @defgroup dcgmErrorEnums Error Codes + * @{ + */ +/***************************************************************************************************/ +/* + * Error codes for passive and active health checks. + * New error codes must be added to end of enum to maintain backwards compatibility. 
+ */ +typedef enum dcgmError_enum +{ + DCGM_FR_OK = 0, //!< 0 No error + DCGM_FR_UNKNOWN = 1, //!< 1 Unknown error code + DCGM_FR_UNRECOGNIZED = 2, //!< 2 Unrecognized error code + DCGM_FR_PCI_REPLAY_RATE = 3, //!< 3 Unacceptable rate of PCI errors + DCGM_FR_VOLATILE_DBE_DETECTED = 4, //!< 4 Uncorrectable volatile double bit error + DCGM_FR_VOLATILE_SBE_DETECTED = 5, //!< 5 Unacceptable rate of volatile single bit errors + DCGM_FR_PENDING_PAGE_RETIREMENTS = 6, //!< 6 Pending page retirements detected + DCGM_FR_RETIRED_PAGES_LIMIT = 7, //!< 7 Unacceptable total page retirements detected + DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8, //!< 8 Unacceptable total page retirements due to uncorrectable errors + DCGM_FR_CORRUPT_INFOROM = 9, //!< 9 Corrupt inforom found + DCGM_FR_CLOCK_THROTTLE_THERMAL = 10, //!< 10 Clocks being throttled due to overheating + DCGM_FR_POWER_UNREADABLE = 11, //!< 11 Cannot get a reading for power from NVML + DCGM_FR_CLOCK_THROTTLE_POWER = 12, //!< 12 Clock being throttled due to power restrictions + DCGM_FR_NVLINK_ERROR_THRESHOLD = 13, //!< 13 Unacceptable rate of NVLink errors + DCGM_FR_NVLINK_DOWN = 14, //!< 14 NVLink is down + DCGM_FR_NVSWITCH_FATAL_ERROR = 15, //!< 15 Fatal errors on the NVSwitch + DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16, //!< 16 Non-fatal errors on the NVSwitch + DCGM_FR_NVSWITCH_DOWN = 17, //!< 17 NVSwitch is down - NOT USED: DEPRECATED + DCGM_FR_NO_ACCESS_TO_FILE = 18, //!< 18 Cannot access a file + DCGM_FR_NVML_API = 19, //!< 19 Error occurred on an NVML API - NOT USED: DEPRECATED + DCGM_FR_DEVICE_COUNT_MISMATCH = 20, //!< 20 Disagreement in GPU count between /dev and NVML + DCGM_FR_BAD_PARAMETER = 21, //!< 21 Bad parameter passed to API + DCGM_FR_CANNOT_OPEN_LIB = 22, //!< 22 Cannot open a library that must be accessed + DCGM_FR_DENYLISTED_DRIVER = 23, //!< 23 A driver on the denylist (nouveau) is active + DCGM_FR_NVML_LIB_BAD = 24, //!< 24 NVML library is missing expected functions - NOT USED: DEPRECATED + 
DCGM_FR_GRAPHICS_PROCESSES = 25, //!< 25 Graphics processes are active on this GPU + DCGM_FR_HOSTENGINE_CONN = 26, //!< 26 Bad connection to nv-hostengine - NOT USED: DEPRECATED + DCGM_FR_FIELD_QUERY = 27, //!< 27 Error querying a field from DCGM + DCGM_FR_BAD_CUDA_ENV = 28, //!< 28 The environment has variables that hurt CUDA + DCGM_FR_PERSISTENCE_MODE = 29, //!< 29 Persistence mode is disabled + DCGM_FR_LOW_BANDWIDTH = 30, //!< 30 The bandwidth is unacceptably low + DCGM_FR_HIGH_LATENCY = 31, //!< 31 Latency is too high + DCGM_FR_CANNOT_GET_FIELD_TAG = 32, //!< 32 Cannot find a tag for a field + DCGM_FR_FIELD_VIOLATION = 33, //!< 33 The value for the specified error field is above 0 + DCGM_FR_FIELD_THRESHOLD = 34, //!< 34 The value for the specified field is above the threshold + DCGM_FR_FIELD_VIOLATION_DBL = 35, //!< 35 The value for the specified error field is above 0 + DCGM_FR_FIELD_THRESHOLD_DBL = 36, //!< 36 The value for the specified field is above the threshold + DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37, //!< 37 Field type cannot be supported + DCGM_FR_FIELD_THRESHOLD_TS = 38, //!< 38 The value for the specified field is above the threshold + DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39, //!< 39 The value for the specified field is above the threshold + DCGM_FR_THERMAL_VIOLATIONS = 40, //!< 40 Thermal violations detected + DCGM_FR_THERMAL_VIOLATIONS_TS = 41, //!< 41 Thermal violations detected with a timestamp + DCGM_FR_TEMP_VIOLATION = 42, //!< 42 Temperature is too high + DCGM_FR_THROTTLING_VIOLATION = 43, //!< 43 Non-benign clock throttling is occurring + DCGM_FR_INTERNAL = 44, //!< 44 An internal error was detected + DCGM_FR_PCIE_GENERATION = 45, //!< 45 PCIe generation is too low + DCGM_FR_PCIE_WIDTH = 46, //!< 46 PCIe width is too low + DCGM_FR_ABORTED = 47, //!< 47 Test was aborted by a user signal + DCGM_FR_TEST_DISABLED = 48, //!< 48 This test is disabled for this GPU + DCGM_FR_CANNOT_GET_STAT = 49, //!< 49 Cannot get telemetry for a needed value + 
DCGM_FR_STRESS_LEVEL = 50, //!< 50 Stress level is too low (bad performance) + DCGM_FR_CUDA_API = 51, //!< 51 Error calling the specified CUDA API + DCGM_FR_FAULTY_MEMORY = 52, //!< 52 Faulty memory detected on this GPU + DCGM_FR_CANNOT_SET_WATCHES = 53, //!< 53 Unable to set field watches in DCGM - NOT USED: DEPRECATED + DCGM_FR_CUDA_UNBOUND = 54, //!< 54 CUDA context is no longer bound + DCGM_FR_ECC_DISABLED = 55, //!< 55 ECC memory is disabled right now + DCGM_FR_MEMORY_ALLOC = 56, //!< 56 Cannot allocate memory on the GPU + DCGM_FR_CUDA_DBE = 57, //!< 57 CUDA detected unrecovable double-bit error + DCGM_FR_MEMORY_MISMATCH = 58, //!< 58 Memory error detected + DCGM_FR_CUDA_DEVICE = 59, //!< 59 No CUDA device discoverable for existing GPU + DCGM_FR_ECC_UNSUPPORTED = 60, //!< 60 ECC memory is unsupported by this SKU + DCGM_FR_ECC_PENDING = 61, //!< 61 ECC memory is in a pending state - NOT USED: DEPRECATED + DCGM_FR_MEMORY_BANDWIDTH = 62, //!< 62 Memory bandwidth is too low + DCGM_FR_TARGET_POWER = 63, //!< 63 Cannot hit the target power draw + DCGM_FR_API_FAIL = 64, //!< 64 The specified API call failed + DCGM_FR_API_FAIL_GPU = 65, //!< 65 The specified API call failed for the specified GPU + DCGM_FR_CUDA_CONTEXT = 66, //!< 66 Cannot create a CUDA context on this GPU + DCGM_FR_DCGM_API = 67, //!< 67 DCGM API failure + DCGM_FR_CONCURRENT_GPUS = 68, //!< 68 Need multiple GPUs to run this test + DCGM_FR_TOO_MANY_ERRORS = 69, //!< 69 More errors than fit in the return struct - NOT USED: DEPRECATED + DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70, //!< 70 More than 100 CRC errors are happening per second + DCGM_FR_NVLINK_ERROR_CRITICAL = 71, //!< 71 NVLink error for a field that should always be 0 + DCGM_FR_ENFORCED_POWER_LIMIT = 72, //!< 72 The enforced power limit is too low to hit the target + DCGM_FR_MEMORY_ALLOC_HOST = 73, //!< 73 Cannot allocate memory on the host + DCGM_FR_GPU_OP_MODE = 74, //!< 74 Bad GPU operating mode for running plugin - NOT USED: DEPRECATED + 
DCGM_FR_NO_MEMORY_CLOCKS = 75, //!< 75 No memory clocks with the needed MHz found - NOT USED: DEPRECATED + DCGM_FR_NO_GRAPHICS_CLOCKS = 76, //!< 76 No graphics clocks with the needed MHz found - NOT USED: DEPRECATED + DCGM_FR_HAD_TO_RESTORE_STATE = 77, //!< 77 Note that we had to restore a GPU's state + DCGM_FR_L1TAG_UNSUPPORTED = 78, //!< 78 L1TAG test is unsupported by this SKU + DCGM_FR_L1TAG_MISCOMPARE = 79, //!< 79 L1TAG test failed on a miscompare + DCGM_FR_ROW_REMAP_FAILURE = 80, //!< 80 Row remapping failed (Ampere or newer GPUs) + DCGM_FR_UNCONTAINED_ERROR = 81, //!< 81 Uncontained error - XID 95 + DCGM_FR_EMPTY_GPU_LIST = 82, //!< 82 No GPU information given to plugin + DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS = 83, //!< 83 Pending page retirements due to a DBE + DCGM_FR_UNCORRECTABLE_ROW_REMAP = 84, //!< 84 Uncorrectable row remapping + DCGM_FR_PENDING_ROW_REMAP = 85, //!< 85 Row remapping is pending + DCGM_FR_BROKEN_P2P_MEMORY_DEVICE = 86, //!< 86 P2P copy test detected an error writing to this GPU + DCGM_FR_BROKEN_P2P_WRITER_DEVICE = 87, //!< 87 P2P copy test detected an error writing from this GPU + DCGM_FR_NVSWITCH_NVLINK_DOWN = 88, //!< 88 An NvLink is down for the specified NVSwitch - NOT USED: DEPRECATED + DCGM_FR_EUD_BINARY_PERMISSIONS = 89, //!< 89 EUD binary permissions are incorrect + DCGM_FR_EUD_NON_ROOT_USER = 90, //!< 90 EUD plugin is not running as root + DCGM_FR_EUD_SPAWN_FAILURE = 91, //!< 91 EUD plugin failed to spawn the EUD binary + DCGM_FR_EUD_TIMEOUT = 92, //!< 92 EUD plugin timed out + DCGM_FR_EUD_ZOMBIE = 93, //!< 93 EUD process remains running after the plugin considers it finished + DCGM_FR_EUD_NON_ZERO_EXIT_CODE = 94, //!< 94 EUD process exited with a non-zero exit code + DCGM_FR_EUD_TEST_FAILED = 95, //!< 95 EUD test failed + DCGM_FR_FILE_CREATE_PERMISSIONS = 96, //!< 96 We cannot create a file in this directory. 
+ DCGM_FR_PAUSE_RESUME_FAILED = 97, //!< 97 Pause/Resume failed + DCGM_FR_PCIE_H_REPLAY_VIOLATION = 98, //!< 98 PCIe test caught correctable errors + DCGM_FR_GPU_EXPECTED_NVLINKS_UP = 99, //!< 99 Expected nvlinks up per gpu + DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP = 100, //!< 100 Expected nvlinks up per nvswitch + DCGM_FR_XID_ERROR = 101, //!< 101 XID error detected + DCGM_FR_SBE_VIOLATION = 102, //!< 102 Single bit error detected + DCGM_FR_DBE_VIOLATION = 103, //!< 103 Double bit error detected + DCGM_FR_PCIE_REPLAY_VIOLATION = 104, //!< 104 PCIe replay errors detected + DCGM_FR_SBE_THRESHOLD_VIOLATION = 105, //!< 105 SBE threshold violated + DCGM_FR_DBE_THRESHOLD_VIOLATION = 106, //!< 106 DBE threshold violated + DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION = 107, //!< 107 PCIE replay count violated + DCGM_FR_CUDA_FM_NOT_INITIALIZED = 108, //!< 108 The fabricmanager is not initialized + DCGM_FR_SXID_ERROR = 109, //!< 109 NvSwitch fatal error detected + DCGM_FR_ERROR_SENTINEL = 110, //!< 110 MUST BE THE LAST ERROR CODE +} dcgmError_t; + +typedef enum dcgmErrorSeverity_enum +{ + DCGM_ERROR_NONE = 0, //!< 0 NONE + DCGM_ERROR_MONITOR = 1, //!< 1 Can perform workload, but needs to be monitored. + DCGM_ERROR_ISOLATE = 2, //!< 2 Cannot perform workload. GPU should be isolated. 
+ DCGM_ERROR_UNKNOWN = 3, //!< 3 This error code is not recognized + DCGM_ERROR_TRIAGE = 4, //!< 4 This error should be triaged + DCGM_ERROR_CONFIG = 5, //!< 5 This error can be configured + DCGM_ERROR_RESET = 6, //!< 6 Drain and reset GPU +} dcgmErrorSeverity_t; + +typedef enum dcgmErrorCategory_enum +{ + DCGM_FR_EC_NONE = 0, //!< 0 NONE + DCGM_FR_EC_PERF_THRESHOLD = 1, //!< 1 Performance Threshold + DCGM_FR_EC_PERF_VIOLATION = 2, //!< 2 Performance Violation + DCGM_FR_EC_SOFTWARE_CONFIG = 3, //!< 3 Software Configuration + DCGM_FR_EC_SOFTWARE_LIBRARY = 4, //!< 4 Software Library + DCGM_FR_EC_SOFTWARE_XID = 5, //!< 5 Software XID + DCGM_FR_EC_SOFTWARE_CUDA = 6, //!< 6 Software Cuda + DCGM_FR_EC_SOFTWARE_EUD = 7, //!< 7 Software EUD + DCGM_FR_EC_SOFTWARE_OTHER = 8, //!< 8 Software Other + DCGM_FR_EC_HARDWARE_THERMAL = 9, //!< 9 Hardware Thermal + DCGM_FR_EC_HARDWARE_MEMORY = 10, //!< 10 Hardware Memory + DCGM_FR_EC_HARDWARE_NVLINK = 11, //!< 11 Hardware NvLink + DCGM_FR_EC_HARDWARE_NVSWITCH = 12, //!< 12 Hardware NvSwitch + DCGM_FR_EC_HARDWARE_PCIE = 13, //!< 13 Hardware PCIe + DCGM_FR_EC_HARDWARE_POWER = 14, //!< 14 Hardware Power + DCGM_FR_EC_HARDWARE_OTHER = 15, //!< 15 Hardware Other + DCGM_FR_EC_INTERNAL_OTHER = 16, //!< 16 Internal Other +} dcgmErrorCategory_t; + +typedef struct +{ + dcgmError_t errorId; + const char *msgFormat; + const char *suggestion; + int severity; + int category; +} dcgm_error_meta_t; + +extern dcgm_error_meta_t dcgmErrorMeta[]; + + +/* Standard message for running a field diagnostic */ +#define TRIAGE_RUN_FIELD_DIAG_MSG "Run a field diagnostic on the GPU." +#define DEBUG_COOLING_MSG \ + "Verify that the cooling on this machine is functional, including external, " \ + "thermal material interface, fans, and any other components." +#define BUG_REPORT_MSG "Please capture an nvidia-bug-report and send it to NVIDIA." +#define SYSTEM_TRIAGE_MSG "Check DCGM and system logs for errors. Reset GPU. Restart DCGM. Rerun diagnostics." 
+#define CONFIG_MSG "Check DCGM and system configuration. This error may be eliminated with an updated configuration." + +/* + * Messages for the error codes. All messages must be defined in the ERROR_CODE_MSG format + * where is the actual message. + */ +#define DCGM_FR_OK_MSG "The operation completed successfully." +#define DCGM_FR_UNKNOWN_MSG "Unknown error." +#define DCGM_FR_UNRECOGNIZED_MSG "Unrecognized error code." +// replay limit, gpu id, replay errors detected +#define DCGM_FR_PCI_REPLAY_RATE_MSG "Detected more than %u PCIe replays per minute for GPU %u : %d" +// dbes deteced, gpu id +#define DCGM_FR_VOLATILE_DBE_DETECTED_MSG "Detected %d volatile double-bit ECC error(s) in GPU %u." +// sbe limit, gpu id, sbes detected +#define DCGM_FR_VOLATILE_SBE_DETECTED_MSG "More than %u single-bit ECC error(s) detected in GPU %u Volatile SBEs: %lld" +// gpu id +#define DCGM_FR_PENDING_PAGE_RETIREMENTS_MSG "A pending retired page has been detected in GPU %u." +// retired pages detected, gpud id +#define DCGM_FR_RETIRED_PAGES_LIMIT_MSG "%u or more retired pages have been detected in GPU %u. " +// retired pages due to dbes detected, gpu id +#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_MSG \ + "An excess of %u retired pages due to DBEs have been detected and" \ + " more than one page has been retired due to DBEs in the past" \ + " week in GPU %u." +// gpu id +#define DCGM_FR_CORRUPT_INFOROM_MSG "A corrupt InfoROM has been detected in GPU %u." +// gpu id +#define DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG "Detected clock throttling due to thermal violation in GPU %u." +// gpu id +#define DCGM_FR_POWER_UNREADABLE_MSG "Cannot reliably read the power usage for GPU %u." +// gpu id +#define DCGM_FR_CLOCK_THROTTLE_POWER_MSG "Detected clock throttling due to power violation in GPU %u." 
+// nvlink errors detected, nvlink id, error threshold +#define DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG \ + "Detected %ld %s NvLink errors on GPU %u's NVLink which exceeds " \ + "threshold of %u" +// gpu id, nvlink id +#define DCGM_FR_NVLINK_DOWN_MSG "GPU %u's NvLink link %d is currently down" +// nvlinks up, expected nvlinks up +#define DCGM_FR_GPU_EXPECTED_NVLINKS_UP_MSG "Only %u NvLinks are up out of the expected %u" +// switch id, nvlinks up, expected nvlinks up +#define DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP_MSG "NvSwitch %u - Only %u NvLinks are up out of the expected %u" +// nvswitch id, nvlink id +#define DCGM_FR_NVSWITCH_FATAL_ERROR_MSG "Detected fatal errors on NvSwitch %u link %u" +// nvswitch id, nvlink id +#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_MSG "Detected nonfatal errors on NvSwitch %u link %u" +// nvswitch id, nvlink port +#define DCGM_FR_NVSWITCH_DOWN_MSG "NvSwitch physical ID %u's NvLink port %d is currently down." +// file path, error detail +#define DCGM_FR_NO_ACCESS_TO_FILE_MSG "File %s could not be accessed directly: %s" +// purpose for communicating with NVML, NVML error as string, NVML error +#define DCGM_FR_NVML_API_MSG "Error calling NVML API %s: %s" +#define DCGM_FR_DEVICE_COUNT_MISMATCH_MSG \ + "The number of devices NVML returns is different than the number " \ + "of devices in /dev." +// function name +#define DCGM_FR_BAD_PARAMETER_MSG "Bad parameter to function %s cannot be processed" +// library name, error returned from dlopen +#define DCGM_FR_CANNOT_OPEN_LIB_MSG "Cannot open library %s: '%s'" +// the name of the denylisted driver +#define DCGM_FR_DENYLISTED_DRIVER_MSG "Found driver on the denylist: %s" +// the name of the function that wasn't found +#define DCGM_FR_NVML_LIB_BAD_MSG "Cannot get pointer to %s from libnvidia-ml.so" +#define DCGM_FR_GRAPHICS_PROCESSES_MSG \ + "NVVS has detected processes with graphics contexts open running on at least one " \ + "GPU. This may cause some tests to fail." 
+// error message from the API call +#define DCGM_FR_HOSTENGINE_CONN_MSG "Could not connect to the host engine: '%s'" +// field name, gpu id +#define DCGM_FR_FIELD_QUERY_MSG "Could not query field %s for GPU %u" +// environment variable name +#define DCGM_FR_BAD_CUDA_ENV_MSG "Found CUDA performance-limiting environment variable '%s'." +// gpu id +#define DCGM_FR_PERSISTENCE_MODE_MSG "Persistence mode for GPU %u is disabled." +// gpu id, direction (d2h, e.g.), measured bandwidth, expected bandwidth +#define DCGM_FR_LOW_BANDWIDTH_MSG \ + "Bandwidth of GPU %u in direction %s of %.2f did not exceed " \ + "minimum required bandwidth of %.2f." +// gpu id, direction (d2h, e.g.), measured latency, expected latency +#define DCGM_FR_HIGH_LATENCY_MSG \ + "Latency type %s of GPU %u value %.2f exceeded maximum allowed " \ + "latency of %.2f." +// field id +#define DCGM_FR_CANNOT_GET_FIELD_TAG_MSG "Unable to get field information for field id %hu" +// field value, field name, gpu id (this message is for fields that should always have a 0 value) +#define DCGM_FR_FIELD_VIOLATION_MSG "Detected %ld %s for GPU %u" +// field value, field name, gpu id, allowable threshold +#define DCGM_FR_FIELD_THRESHOLD_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" +// field value, field name, gpu id (same as DCGM_FR_FIELD_VIOLATION, but it's a double) +#define DCGM_FR_FIELD_VIOLATION_DBL_MSG "Detected %.1f %s for GPU %u" +// field value, field name, gpu id, allowable threshold (same as DCGM_FR_FIELD_THRESHOLD, but it's a double) +#define DCGM_FR_FIELD_THRESHOLD_DBL_MSG "Detected %.1f %s for GPU %u which is above the threshold %.1f" +// field name +#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_MSG \ + "Field %s is not supported by this API because it is neither an " \ + "int64 nor a double type." 
+// field name, allowable threshold, observed value, seconds +#define DCGM_FR_FIELD_THRESHOLD_TS_MSG \ + "%s met or exceeded the threshold of %lu per second: %lu at " \ + "%.1f seconds into the test." +// field name, allowable threshold, observed value, seconds (same as DCGM_FR_FIELD_THRESHOLD, but it's a double) +#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_MSG \ + "%s met or exceeded the threshold of %.1f per second: %.1f at " \ + "%.1f seconds into the test." +// total seconds of violation, gpu id +#define DCGM_FR_THERMAL_VIOLATIONS_MSG "There were thermal violations totaling %.1f seconds for GPU %u" +// total seconds of violations, first instance, gpu id +#define DCGM_FR_THERMAL_VIOLATIONS_TS_MSG \ + "Thermal violations totaling %.1f seconds started at %.1f seconds " \ + "into the test for GPU %u" +// observed temperature, gpu id, max allowed temperature +#define DCGM_FR_TEMP_VIOLATION_MSG \ + "Temperature %lld of GPU %u exceeded user-specified maximum " \ + "allowed temperature %lld" +// gpu id, seconds into test, details about throttling +#define DCGM_FR_THROTTLING_VIOLATION_MSG \ + "Clocks are being throttled for GPU %u because of clock " \ + "throttling starting %.1f seconds into the test. %s" +// details about error +#define DCGM_FR_INTERNAL_MSG "There was an internal error during the test: '%s'" +// gpu id, PCIe generation, minimum allowed, parameter to control +#define DCGM_FR_PCIE_GENERATION_MSG \ + "GPU %u is running at PCI link generation %d, which is below " \ + "the minimum allowed link generation of %d (parameter '%s')" +// gpu id, PCIe width, minimum allowed, parameter to control +#define DCGM_FR_PCIE_WIDTH_MSG \ + "GPU %u is running at PCI link width %dX, which is below the " \ + "minimum allowed link generation of %d (parameter '%s')" +#define DCGM_FR_ABORTED_MSG "Test was aborted early due to user signal" +// Test name +#define DCGM_FR_TEST_DISABLED_MSG "The %s test is skipped for this GPU." 
+// stat name, gpu id +#define DCGM_FR_CANNOT_GET_STAT_MSG "Unable to generate / collect stat %s for GPU %u" +// observed value, minimum allowed, gpu id +#define DCGM_FR_STRESS_LEVEL_MSG \ + "Max stress level of %.1f did not reach desired stress level of " \ + "%.1f for GPU %u" +// CUDA API name +#define DCGM_FR_CUDA_API_MSG "Error using CUDA API %s" +// count, gpu id +#define DCGM_FR_FAULTY_MEMORY_MSG "Found %d faulty memory elements on GPU %u" +// error detail +#define DCGM_FR_CANNOT_SET_WATCHES_MSG "Unable to add field watches to DCGM: %s" +// gpu id +#define DCGM_FR_CUDA_UNBOUND_MSG "Cuda GPU %d is no longer bound to a CUDA context...Aborting" +// Test name, gpu id +#define DCGM_FR_ECC_DISABLED_MSG "Skipping test %s because ECC is not enabled on GPU %u" +// percentage of memory we tried to allocate, gpu id +#define DCGM_FR_MEMORY_ALLOC_MSG "Couldn't allocate at least %.1f%% of GPU memory on GPU %u" +// gpu id +#define DCGM_FR_CUDA_DBE_MSG \ + "CUDA APIs have indicated that a double-bit ECC error has " \ + "occured on GPU %u." +// gpu id +#define DCGM_FR_MEMORY_MISMATCH_MSG \ + "A memory mismatch was detected on GPU %u, but no error was " \ + "reported by CUDA or NVML." +// gpu id, error detail +#define DCGM_FR_CUDA_DEVICE_MSG "Unable to find a corresponding CUDA device for GPU %u: '%s'" +#define DCGM_FR_ECC_UNSUPPORTED_MSG "ECC Memory is not turned on or is unsupported. Skipping test." +// gpu id +#define DCGM_FR_ECC_PENDING_MSG "ECC memory for GPU %u is in a pending state." 
+// gpu id, observed bandwidth, required, test name +#define DCGM_FR_MEMORY_BANDWIDTH_MSG \ + "GPU %u only achieved a memory bandwidth of %.2f GB/s, failing " \ + "to meet %.2f GB/s for test %d" +// power draw observed, field tag, minimum power draw required, gpu id +#define DCGM_FR_TARGET_POWER_MSG \ + "Max power of %.1f did not reach desired power minimum %s of " \ + "%.1f for GPU %u" +// API name, error detail +#define DCGM_FR_API_FAIL_MSG "API call %s failed: '%s'" +// API name, gpu id, error detail +#define DCGM_FR_API_FAIL_GPU_MSG "API call %s failed for GPU %u: '%s'" +// gpu id, error detail +#define DCGM_FR_CUDA_CONTEXT_MSG "GPU %u failed to create a CUDA context: %s" +// DCGM API name +#define DCGM_FR_DCGM_API_MSG "Error using DCGM API %s" +#define DCGM_FR_CONCURRENT_GPUS_MSG \ + "Unable to run concurrent pair bandwidth test without 2 or more " \ + "gpus. Skipping" +#define DCGM_FR_TOO_MANY_ERRORS_MSG \ + "This API can only return up to four errors per system. " \ + "Additional errors were found for this system that couldn't be " \ + "communicated." +// error count, gpu id +#define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG \ + "%.1f %s NvLink errors found occuring per second on GPU %u, " \ + "exceeding the limit of 100 per second." +// error count, field name, gpu id +#define DCGM_FR_NVLINK_ERROR_CRITICAL_MSG "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)" +// gpu id, power limit, power reached +#define DCGM_FR_ENFORCED_POWER_LIMIT_MSG \ + "Enforced power limit on GPU %u set to %.1f, which is too low to " \ + "attempt to achieve target power %.1f" +// memory +#define DCGM_FR_MEMORY_ALLOC_HOST_MSG "Cannot allocate %zu bytes on the host" +#define DCGM_FR_GPU_OP_MODE_MSG "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP." +// clock, count +#define DCGM_FR_NO_MEMORY_CLOCKS_MSG "No memory clocks <= %u MHZ were found in %u supported memory clocks." 
+// clock, count, clock +#define DCGM_FR_NO_GRAPHICS_CLOCKS_MSG \ + "No graphics clocks <= %u MHZ were found in %u supported graphics clocks for memory clock %u MHZ." +// error detail +#define DCGM_FR_HAD_TO_RESTORE_STATE_MSG "Had to restore GPU state on NVML GPU(s): %s" +#define DCGM_FR_L1TAG_UNSUPPORTED_MSG "This card does not support the L1 cache test. Skipping test." +#define DCGM_FR_L1TAG_MISCOMPARE_MSG "Detected a miscompare failure in the L1 cache." +// gpu id +#define DCGM_FR_ROW_REMAP_FAILURE_MSG "GPU %u had uncorrectable memory errors and row remapping failed." +#define DCGM_FR_UNCONTAINED_ERROR_MSG "GPU had an uncontained error (XID 95)" +#define DCGM_FR_EMPTY_GPU_LIST_MSG "No valid GPUs passed to plugin" +#define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_MSG "Pending page retirements together with a DBE were detected on GPU %u." +// gpu id, rows remapped +#define DCGM_FR_UNCORRECTABLE_ROW_REMAP_MSG "GPU %u had uncorrectable memory errors and %u rows were remapped" +// gpu id +#define DCGM_FR_PENDING_ROW_REMAP_MSG "GPU %u had memory errors and row remappings are pending" +// gpu id, test name +#define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_MSG "GPU %u was unsuccessfully written to in a peer-to-peer test: %s" +// gpu id, test name +#define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_MSG "GPU %u unsuccessfully wrote data in a peer-to-peer test: %s" +// nvswitch id, nvlink id +#define DCGM_FR_NVSWITCH_NVLINK_DOWN_MSG "NVSwitch %u's NvLink %u is down." 
+#define DCGM_FR_EUD_BINARY_PERMISSIONS_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_NON_ROOT_USER_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_SPAWN_FAILURE_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_TIMEOUT_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_ZOMBIE_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_NON_ZERO_EXIT_CODE_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_TEST_FAILED_MSG "" /* See message inplace */ +#define DCGM_FR_FILE_CREATE_PERMISSIONS_MSG \ + "The DCGM Diagnostic does not have permissions to create a file in directory '%s'" +#define DCGM_FR_PAUSE_RESUME_FAILED_MSG "" /* See message inplace */ +// gpu id +#define DCGM_FR_PCIE_H_REPLAY_VIOLATION_MSG "GPU %u host-side PCIe replay violation, see dmesg for more information" +// xid error, gpu id +#define DCGM_FR_XID_ERROR_MSG "Detected XID %u for GPU %u" +// count, field, gpu id +#define DCGM_FR_SBE_VIOLATION_MSG "Detected %ld %s for GPU %u" +// count, field, gpu id +#define DCGM_FR_DBE_VIOLATION_MSG "Detected %ld %s for GPU %u" +// count, field, gpu id +#define DCGM_FR_PCIE_REPLAY_VIOLATION_MSG "Detected %ld %s for GPU %u" +// count, field, gpu id, threshold +#define DCGM_FR_SBE_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" +#define DCGM_FR_DBE_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" +#define DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" +#define DCGM_FR_CUDA_FM_NOT_INITIALIZED_MSG "" +#define DCGM_FR_SXID_ERROR_MSG "Detected fatal NvSwitch SXID %u" +#define DCGM_FR_ERROR_SENTINEL_MSG "" /* See message inplace */ + +/* + * Suggestions for next steps for the corresponding error message + */ +#define DCGM_FR_OK_NEXT "N/A" +#define DCGM_FR_UNKNOWN_NEXT "" +#define DCGM_FR_UNRECOGNIZED_NEXT "" +#define DCGM_FR_PCI_REPLAY_RATE_NEXT \ + "Reconnect PCIe card. 
Run system side PCIE diagnostic utilities " \ + "to verify hops off the GPU board. If issue is on the board, run " \ + "the field diagnostic." +#define DCGM_FR_VOLATILE_DBE_DETECTED_NEXT "Drain the GPU and reset it or reboot the node." +#define DCGM_FR_VOLATILE_SBE_DETECTED_NEXT "Monitor - this GPU can still perform workload." +#define DCGM_FR_PENDING_PAGE_RETIREMENTS_NEXT "Monitor - this GPU can still perform workload" +#define DCGM_FR_RETIRED_PAGES_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_CORRUPT_INFOROM_NEXT "Flash the InfoROM to clear this corruption." +#define DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT DEBUG_COOLING_MSG +#define DCGM_FR_POWER_UNREADABLE_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CLOCK_THROTTLE_POWER_NEXT "Monitor the power conditions. This GPU can still perform workload." +#define DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT "Monitor the NVLink. It can still perform workload." +#define DCGM_FR_NVLINK_DOWN_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT "Monitor the NVSwitch. It can still perform workload." +#define DCGM_FR_NVSWITCH_DOWN_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_NO_ACCESS_TO_FILE_NEXT "Check relevant permissions, access, and existence of the file." +#define DCGM_FR_GPU_EXPECTED_NVLINKS_UP_NEXT \ + "Ensure Fabric Manager is running. Check system logs, dmesg, and fabric-manager logs for more info." + +#define DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP_NEXT \ + "Ensure Fabric Manager is running. Check system logs, dmesg, and fabric-manager logs for more info." + +#define DCGM_FR_NVML_API_NEXT \ + "Check the error condition and ensure that appropriate libraries " \ + "are present and accessible." 
+#define DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT \ + "Check for the presence of cgroups, operating system blocks, and " \ + "or unsupported / older cards" +#define DCGM_FR_BAD_PARAMETER_NEXT BUG_REPORT_MSG +#define DCGM_FR_CANNOT_OPEN_LIB_NEXT \ + "Check for the existence of the library and set LD_LIBRARY_PATH " \ + "if needed." +#define DCGM_FR_DENYLISTED_DRIVER_NEXT "Please load the appropriate driver." +#define DCGM_FR_NVML_LIB_BAD_NEXT \ + "Make sure that the required version of libnvidia-ml.so " \ + "is present and accessible on the system." +#define DCGM_FR_GRAPHICS_PROCESSES_NEXT \ + "Stop the graphics processes or run this diagnostic on a server " \ + "that is not being used for display purposes." +#define DCGM_FR_HOSTENGINE_CONN_NEXT \ + "If hostengine is run separately, please ensure that it is up " \ + "and responsive." +#define DCGM_FR_FIELD_QUERY_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_BAD_CUDA_ENV_NEXT "Please unset this environment variable to address test failures." +#define DCGM_FR_PERSISTENCE_MODE_NEXT \ + "Enable persistence mode by running \"nvidia-smi -i -pm " \ + "1 \" as root." +#define DCGM_FR_LOW_BANDWIDTH_NEXT \ + "Verify that your minimum bandwidth setting is appropriate for " \ + "the topology of each GPU. If so, and errors are consistent, " \ + "please run a field diagnostic." +#define DCGM_FR_HIGH_LATENCY_NEXT \ + "Verify that your maximum latency setting is appropriate for " \ + "the topology of each GPU. If so, and errors are consistent, " \ + "please run a field diagnostic." 
+#define DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT "" +#define DCGM_FR_FIELD_VIOLATION_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_THRESHOLD_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_VIOLATION_DBL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_THRESHOLD_DBL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_THRESHOLD_TS_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_THERMAL_VIOLATIONS_NEXT DEBUG_COOLING_MSG +#define DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT DEBUG_COOLING_MSG +#define DCGM_FR_TEMP_VIOLATION_NEXT \ + "Verify that the user-specified temperature maximum is set " \ + "correctly. If it is, check the cooling for this GPU and node: " DEBUG_COOLING_MSG +#define DCGM_FR_THROTTLING_VIOLATION_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_INTERNAL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_PCIE_GENERATION_NEXT CONFIG_MSG +#define DCGM_FR_PCIE_WIDTH_NEXT CONFIG_MSG +#define DCGM_FR_ABORTED_NEXT "" +#define DCGM_FR_TEST_DISABLED_NEXT CONFIG_MSG +#define DCGM_FR_CANNOT_GET_STAT_NEXT \ + "If running a standalone nv-hostengine, verify that it is up " \ + "and responsive." +#define DCGM_FR_STRESS_LEVEL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CUDA_API_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FAULTY_MEMORY_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_CANNOT_SET_WATCHES_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CUDA_UNBOUND_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_ECC_DISABLED_NEXT \ + "Enable ECC memory by running \"nvidia-smi -i -e 1\" " \ + "to enable. This may require a GPU reset or reboot to take effect." +#define DCGM_FR_MEMORY_ALLOC_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CUDA_DBE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_MEMORY_MISMATCH_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_CUDA_DEVICE_NEXT \ + "Make sure CUDA_VISIBLE_DEVICES is not preventing visibility of " \ + "this GPU. Also check if CUDA libraries are compatible and " \ + "correctly installed." 
+#define DCGM_FR_ECC_UNSUPPORTED_NEXT CONFIG_MSG +#define DCGM_FR_ECC_PENDING_NEXT "Reboot to complete activation of the ECC memory." +#define DCGM_FR_MEMORY_BANDWIDTH_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_TARGET_POWER_NEXT "Verify that the clock speeds and GPU utilization are high." +#define DCGM_FR_API_FAIL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_API_FAIL_GPU_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CUDA_CONTEXT_NEXT \ + "Please make sure the correct driver version is installed and " \ + "verify that no conflicting libraries are present." +#define DCGM_FR_DCGM_API_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CONCURRENT_GPUS_NEXT CONFIG_MSG +#define DCGM_FR_TOO_MANY_ERRORS_NEXT "" +#define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_NVLINK_ERROR_CRITICAL_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_ENFORCED_POWER_LIMIT_NEXT \ + "If this enforced power limit is necessary, then this test " \ + "cannot be run. If it is unnecessary, then raise the enforced " \ + "power limit setting to be able to run this test." +#define DCGM_FR_MEMORY_ALLOC_HOST_NEXT "Manually kill processes or restart your machine." +#define DCGM_FR_GPU_OP_MODE_NEXT \ + "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i " \ + "" +#define DCGM_FR_NO_MEMORY_CLOCKS_NEXT "" +#define DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT "" +#define DCGM_FR_HAD_TO_RESTORE_STATE_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_L1TAG_UNSUPPORTED_NEXT CONFIG_MSG +#define DCGM_FR_L1TAG_MISCOMPARE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_ROW_REMAP_FAILURE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_UNCONTAINED_ERROR_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT +#define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT "Drain the GPU and reset it or reboot the node to resolve this issue." 
+#define DCGM_FR_EMPTY_GPU_LIST_NEXT CONFIG_MSG +#define DCGM_FR_UNCORRECTABLE_ROW_REMAP_NEXT "" +#define DCGM_FR_PENDING_ROW_REMAP_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_NEXT BUG_REPORT_MSG +#define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_NEXT BUG_REPORT_MSG +#define DCGM_FR_NVSWITCH_NVLINK_DOWN_NEXT \ + "Please check fabric manager and initialization logs to figure out why the link is down. " \ + "You may also need to run a field diagnostic." +#define DCGM_FR_EUD_BINARY_PERMISSIONS_NEXT "" /* See message inplace */ +#define DCGM_FR_EUD_NON_ROOT_USER_NEXT "" /* See message inplace */ +#define DCGM_FR_EUD_SPAWN_FAILURE_NEXT "" /* See message inplace */ +#define DCGM_FR_EUD_TIMEOUT_NEXT "" /* See message inplace */ +#define DCGM_FR_EUD_ZOMBIE_NEXT "" /* See message inplace */ +#define DCGM_FR_EUD_NON_ZERO_EXIT_CODE_NEXT "" /* See message inplace */ +#define DCGM_FR_EUD_TEST_FAILED_NEXT "" /* See message inplace */ +#define DCGM_FR_FILE_CREATE_PERMISSIONS_NEXT \ + "Please restart the hostengine with parameter --home-dir to specify a different home directory for the " \ + "diagnostic or change permissions in the current directory to allow the user to write files there." +#define DCGM_FR_PAUSE_RESUME_FAILED_NEXT "" /* See message inplace */ +#define DCGM_FR_PCIE_H_REPLAY_VIOLATION_NEXT "" /* See message inplace */ +#define DCGM_FR_XID_ERROR_NEXT "Please consult the documentation for details of this XID." +#define DCGM_FR_SBE_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_DBE_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_PCIE_REPLAY_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_SBE_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_DBE_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_CUDA_FM_NOT_INITIALIZED_NEXT "Ensure that the FabricManager is running without errors." 
+#define DCGM_FR_SXID_ERROR_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_ERROR_SENTINEL_NEXT "" /* See message inplace */ + +#ifdef __cplusplus +extern "C" { +#endif + +DCGM_PUBLIC_API dcgmErrorSeverity_t dcgmErrorGetPriorityByCode(unsigned int code); +DCGM_PUBLIC_API dcgmErrorCategory_t dcgmErrorGetCategoryByCode(unsigned int code); +DCGM_PUBLIC_API const char *dcgmErrorGetFormatMsgByCode(unsigned int code); + +DCGM_PUBLIC_API const dcgm_error_meta_t *dcgmGetErrorMeta(dcgmError_t error); +DCGM_PUBLIC_API const char *errorString(dcgmReturn_t result); + +/** @} */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // DCGM_ERRORS_H diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_fields.h b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_fields.h new file mode 100644 index 0000000000..19d1eae2f3 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_fields.h @@ -0,0 +1,2211 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DCGMFIELDS_H +#define DCGMFIELDS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define DCGM_PUBLIC_API + +/***************************************************************************************************/ +/** @defgroup dcgmFieldTypes Field Types + * Field Types are a single byte. 
+ * @{ + */ +/***************************************************************************************************/ + +/** + * Blob of binary data representing a structure + */ +#define DCGM_FT_BINARY 'b' + +/** + * 8-byte double precision + */ +#define DCGM_FT_DOUBLE 'd' + +/** + * 8-byte signed integer + */ +#define DCGM_FT_INT64 'i' + +/** + * Null-terminated ASCII Character string + */ +#define DCGM_FT_STRING 's' + +/** + * 8-byte signed integer usec since 1970 + */ +#define DCGM_FT_TIMESTAMP 't' + +/** @} */ + + +/***************************************************************************************************/ +/** @defgroup dcgmFieldScope Field Scope + * Represents field association with entity scope or global scope. + * @{ + */ +/***************************************************************************************************/ + +/** + * Field is global (ex: driver version) + */ +#define DCGM_FS_GLOBAL 0 + +/** + * Field is associated with an entity (GPU, VGPU...etc) + */ +#define DCGM_FS_ENTITY 1 + +/** + * Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY + */ +#define DCGM_FS_DEVICE DCGM_FS_ENTITY + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup dcgmFieldConstants Field Constants + * Constants that represent contents of individual field values. + * @{ + */ +/***************************************************************************************************/ + +/** + * DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY is 16 bits of major version followed by + * 16 bits of the minor version. These macros separate the two. + */ +#define DCGM_CUDA_COMPUTE_CAPABILITY_MAJOR(x) ((uint64_t)(x)&0xFFFF0000) +#define DCGM_CUDA_COMPUTE_CAPABILITY_MINOR(x) ((uint64_t)(x)&0x0000FFFF) + +/** + * DCGM_FI_DEV_CLOCK_THROTTLE_REASONS is a bitmap of why the clock is throttled. 
+ * These macros are masks for relevant throttling, and are a 1:1 map to the NVML + * reasons documented in nvml.h. The notes for the header are copied blow: + */ +/** Nothing is running on the GPU and the clocks are dropping to Idle state + * \note This limiter may be removed in a later release + */ +#define DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE 0x0000000000000001LL +/** GPU clocks are limited by current setting of applications clocks + */ +#define DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING 0x0000000000000002LL +/** SW Power Scaling algorithm is reducing the clocks below requested clocks + */ +#define DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP 0x0000000000000004LL +/** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged + * + * This is an indicator of: + * - temperature being too high + * - External Power Brake Assertion is triggered (e.g. by the system power supply) + * - Power draw is too high and Fast Trigger protection is reducing the clocks + * - May be also reported during PState or clock change + * - This behavior may be removed in a later release. + */ +#define DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN 0x0000000000000008LL +/** Sync Boost + * + * This GPU has been added to a Sync boost group with nvidia-smi or DCGM in + * order to maximize performance per watt. All GPUs in the sync boost group + * will boost to the minimum possible clocks across the entire group. Look at + * the throttle reasons for other GPUs in the system to see why those GPUs are + * holding this one at lower clocks. 
+ */ +#define DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST 0x0000000000000010LL +/** SW Thermal Slowdown + * + * This is an indicator of one or more of the following: + * - Current GPU temperature above the GPU Max Operating Temperature + * - Current memory temperature above the Memory Max Operating Temperature + */ +#define DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL 0x0000000000000020LL +/** HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged + * + * This is an indicator of: + * - temperature being too high + */ +#define DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL 0x0000000000000040LL +/** HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged + * + * This is an indicator of: + * - External Power Brake Assertion being triggered (e.g. by the system power supply) + */ +#define DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE 0x0000000000000080LL +/** GPU clocks are limited by current setting of Display clocks + */ +#define DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS 0x0000000000000100LL + +/** + * GPU virtualization mode types for DCGM_FI_DEV_VIRTUAL_MODE + */ +typedef enum +{ + DCGM_GPU_VIRTUALIZATION_MODE_NONE = 0, //!< Represents Bare Metal GPU + DCGM_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1, //!< Device is associated with GPU-Passthrough + DCGM_GPU_VIRTUALIZATION_MODE_VGPU = 2, //!< Device is associated with vGPU inside virtual machine. 
+ DCGM_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3, //!< Device is associated with VGX hypervisor in vGPU mode + DCGM_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4, //!< Device is associated with VGX hypervisor in vSGA mode +} dcgmGpuVirtualizationMode_t; + + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup dcgmFieldEntity Field Entity + * Represents field association with a particular entity + * @{ + */ +/***************************************************************************************************/ + +/** + * Enum of possible field entity groups + */ +typedef enum dcgm_field_entity_group_t +{ + DCGM_FE_NONE = 0, /*!< Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL */ + DCGM_FE_GPU, /*!< Field is associated with a GPU entity */ + DCGM_FE_VGPU, /*!< Field is associated with a VGPU entity */ + DCGM_FE_SWITCH, /*!< Field is associated with a Switch entity */ + DCGM_FE_GPU_I, /*!< Field is associated with a GPU Instance entity */ + DCGM_FE_GPU_CI, /*!< Field is associated with a GPU Compute Instance entity */ + DCGM_FE_LINK, /*!< Field is associated with an NVLink */ + DCGM_FE_CPU, /*!< Field is associated with a CPU node */ + DCGM_FE_CPU_CORE, /*!< Field is associated with a CPU */ + + DCGM_FE_COUNT /*!< Number of elements in this enumeration. Keep this entry last */ +} dcgm_field_entity_group_t; + +/** + * Represents an identifier for an entity within a field entity. For instance, this is the gpuId for DCGM_FE_GPU. 
+ */ +typedef unsigned int dcgm_field_eid_t; + + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup dcgmFieldIdentifiers Field Identifiers + * Field Identifiers + * @{ + */ +/***************************************************************************************************/ + +/** + * NULL field + */ +#define DCGM_FI_UNKNOWN 0 + +/** + * Driver Version + */ +#define DCGM_FI_DRIVER_VERSION 1 + +/* Underlying NVML version */ +#define DCGM_FI_NVML_VERSION 2 + +/* + * Process Name + */ +#define DCGM_FI_PROCESS_NAME 3 + +/** + * Number of Devices on the node + */ +#define DCGM_FI_DEV_COUNT 4 + +/** + * Cuda Driver Version + * Retrieves a number with the major value in the thousands place and the minor value in the hundreds place. + * CUDA 11.1 = 11100 + */ +#define DCGM_FI_CUDA_DRIVER_VERSION 5 + + +/** + * Name of the GPU device + */ +#define DCGM_FI_DEV_NAME 50 + +/** + * Device Brand + */ +#define DCGM_FI_DEV_BRAND 51 + +/** + * NVML index of this GPU + */ +#define DCGM_FI_DEV_NVML_INDEX 52 + +/** + * Device Serial Number + */ +#define DCGM_FI_DEV_SERIAL 53 + +/** + * UUID corresponding to the device + */ +#define DCGM_FI_DEV_UUID 54 + +/** + * Device node minor number /dev/nvidia# + */ +#define DCGM_FI_DEV_MINOR_NUMBER 55 + +/** + * OEM inforom version + */ +#define DCGM_FI_DEV_OEM_INFOROM_VER 56 + +/** + * PCI attributes for the device + */ +#define DCGM_FI_DEV_PCI_BUSID 57 + +/** + * The combined 16-bit device id and 16-bit vendor id + */ +#define DCGM_FI_DEV_PCI_COMBINED_ID 58 + +/** + * The 32-bit Sub System Device ID + */ +#define DCGM_FI_DEV_PCI_SUBSYS_ID 59 + +/** + * Topology of all GPUs on the system via PCI (static) + */ +#define DCGM_FI_GPU_TOPOLOGY_PCI 60 + +/** + * Topology of all GPUs on the system via NVLINK (static) + */ +#define DCGM_FI_GPU_TOPOLOGY_NVLINK 61 + +/** + * Affinity of all GPUs on the system (static) + */ +#define DCGM_FI_GPU_TOPOLOGY_AFFINITY 62 + +/** + * 
Cuda compute capability for the device. + * The major version is the upper 32 bits and + * the minor version is the lower 32 bits. + */ +#define DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY 63 + +/** + * Compute mode for the device + */ +#define DCGM_FI_DEV_COMPUTE_MODE 65 + +/** + * Persistence mode for the device + * Boolean: 0 is disabled, 1 is enabled + */ +#define DCGM_FI_DEV_PERSISTENCE_MODE 66 + +/** + * MIG mode for the device + * Boolean: 0 is disabled, 1 is enabled + */ +#define DCGM_FI_DEV_MIG_MODE 67 + +/** + * The string that CUDA_VISIBLE_DEVICES should + * be set to for this entity (including MIG) + */ +#define DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR 68 + +/** + * The maximum number of MIG slices supported by this GPU + */ +#define DCGM_FI_DEV_MIG_MAX_SLICES 69 + +/** + * Device CPU affinity. part 1/8 = cpus 0 - 63 + */ +#define DCGM_FI_DEV_CPU_AFFINITY_0 70 + +/** + * Device CPU affinity. part 1/8 = cpus 64 - 127 + */ +#define DCGM_FI_DEV_CPU_AFFINITY_1 71 + +/** + * Device CPU affinity. part 2/8 = cpus 128 - 191 + */ +#define DCGM_FI_DEV_CPU_AFFINITY_2 72 + +/** + * Device CPU affinity. 
part 3/8 = cpus 192 - 255 + */ +#define DCGM_FI_DEV_CPU_AFFINITY_3 73 + +/** + * ConfidentialCompute/AmpereProtectedMemory status for this system + * 0 = disabled + * 1 = enabled + */ +#define DCGM_FI_DEV_CC_MODE 74 + +/** + * Attributes for the given MIG device handles + */ +#define DCGM_FI_DEV_MIG_ATTRIBUTES 75 + +/** + * GPU instance profile information + */ +#define DCGM_FI_DEV_MIG_GI_INFO 76 + +/** + * Compute instance profile information + */ +#define DCGM_FI_DEV_MIG_CI_INFO 77 + +/** + * ECC inforom version + */ +#define DCGM_FI_DEV_ECC_INFOROM_VER 80 + +/** + * Power management object inforom version + */ +#define DCGM_FI_DEV_POWER_INFOROM_VER 81 + +/** + * Inforom image version + */ +#define DCGM_FI_DEV_INFOROM_IMAGE_VER 82 + +/** + * Inforom configuration checksum + */ +#define DCGM_FI_DEV_INFOROM_CONFIG_CHECK 83 + +/** + * Reads the infoROM from the flash and verifies the checksums + */ +#define DCGM_FI_DEV_INFOROM_CONFIG_VALID 84 + +/** + * VBIOS version of the device + */ +#define DCGM_FI_DEV_VBIOS_VERSION 85 + +/** + * Device Memory node affinity, 0-63 + */ +#define DCGM_FI_DEV_MEM_AFFINITY_0 86 + +/** + * Device Memory node affinity, 64-127 + */ +#define DCGM_FI_DEV_MEM_AFFINITY_1 87 + +/** + * Device Memory node affinity, 128-191 + */ +#define DCGM_FI_DEV_MEM_AFFINITY_2 88 + +/** + * Device Memory node affinity, 192-255 + */ +#define DCGM_FI_DEV_MEM_AFFINITY_3 89 + +/** + * Total BAR1 of the GPU in MB + */ +#define DCGM_FI_DEV_BAR1_TOTAL 90 + +/** + * Deprecated - Sync boost settings on the node + */ +#define DCGM_FI_SYNC_BOOST 91 + +/** + * Used BAR1 of the GPU in MB + */ +#define DCGM_FI_DEV_BAR1_USED 92 + +/** + * Free BAR1 of the GPU in MB + */ +#define DCGM_FI_DEV_BAR1_FREE 93 + +/** + * SM clock for the device + */ +#define DCGM_FI_DEV_SM_CLOCK 100 + +/** + * Memory clock for the device + */ +#define DCGM_FI_DEV_MEM_CLOCK 101 + +/** + * Video encoder/decoder clock for the device + */ +#define DCGM_FI_DEV_VIDEO_CLOCK 102 + +/** + * SM 
Application clocks + */ +#define DCGM_FI_DEV_APP_SM_CLOCK 110 + +/** + * Memory Application clocks + */ +#define DCGM_FI_DEV_APP_MEM_CLOCK 111 + +/** + * Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*) + */ +#define DCGM_FI_DEV_CLOCK_THROTTLE_REASONS 112 + +/** + * Maximum supported SM clock for the device + */ +#define DCGM_FI_DEV_MAX_SM_CLOCK 113 + +/** + * Maximum supported Memory clock for the device + */ +#define DCGM_FI_DEV_MAX_MEM_CLOCK 114 + +/** + * Maximum supported Video encoder/decoder clock for the device + */ +#define DCGM_FI_DEV_MAX_VIDEO_CLOCK 115 + +/** + * Auto-boost for the device (1 = enabled. 0 = disabled) + */ +#define DCGM_FI_DEV_AUTOBOOST 120 + +/** + * Supported clocks for the device + */ +#define DCGM_FI_DEV_SUPPORTED_CLOCKS 130 + +/** + * Memory temperature for the device + */ +#define DCGM_FI_DEV_MEMORY_TEMP 140 + +/** + * Current temperature readings for the device, in degrees C + */ +#define DCGM_FI_DEV_GPU_TEMP 150 + +/** + * Maximum operating temperature for the memory of this GPU + */ +#define DCGM_FI_DEV_MEM_MAX_OP_TEMP 151 + +/** + * Maximum operating temperature for this GPU + */ +#define DCGM_FI_DEV_GPU_MAX_OP_TEMP 152 + + +/** + * Power usage for the device in Watts + */ +#define DCGM_FI_DEV_POWER_USAGE 155 + +/** + * Total energy consumption for the GPU in mJ since the driver was last reloaded + */ +#define DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION 156 + +/** + * Current instantaneous power usage of the device in Watts + */ +#define DCGM_FI_DEV_POWER_USAGE_INSTANT 157 + +/** + * Slowdown temperature for the device + */ +#define DCGM_FI_DEV_SLOWDOWN_TEMP 158 + +/** + * Shutdown temperature for the device + */ +#define DCGM_FI_DEV_SHUTDOWN_TEMP 159 + +/** + * Current Power limit for the device + */ +#define DCGM_FI_DEV_POWER_MGMT_LIMIT 160 + +/** + * Minimum power management limit for the device + */ +#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN 161 + +/** + * Maximum power management limit for the device + */ 
+#define DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX 162 + +/** + * Default power management limit for the device + */ +#define DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF 163 + +/** + * Effective power limit that the driver enforces after taking into account all limiters + */ +#define DCGM_FI_DEV_ENFORCED_POWER_LIMIT 164 + +/** + * Performance state (P-State) 0-15. 0=highest + */ +#define DCGM_FI_DEV_PSTATE 190 + +/** + * Fan speed for the device in percent 0-100 + */ +#define DCGM_FI_DEV_FAN_SPEED 191 + +/** + * PCIe Tx utilization information + * + * Deprecated: Use DCGM_FI_PROF_PCIE_TX_BYTES instead. + */ +#define DCGM_FI_DEV_PCIE_TX_THROUGHPUT 200 + +/** + * PCIe Rx utilization information + * + * Deprecated: Use DCGM_FI_PROF_PCIE_RX_BYTES instead. + */ +#define DCGM_FI_DEV_PCIE_RX_THROUGHPUT 201 + +/** + * PCIe replay counter + */ +#define DCGM_FI_DEV_PCIE_REPLAY_COUNTER 202 + +/** + * GPU Utilization + */ +#define DCGM_FI_DEV_GPU_UTIL 203 + +/** + * Memory Utilization + */ +#define DCGM_FI_DEV_MEM_COPY_UTIL 204 + +/** + * Process accounting stats. + * + * This field is only supported when the host engine is running as root unless you + * enable accounting ahead of time. Accounting mode can be enabled by + * running "nvidia-smi -am 1" as root on the same node the host engine is running on. + */ +#define DCGM_FI_DEV_ACCOUNTING_DATA 205 + +/** + * Encoder Utilization + */ +#define DCGM_FI_DEV_ENC_UTIL 206 + +/** + * Decoder Utilization + */ +#define DCGM_FI_DEV_DEC_UTIL 207 + +/* Fields 210, 211, 220 and 221 are internal-only. See dcgm_fields_internal.hpp */ + +/** + * XID errors. 
The value is the specific XID error + */ +#define DCGM_FI_DEV_XID_ERRORS 230 + +/** + * PCIe Max Link Generation + */ +#define DCGM_FI_DEV_PCIE_MAX_LINK_GEN 235 + +/** + * PCIe Max Link Width + */ +#define DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH 236 + +/** + * PCIe Current Link Generation + */ +#define DCGM_FI_DEV_PCIE_LINK_GEN 237 + +/** + * PCIe Current Link Width + */ +#define DCGM_FI_DEV_PCIE_LINK_WIDTH 238 + +/** + * Power Violation time in usec + */ +#define DCGM_FI_DEV_POWER_VIOLATION 240 + +/** + * Thermal Violation time in usec + */ +#define DCGM_FI_DEV_THERMAL_VIOLATION 241 + +/** + * Sync Boost Violation time in usec + */ +#define DCGM_FI_DEV_SYNC_BOOST_VIOLATION 242 + +/** + * Board violation limit. + */ +#define DCGM_FI_DEV_BOARD_LIMIT_VIOLATION 243 + +/** + *Low utilisation violation limit. + */ +#define DCGM_FI_DEV_LOW_UTIL_VIOLATION 244 + +/** + *Reliability violation limit. + */ +#define DCGM_FI_DEV_RELIABILITY_VIOLATION 245 + +/** + * App clock violation limit. + */ +#define DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION 246 + +/** + * Base clock violation limit. + */ +#define DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION 247 + +/** + * Total Frame Buffer of the GPU in MB + */ +#define DCGM_FI_DEV_FB_TOTAL 250 + +/** + * Free Frame Buffer in MB + */ +#define DCGM_FI_DEV_FB_FREE 251 + +/** + * Used Frame Buffer in MB + */ +#define DCGM_FI_DEV_FB_USED 252 + +/** + * Reserved Frame Buffer in MB + */ +#define DCGM_FI_DEV_FB_RESERVED 253 + +/** + * Percentage used of Frame Buffer: 'Used/(Total - Reserved)'. Range 0.0-1.0 + */ +#define DCGM_FI_DEV_FB_USED_PERCENT 254 + +/** + * C2C Link Count + */ +#define DCGM_FI_DEV_C2C_LINK_COUNT 285 + +/** + * C2C Link Status + * The value of 0 the link is INACTIVE. + * The value of 1 the link is ACTIVE. + */ +#define DCGM_FI_DEV_C2C_LINK_STATUS 286 + +/** + * C2C Max Bandwidth + * The value indicates the link speed in MB/s. 
+ */ +#define DCGM_FI_DEV_C2C_MAX_BANDWIDTH 287 + +/** + * Current ECC mode for the device + */ +#define DCGM_FI_DEV_ECC_CURRENT 300 + +/** + * Pending ECC mode for the device + */ +#define DCGM_FI_DEV_ECC_PENDING 301 + +/** + * Total single bit volatile ECC errors + */ +#define DCGM_FI_DEV_ECC_SBE_VOL_TOTAL 310 + +/** + * Total double bit volatile ECC errors + */ +#define DCGM_FI_DEV_ECC_DBE_VOL_TOTAL 311 + +/** + * Total single bit aggregate (persistent) ECC errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_ECC_SBE_AGG_TOTAL 312 + +/** + * Total double bit aggregate (persistent) ECC errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_ECC_DBE_AGG_TOTAL 313 + +/** + * L1 cache single bit volatile ECC errors + */ +#define DCGM_FI_DEV_ECC_SBE_VOL_L1 314 + +/** + * L1 cache double bit volatile ECC errors + */ +#define DCGM_FI_DEV_ECC_DBE_VOL_L1 315 + +/** + * L2 cache single bit volatile ECC errors + */ +#define DCGM_FI_DEV_ECC_SBE_VOL_L2 316 + +/** + * L2 cache double bit volatile ECC errors + */ +#define DCGM_FI_DEV_ECC_DBE_VOL_L2 317 + +/** + * Device memory single bit volatile ECC errors + */ +#define DCGM_FI_DEV_ECC_SBE_VOL_DEV 318 + +/** + * Device memory double bit volatile ECC errors + */ +#define DCGM_FI_DEV_ECC_DBE_VOL_DEV 319 + +/** + * Register file single bit volatile ECC errors + */ +#define DCGM_FI_DEV_ECC_SBE_VOL_REG 320 + +/** + * Register file double bit volatile ECC errors + */ +#define DCGM_FI_DEV_ECC_DBE_VOL_REG 321 + +/** + * Texture memory single bit volatile ECC errors + */ +#define DCGM_FI_DEV_ECC_SBE_VOL_TEX 322 + +/** + * Texture memory double bit volatile ECC errors + */ +#define DCGM_FI_DEV_ECC_DBE_VOL_TEX 323 + +/** + * L1 cache single bit aggregate (persistent) ECC errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_ECC_SBE_AGG_L1 324 + +/** + * L1 cache double bit aggregate (persistent) ECC errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_ECC_DBE_AGG_L1 325 + +/** + * 
L2 cache single bit aggregate (persistent) ECC errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_ECC_SBE_AGG_L2 326 + +/** + * L2 cache double bit aggregate (persistent) ECC errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_ECC_DBE_AGG_L2 327 + +/** + * Device memory single bit aggregate (persistent) ECC errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_ECC_SBE_AGG_DEV 328 + +/** + * Device memory double bit aggregate (persistent) ECC errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_ECC_DBE_AGG_DEV 329 + +/** + * Register File single bit aggregate (persistent) ECC errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_ECC_SBE_AGG_REG 330 + +/** + * Register File double bit aggregate (persistent) ECC errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_ECC_DBE_AGG_REG 331 + +/** + * Texture memory single bit aggregate (persistent) ECC errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_ECC_SBE_AGG_TEX 332 + +/** + * Texture memory double bit aggregate (persistent) ECC errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_ECC_DBE_AGG_TEX 333 + +/** + * Historical max available spare memory rows per memory bank + */ +#define DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX 385 + +/** + * Historical high mark of available spare memory rows per memory bank + */ +#define DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH 386 + +/** + * Historical mark of partial available spare memory rows per memory bank + */ +#define DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL 387 + +/** + * Historical low mark of available spare memory rows per memory bank + */ +#define DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW 388 + +/** + * Historical marker of memory banks with no available spare memory rows + */ +#define DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE 389 + +/** + * Number of retired pages because of single bit errors + * Note: monotonically increasing + */ +#define 
DCGM_FI_DEV_RETIRED_SBE 390 + +/** + * Number of retired pages because of double bit errors + * Note: monotonically increasing + */ +#define DCGM_FI_DEV_RETIRED_DBE 391 + +/** + * Number of pages pending retirement + */ +#define DCGM_FI_DEV_RETIRED_PENDING 392 + +/** + * Number of remapped rows for uncorrectable errors + */ +#define DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS 393 + +/** + * Number of remapped rows for correctable errors + */ +#define DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS 394 + +/** + * Whether remapping of rows has failed + */ +#define DCGM_FI_DEV_ROW_REMAP_FAILURE 395 + +/** + * Whether remapping of rows is pending + */ +#define DCGM_FI_DEV_ROW_REMAP_PENDING 396 + +/* + * NV Link flow control CRC Error Counter for Lane 0 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 400 + +/* + * NV Link flow control CRC Error Counter for Lane 1 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 401 + +/* + * NV Link flow control CRC Error Counter for Lane 2 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 402 + +/* + * NV Link flow control CRC Error Counter for Lane 3 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 403 + +/* + * NV Link flow control CRC Error Counter for Lane 4 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 404 + +/* + * NV Link flow control CRC Error Counter for Lane 5 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 405 + +/* + * NV Link flow control CRC Error Counter total for all Lanes + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 409 + +/* + * NV Link data CRC Error Counter for Lane 0 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 410 + +/* + * NV Link data CRC Error Counter for Lane 1 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 411 + +/* + * NV Link data CRC Error Counter for Lane 2 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 412 + +/* + * NV Link data CRC Error Counter for Lane 3 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 413 + 
+/* + * NV Link data CRC Error Counter for Lane 4 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 414 + +/* + * NV Link data CRC Error Counter for Lane 5 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 415 + +/* + * NV Link data CRC Error Counter total for all Lanes + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 419 + +/* + * NV Link Replay Error Counter for Lane 0 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 420 + +/* + * NV Link Replay Error Counter for Lane 1 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 421 + +/* + * NV Link Replay Error Counter for Lane 2 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 422 + +/* + * NV Link Replay Error Counter for Lane 3 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 423 + +/* + * NV Link Replay Error Counter for Lane 4 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 424 + +/* + * NV Link Replay Error Counter for Lane 5 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 425 + +/* + * NV Link Replay Error Counter total for all Lanes + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 429 + +/* + * NV Link Recovery Error Counter for Lane 0 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 430 + +/* + * NV Link Recovery Error Counter for Lane 1 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 431 + +/* + * NV Link Recovery Error Counter for Lane 2 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 432 + +/* + * NV Link Recovery Error Counter for Lane 3 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 433 + +/* + * NV Link Recovery Error Counter for Lane 4 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 434 + +/* + * NV Link Recovery Error Counter for Lane 5 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 435 + +/* + * NV Link Recovery Error Counter total for all Lanes + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 439 + +/* + * NV Link Bandwidth Counter for Lane 0 + */ +#define 
DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 440 + +/* + * NV Link Bandwidth Counter for Lane 1 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 441 + +/* + * NV Link Bandwidth Counter for Lane 2 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 442 + +/* + * NV Link Bandwidth Counter for Lane 3 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 443 + +/* + * NV Link Bandwidth Counter for Lane 4 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 444 + +/* + * NV Link Bandwidth Counter for Lane 5 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 445 + +/* + * NV Link Bandwidth Counter total for all Lanes + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL 449 + +/* + * GPU NVLink error information + */ +#define DCGM_FI_DEV_GPU_NVLINK_ERRORS 450 + +/* + * NV Link flow control CRC Error Counter for Lane 6 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 451 + +/* + * NV Link flow control CRC Error Counter for Lane 7 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 452 + +/* + * NV Link flow control CRC Error Counter for Lane 8 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 453 + +/* + * NV Link flow control CRC Error Counter for Lane 9 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 454 + +/* + * NV Link flow control CRC Error Counter for Lane 10 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 455 + +/* + * NV Link flow control CRC Error Counter for Lane 11 + */ +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 456 + +/* + * NV Link data CRC Error Counter for Lane 6 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 457 + +/* + * NV Link data CRC Error Counter for Lane 7 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 458 + +/* + * NV Link data CRC Error Counter for Lane 8 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 459 + +/* + * NV Link data CRC Error Counter for Lane 9 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 460 + +/* + * NV Link data CRC Error Counter for Lane 10 + */ +#define 
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 461 + +/* + * NV Link data CRC Error Counter for Lane 11 + */ +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 462 + +/* + * NV Link Replay Error Counter for Lane 6 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 463 + +/* + * NV Link Replay Error Counter for Lane 7 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 464 + +/* + * NV Link Replay Error Counter for Lane 8 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 465 + +/* + * NV Link Replay Error Counter for Lane 9 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 466 + +/* + * NV Link Replay Error Counter for Lane 10 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 467 + +/* + * NV Link Replay Error Counter for Lane 11 + */ +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 468 + +/* + * NV Link Recovery Error Counter for Lane 6 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 469 + +/* + * NV Link Recovery Error Counter for Lane 7 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 470 + +/* + * NV Link Recovery Error Counter for Lane 8 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 471 + +/* + * NV Link Recovery Error Counter for Lane 9 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 472 + +/* + * NV Link Recovery Error Counter for Lane 10 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 473 + +/* + * NV Link Recovery Error Counter for Lane 11 + */ +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 474 + +/* + * NV Link Bandwidth Counter for Lane 6 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 475 + +/* + * NV Link Bandwidth Counter for Lane 7 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 476 + +/* + * NV Link Bandwidth Counter for Lane 8 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 477 + +/* + * NV Link Bandwidth Counter for Lane 9 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 478 + +/* + * NV Link Bandwidth Counter for Lane 10 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 479 + +/* + * 
NV Link Bandwidth Counter for Lane 11 + */ +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 480 + +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 406 +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 407 +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 408 +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 481 +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 482 +#define DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 483 + +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 416 +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 417 +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 418 +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 484 +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 485 +#define DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 486 + +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 426 +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 427 +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 428 +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 487 +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 488 +#define DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 489 + +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 436 +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 437 +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 438 +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 491 +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 492 +#define DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 493 + +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 446 +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 447 +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 448 +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 494 +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 495 +#define DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 496 + +/** + * Virtualization Mode corresponding to the GPU. + * + * One of DCGM_GPU_VIRTUALIZATION_MODE_* constants. 
+ */ +#define DCGM_FI_DEV_VIRTUAL_MODE 500 + +/** + * Includes Count and Static info of vGPU types supported on a device + */ +#define DCGM_FI_DEV_SUPPORTED_TYPE_INFO 501 + +/** + * Includes Count and currently Creatable vGPU types on a device + */ +#define DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS 502 + +/** + * Includes Count and currently Active vGPU Instances on a device + */ +#define DCGM_FI_DEV_VGPU_INSTANCE_IDS 503 + +/** + * Utilization values for vGPUs running on the device + */ +#define DCGM_FI_DEV_VGPU_UTILIZATIONS 504 + +/** + * Utilization values for processes running within vGPU VMs using the device + */ +#define DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION 505 + +/** + * Current encoder statistics for a given device + */ +#define DCGM_FI_DEV_ENC_STATS 506 + +/** + * Statistics of current active frame buffer capture sessions on a given device + */ +#define DCGM_FI_DEV_FBC_STATS 507 + +/** + * Information about active frame buffer capture sessions on a target device + */ +#define DCGM_FI_DEV_FBC_SESSIONS_INFO 508 + +/** + * Includes Count and currently Supported vGPU types on a device + */ +#define DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS 509 + +/** + * Includes Static info of vGPU types supported on a device + */ +#define DCGM_FI_DEV_VGPU_TYPE_INFO 510 + +/** + * Includes the name of a vGPU type supported on a device + */ +#define DCGM_FI_DEV_VGPU_TYPE_NAME 511 + +/** + * Includes the class of a vGPU type supported on a device + */ +#define DCGM_FI_DEV_VGPU_TYPE_CLASS 512 + +/** + * Includes the license info for a vGPU type supported on a device + */ +#define DCGM_FI_DEV_VGPU_TYPE_LICENSE 513 + +/** + * VM ID of the vGPU instance + */ +#define DCGM_FI_DEV_VGPU_VM_ID 520 + +/** + * VM name of the vGPU instance + */ +#define DCGM_FI_DEV_VGPU_VM_NAME 521 + +/** + * vGPU type of the vGPU instance + */ +#define DCGM_FI_DEV_VGPU_TYPE 522 + +/** + * UUID of the vGPU instance + */ +#define DCGM_FI_DEV_VGPU_UUID 523 + +/** + * Driver version of the vGPU instance + */ 
+#define DCGM_FI_DEV_VGPU_DRIVER_VERSION 524 + +/** + * Memory usage of the vGPU instance + */ +#define DCGM_FI_DEV_VGPU_MEMORY_USAGE 525 + +/** + * License status of the vGPU + */ +#define DCGM_FI_DEV_VGPU_LICENSE_STATUS 526 + +/** + * Frame rate limit of the vGPU instance + */ +#define DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT 527 + +/** + * Current encoder statistics of the vGPU instance + */ +#define DCGM_FI_DEV_VGPU_ENC_STATS 528 + +/** + * Information about all active encoder sessions on the vGPU instance + */ +#define DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO 529 + +/** + * Statistics of current active frame buffer capture sessions on the vGPU instance + */ +#define DCGM_FI_DEV_VGPU_FBC_STATS 530 + +/** + * Information about active frame buffer capture sessions on the vGPU instance + */ +#define DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO 531 + +/** + * License state information of the vGPU instance + */ +#define DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE 532 + +/** + * PCI Id of the vGPU instance + */ +#define DCGM_FI_DEV_VGPU_PCI_ID 533 + +/** + * GPU Instance ID for the given vGPU Instance + */ +#define DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID 534 + +/** + * Starting field ID of the vGPU instance + */ +#define DCGM_FI_FIRST_VGPU_FIELD_ID 520 + +/** + * Last field ID of the vGPU instance + */ +#define DCGM_FI_LAST_VGPU_FIELD_ID 570 + +/** + * For now max vGPU field Ids taken as difference of DCGM_FI_LAST_VGPU_FIELD_ID and DCGM_FI_LAST_VGPU_FIELD_ID i.e. 50 + */ +#define DCGM_FI_MAX_VGPU_FIELDS DCGM_FI_LAST_VGPU_FIELD_ID - DCGM_FI_FIRST_VGPU_FIELD_ID + +/** + * Starting ID for all the internal fields + */ +#define DCGM_FI_INTERNAL_FIELDS_0_START 600 + +/** + * Last ID for all the internal fields + */ + +/** + *

 

+ *

 

+ *

 

+ *

NVSwitch entity field IDs start here.

+ *

 

+ *

 

+ *

NVSwitch latency bins for port 0

+ */ + +#define DCGM_FI_INTERNAL_FIELDS_0_END 699 + +/** + * Starting field ID of the NVSwitch instance + */ +#define DCGM_FI_FIRST_NVSWITCH_FIELD_ID 700 + +/** + * NvSwitch voltage + */ +#define DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT 701 + +/** + * NvSwitch Current IDDQ + */ +#define DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ 702 + +/** + * NvSwitch Current IDDQ Rev + */ +#define DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV 703 + +/** + * NvSwitch Current IDDQ Rev DVDD + */ +#define DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD 704 + +/** + * NvSwitch Power VDD in watts + */ +#define DCGM_FI_DEV_NVSWITCH_POWER_VDD 705 + +/** + * NvSwitch Power DVDD in watts + */ +#define DCGM_FI_DEV_NVSWITCH_POWER_DVDD 706 + +/** + * NvSwitch Power HVDD in watts + */ +#define DCGM_FI_DEV_NVSWITCH_POWER_HVDD 707 + +/** + *

NVSwitch Tx Throughput Counter for ports 0-17

+ */ +#define DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX 780 +/** + * NVSwitch Rx Throughput Counter for ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX 781 + +/** + * NvSwitch fatal_errors for ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS 782 + +/** + * NvSwitch non_fatal_errors for ports 0-17 + * + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS 783 + +/** + * NvSwitch replay_count_errors for ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS 784 + +/** + * NvSwitch recovery_count_errors for ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS 785 + +/** + * NvSwitch filt_err_count_errors for ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS 786 + +/** + * NvLink lane_crs_err_count_aggregate_errors for ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS 787 + +/** + * NvLink lane ecc_err_count_aggregate_errors for ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS 788 + +/** + * Nvlink lane latency low lane0 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 789 + +/** + * Nvlink lane latency low lane1 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 790 + +/** + * Nvlink lane latency low lane2 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 791 + +/** + * Nvlink lane latency low lane3 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 792 + +/** + * Nvlink lane latency medium lane0 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 793 + +/** + * Nvlink lane latency medium lane1 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 794 + +/** + * Nvlink lane latency medium lane2 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 795 + +/** + * Nvlink lane latency medium lane3 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 796 + +/** + * Nvlink lane latency high lane0 counter. 
+ */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 797 + +/** + * Nvlink lane latency high lane1 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 798 + +/** + * Nvlink lane latency high lane2 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 799 + +/** + * Nvlink lane latency high lane3 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 800 + +/** + * Nvlink lane latency panic lane0 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 801 + +/** + * Nvlink lane latency panic lane1 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 802 + +/** + * Nvlink lane latency panic lane2 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 803 + +/** + * Nvlink lane latency panic lane2 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 804 + +/** + * Nvlink lane latency count lane0 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 805 + +/** + * Nvlink lane latency count lane1 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 806 + +/** + * Nvlink lane latency count lane2 counter. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 807 + +/** + * Nvlink lane latency count lane3 counter. 
+ */ +#define DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 808 + +/** + * NvLink lane crc_err_count for lane 0 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 809 + +/** + * NvLink lane crc_err_count for lane 1 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 810 + +/** + * NvLink lane crc_err_count for lane 2 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 811 + +/** + * NvLink lane crc_err_count for lane 3 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 812 + +/** + * NvLink lane ecc_err_count for lane 0 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 813 + +/** + * NvLink lane ecc_err_count for lane 1 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 814 + +/** + * NvLink lane ecc_err_count for lane 2 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 815 + +/** + * NvLink lane ecc_err_count for lane 3 on ports 0-17 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 816 + +/** + * NVSwitch fatal error information. + * Note: value field indicates the specific SXid reported + */ +#define DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS 856 + +/** + * NVSwitch non fatal error information. + * Note: value field indicates the specific SXid reported + */ +#define DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS 857 + +/** + * NVSwitch current temperature. + */ +#define DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT 858 + +/** + * NVSwitch limit slowdown temperature. + */ +#define DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN 859 + +/** + * NVSwitch limit shutdown temperature. + */ +#define DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN 860 + +/** + * NVSwitch throughput Tx. + */ +#define DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX 861 + +/** + * NVSwitch throughput Rx. + */ +#define DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX 862 + +/* + * NVSwitch Physical ID. + */ +#define DCGM_FI_DEV_NVSWITCH_PHYS_ID 863 + +/** + * NVSwitch reset required. 
+ */ +#define DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED 864 + +/** + * NvSwitch NvLink ID + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_ID 865 + +/** + * NvSwitch PCIE domain + */ +#define DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN 866 + +/** + * NvSwitch PCIE bus + */ +#define DCGM_FI_DEV_NVSWITCH_PCIE_BUS 867 + +/** + * NvSwitch PCIE device + */ +#define DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE 868 + +/** + * NvSwitch PCIE function + */ +#define DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION 869 + +/** + * NvLink status. UNKNOWN:-1 OFF:0 SAFE:1 ACTIVE:2 ERROR:3 + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_STATUS 870 + +/** + * NvLink device type (GPU/Switch). + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_TYPE 871 + +/** + * NvLink device pcie domain. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN 872 + +/** + * NvLink device pcie bus. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS 873 + +/** + * NvLink device pcie device. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE 874 +/** + * NvLink device pcie function. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION 875 + +/** + * NvLink device link ID + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID 876 + +/** + * NvLink device SID. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID 877 + +/** + * NvLink device link uid. + */ +#define DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID 878 + +/** + * Last field ID of the NVSwitch instance + */ +#define DCGM_FI_LAST_NVSWITCH_FIELD_ID 899 + +/** + * For now max NVSwitch field Ids taken as difference of DCGM_FI_LAST_NVSWITCH_FIELD_ID and + * DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1 i.e. 200 + */ +#define DCGM_FI_MAX_NVSWITCH_FIELDS DCGM_FI_LAST_NVSWITCH_FIELD_ID - DCGM_FI_FIRST_NVSWITCH_FIELD_ID + 1 + +/** + * Profiling Fields. These all start with DCGM_FI_PROF_* + */ + +/** + * Ratio of time the graphics engine is active. The graphics engine is + * active if a graphics/compute context is bound and the graphics pipe or + * compute pipe is busy. 
+ */ +#define DCGM_FI_PROF_GR_ENGINE_ACTIVE 1001 + +/** + * The ratio of cycles an SM has at least 1 warp assigned + * (computed from the number of cycles and elapsed cycles) + */ +#define DCGM_FI_PROF_SM_ACTIVE 1002 + +/** + * The ratio of number of warps resident on an SM. + * (number of resident as a ratio of the theoretical + * maximum number of warps per elapsed cycle) + */ +#define DCGM_FI_PROF_SM_OCCUPANCY 1003 + +/** + * The ratio of cycles the any tensor pipe is active + * (off the peak sustained elapsed cycles) + */ +#define DCGM_FI_PROF_PIPE_TENSOR_ACTIVE 1004 + +/** + * The ratio of cycles the device memory interface is + * active sending or receiving data. + */ +#define DCGM_FI_PROF_DRAM_ACTIVE 1005 + +/** + * Ratio of cycles the fp64 pipe is active. + */ +#define DCGM_FI_PROF_PIPE_FP64_ACTIVE 1006 + +/** + * Ratio of cycles the fp32 pipe is active. + */ +#define DCGM_FI_PROF_PIPE_FP32_ACTIVE 1007 + +/** + * Ratio of cycles the fp16 pipe is active. This does not include HMMA. + */ +#define DCGM_FI_PROF_PIPE_FP16_ACTIVE 1008 + +/** + * The number of bytes of active PCIe tx (transmit) data including both header and payload. + * + * Note that this is from the perspective of the GPU, so copying data from device to host (DtoH) + * would be reflected in this metric. + */ +#define DCGM_FI_PROF_PCIE_TX_BYTES 1009 + +/** + * The number of bytes of active PCIe rx (read) data including both header and payload. + * + * Note that this is from the perspective of the GPU, so copying data from host to device (HtoD) + * would be reflected in this metric. + */ +#define DCGM_FI_PROF_PCIE_RX_BYTES 1010 + +/** + * The total number of bytes of active NvLink tx (transmit) data including both header and payload. + * Per-link fields are available below + */ +#define DCGM_FI_PROF_NVLINK_TX_BYTES 1011 + +/** + * The total number of bytes of active NvLink rx (read) data including both header and payload. 
+ * Per-link fields are available below + */ +#define DCGM_FI_PROF_NVLINK_RX_BYTES 1012 + +/** + * The ratio of cycles the tensor (IMMA) pipe is active (off the peak sustained elapsed cycles) + */ +#define DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE 1013 + +/** + * The ratio of cycles the tensor (HMMA) pipe is active (off the peak sustained elapsed cycles) + */ +#define DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE 1014 + +/** + * The ratio of cycles the tensor (DFMA) pipe is active (off the peak sustained elapsed cycles) + */ +#define DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE 1015 + +/** + * Ratio of cycles the integer pipe is active. + */ +#define DCGM_FI_PROF_PIPE_INT_ACTIVE 1016 + +/** + * Ratio of cycles each of the NVDEC engines are active. + */ +#define DCGM_FI_PROF_NVDEC0_ACTIVE 1017 +#define DCGM_FI_PROF_NVDEC1_ACTIVE 1018 +#define DCGM_FI_PROF_NVDEC2_ACTIVE 1019 +#define DCGM_FI_PROF_NVDEC3_ACTIVE 1020 +#define DCGM_FI_PROF_NVDEC4_ACTIVE 1021 +#define DCGM_FI_PROF_NVDEC5_ACTIVE 1022 +#define DCGM_FI_PROF_NVDEC6_ACTIVE 1023 +#define DCGM_FI_PROF_NVDEC7_ACTIVE 1024 + +/** + * Ratio of cycles each of the NVJPG engines are active. + */ +#define DCGM_FI_PROF_NVJPG0_ACTIVE 1025 +#define DCGM_FI_PROF_NVJPG1_ACTIVE 1026 +#define DCGM_FI_PROF_NVJPG2_ACTIVE 1027 +#define DCGM_FI_PROF_NVJPG3_ACTIVE 1028 +#define DCGM_FI_PROF_NVJPG4_ACTIVE 1029 +#define DCGM_FI_PROF_NVJPG5_ACTIVE 1030 +#define DCGM_FI_PROF_NVJPG6_ACTIVE 1031 +#define DCGM_FI_PROF_NVJPG7_ACTIVE 1032 + +/** + * Ratio of cycles each of the NVOFA engines are active. + */ +#define DCGM_FI_PROF_NVOFA0_ACTIVE 1033 + +/** + * The per-link number of bytes of active NvLink TX (transmit) or RX (transmit) data including both header and payload. 
+ * For example: DCGM_FI_PROF_NVLINK_L0_TX_BYTES -> L0 TX + * To get the bandwidth for a link, add the RX and TX value together like + * total = DCGM_FI_PROF_NVLINK_L0_TX_BYTES + DCGM_FI_PROF_NVLINK_L0_RX_BYTES + */ +#define DCGM_FI_PROF_NVLINK_L0_TX_BYTES 1040 +#define DCGM_FI_PROF_NVLINK_L0_RX_BYTES 1041 +#define DCGM_FI_PROF_NVLINK_L1_TX_BYTES 1042 +#define DCGM_FI_PROF_NVLINK_L1_RX_BYTES 1043 +#define DCGM_FI_PROF_NVLINK_L2_TX_BYTES 1044 +#define DCGM_FI_PROF_NVLINK_L2_RX_BYTES 1045 +#define DCGM_FI_PROF_NVLINK_L3_TX_BYTES 1046 +#define DCGM_FI_PROF_NVLINK_L3_RX_BYTES 1047 +#define DCGM_FI_PROF_NVLINK_L4_TX_BYTES 1048 +#define DCGM_FI_PROF_NVLINK_L4_RX_BYTES 1049 +#define DCGM_FI_PROF_NVLINK_L5_TX_BYTES 1050 +#define DCGM_FI_PROF_NVLINK_L5_RX_BYTES 1051 +#define DCGM_FI_PROF_NVLINK_L6_TX_BYTES 1052 +#define DCGM_FI_PROF_NVLINK_L6_RX_BYTES 1053 +#define DCGM_FI_PROF_NVLINK_L7_TX_BYTES 1054 +#define DCGM_FI_PROF_NVLINK_L7_RX_BYTES 1055 +#define DCGM_FI_PROF_NVLINK_L8_TX_BYTES 1056 +#define DCGM_FI_PROF_NVLINK_L8_RX_BYTES 1057 +#define DCGM_FI_PROF_NVLINK_L9_TX_BYTES 1058 +#define DCGM_FI_PROF_NVLINK_L9_RX_BYTES 1059 +#define DCGM_FI_PROF_NVLINK_L10_TX_BYTES 1060 +#define DCGM_FI_PROF_NVLINK_L10_RX_BYTES 1061 +#define DCGM_FI_PROF_NVLINK_L11_TX_BYTES 1062 +#define DCGM_FI_PROF_NVLINK_L11_RX_BYTES 1063 +#define DCGM_FI_PROF_NVLINK_L12_TX_BYTES 1064 +#define DCGM_FI_PROF_NVLINK_L12_RX_BYTES 1065 +#define DCGM_FI_PROF_NVLINK_L13_TX_BYTES 1066 +#define DCGM_FI_PROF_NVLINK_L13_RX_BYTES 1067 +#define DCGM_FI_PROF_NVLINK_L14_TX_BYTES 1068 +#define DCGM_FI_PROF_NVLINK_L14_RX_BYTES 1069 +#define DCGM_FI_PROF_NVLINK_L15_TX_BYTES 1070 +#define DCGM_FI_PROF_NVLINK_L15_RX_BYTES 1071 +#define DCGM_FI_PROF_NVLINK_L16_TX_BYTES 1072 +#define DCGM_FI_PROF_NVLINK_L16_RX_BYTES 1073 +#define DCGM_FI_PROF_NVLINK_L17_TX_BYTES 1074 +#define DCGM_FI_PROF_NVLINK_L17_RX_BYTES 1075 + +/** + * NVLink throughput First. 
+ */ +#define DCGM_FI_PROF_NVLINK_THROUGHPUT_FIRST DCGM_FI_PROF_NVLINK_L0_TX_BYTES + +/** + * NVLink throughput Last. + */ +#define DCGM_FI_PROF_NVLINK_THROUGHPUT_LAST DCGM_FI_PROF_NVLINK_L17_RX_BYTES + +/** + * CPU Utilization, total + */ +#define DCGM_FI_DEV_CPU_UTIL_TOTAL 1100 + +/** + * CPU Utilization, user + */ +#define DCGM_FI_DEV_CPU_UTIL_USER 1101 + +/** + * CPU Utilization, nice + */ +#define DCGM_FI_DEV_CPU_UTIL_NICE 1102 + +/** + * CPU Utilization, system time + */ +#define DCGM_FI_DEV_CPU_UTIL_SYS 1103 + +/** + * CPU Utilization, interrupt servicing + */ +#define DCGM_FI_DEV_CPU_UTIL_IRQ 1104 + +/** + * CPU temperature + */ +#define DCGM_FI_DEV_CPU_TEMP_CURRENT 1110 + +/** + * CPU Warning Temperature + */ +#define DCGM_FI_DEV_CPU_TEMP_WARNING 1111 + +/** + * CPU Critical Temperature + */ +#define DCGM_FI_DEV_CPU_TEMP_CRITICAL 1112 + +/** + * CPU instantaneous clock speed + */ +#define DCGM_FI_DEV_CPU_CLOCK_CURRENT 1120 + +/** + * CPU power utilization + */ +#define DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT 1130 + +/** + * CPU power limit + */ +#define DCGM_FI_DEV_CPU_POWER_LIMIT 1131 + +/** + * CPU vendor name + */ +#define DCGM_FI_DEV_CPU_VENDOR 1140 + +/** + * CPU model name + */ +#define DCGM_FI_DEV_CPU_MODEL 1141 + +/** + * 1 greater than maximum fields above. This is the 1 greater than the maximum field id that could be allocated + */ +#define DCGM_FI_MAX_FIELDS 1142 + + +/** @} */ + +/*****************************************************************************/ + +/** + * Structure for formating the output for dmon. + * Used as a member in dcgm_field_meta_p + */ +typedef struct +{ + char shortName[10]; /*!< Short name corresponding to field. This short name is used to identify columns in dmon + output.*/ + char unit[4]; /*!< The unit of value. 
Eg: C(elsius), W(att), MB/s*/ + short width; /*!< Maximum width/number of digits that a value for field can have.*/ +} dcgm_field_output_format_t, *dcgm_field_output_format_p; + +/** + * Structure to store meta data for the field + */ + +typedef struct +{ + unsigned short fieldId; /*!< Field identifier. DCGM_FI_? #define */ + char fieldType; /*!< Field type. DCGM_FT_? #define */ + unsigned char size; /*!< field size in bytes (raw value size). 0=variable (like DCGM_FT_STRING) */ + char tag[48]; /*!< Tag for this field for serialization like 'device_temperature' */ + int scope; /*!< Field scope. DCGM_FS_? #define of this field's association */ + int nvmlFieldId; /*!< Optional NVML field this DCGM field maps to. 0 = no mapping. + Otherwise, this should be a NVML_FI_? #define from nvml.h */ + dcgm_field_entity_group_t + entityLevel; /*!< Field entity level. DCGM_FE_? specifying at what level the field is queryable */ + + dcgm_field_output_format_p valueFormat; /*!< pointer to the structure that holds the formatting the + values for fields */ +} dcgm_field_meta_t; + +typedef const dcgm_field_meta_t *dcgm_field_meta_p; + +/***************************************************************************************************/ +/** @addtogroup dcgmFieldIdentifiers + * @{ + */ +/***************************************************************************************************/ + +/** + * Get a pointer to the metadata for a field by its field ID. See DCGM_FI_? for a list of field IDs. + * + * @param fieldId IN: One of the field IDs (DCGM_FI_?) + * + * @return + * 0 On Failure + * >0 Pointer to field metadata structure if found. + * + */ +dcgm_field_meta_p DCGM_PUBLIC_API DcgmFieldGetById(unsigned short fieldId); + +/** + * Get a pointer to the metadata for a field by its field tag. 
+ * + * @param tag IN: Tag for the field of interest + * + * @return + * 0 On failure or not found + * >0 Pointer to field metadata structure if found + * + */ +dcgm_field_meta_p DCGM_PUBLIC_API DcgmFieldGetByTag(const char *tag); + +/** + * Initialize the DcgmFields module. Call this once from inside + * your program + * + * @return + * 0 On success + * <0 On error + * + */ +int DCGM_PUBLIC_API DcgmFieldsInit(void); + +/** + * Terminates the DcgmFields module. Call this once from inside your program + * + * @return + * 0 On success + * <0 On error + * + */ +int DCGM_PUBLIC_API DcgmFieldsTerm(void); + +/** + * Get the string version of a entityGroupId + * + * @returns + * - Pointer to a string like GPU/NvSwitch..etc + * - Null on error + * + */ +DCGM_PUBLIC_API const char *DcgmFieldsGetEntityGroupString(dcgm_field_entity_group_t entityGroupId); + +/** @} */ + + +#ifdef __cplusplus +} +#endif + + +#endif // DCGMFIELDS_H diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_nvml.h b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_nvml.h new file mode 100644 index 0000000000..02b24e84af --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_nvml.h @@ -0,0 +1,4 @@ +#pragma once + +#define NVML_NO_UNVERSIONED_FUNC_DEFS +#include "nvml.h" diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_structs.h b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_structs.h new file mode 100644 index 0000000000..74bee6d2f5 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_structs.h @@ -0,0 +1,3261 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * File: dcgm_structs.h + */ + +#ifndef DCGM_STRUCTS_H +#define DCGM_STRUCTS_H + +#include "dcgm_fields.h" +#include +#include + + +/***************************************************************************************************/ +/** @defgroup dcgmReturnEnums Enums and Macros + * @{ + */ +/***************************************************************************************************/ + +/** + * Creates a unique version number for each struct + */ +#define MAKE_DCGM_VERSION(typeName, ver) (unsigned int)(sizeof(typeName) | ((unsigned long)(ver) << 24U)) + +/** + * Represents value of the field which can be returned by Host Engine in case the + * operation is not successful + */ +#ifndef DCGM_BLANK_VALUES +#define DCGM_BLANK_VALUES + +/** + * Base value for 32 bits integer blank. can be used as an unspecified blank + */ +#define DCGM_INT32_BLANK 0x7ffffff0 + +/** + * Base value for 64 bits integer blank. can be used as an unspecified blank + */ +#define DCGM_INT64_BLANK 0x7ffffffffffffff0 + +/** + * Base value for double blank. 2 ** 47. FP 64 has 52 bits of mantissa, + * so 47 bits can still increment by 1 and represent each value from 0-15 + */ +#define DCGM_FP64_BLANK 140737488355328.0 + +/** + * Base value for string blank. 
+ */ +#define DCGM_STR_BLANK "<<>>" + +/** + * Represents an error where INT32 data was not found + */ +#define DCGM_INT32_NOT_FOUND (DCGM_INT32_BLANK + 1) + +/** + * Represents an error where INT64 data was not found + */ +#define DCGM_INT64_NOT_FOUND (DCGM_INT64_BLANK + 1) + +/** + * Represents an error where FP64 data was not found + */ +#define DCGM_FP64_NOT_FOUND (DCGM_FP64_BLANK + 1.0) + +/** + * Represents an error where STR data was not found + */ +#define DCGM_STR_NOT_FOUND "<<>>" + +/** + * Represents an error where fetching the INT32 value is not supported + */ +#define DCGM_INT32_NOT_SUPPORTED (DCGM_INT32_BLANK + 2) + +/** + * Represents an error where fetching the INT64 value is not supported + */ +#define DCGM_INT64_NOT_SUPPORTED (DCGM_INT64_BLANK + 2) + +/** + * Represents an error where fetching the FP64 value is not supported + */ +#define DCGM_FP64_NOT_SUPPORTED (DCGM_FP64_BLANK + 2.0) + +/** + * Represents an error where fetching the STR value is not supported + */ +#define DCGM_STR_NOT_SUPPORTED "<<>>" + +/** + * Represents and error where fetching the INT32 value is not allowed with our current credentials + */ +#define DCGM_INT32_NOT_PERMISSIONED (DCGM_INT32_BLANK + 3) + +/** + * Represents and error where fetching the INT64 value is not allowed with our current credentials + */ +#define DCGM_INT64_NOT_PERMISSIONED (DCGM_INT64_BLANK + 3) + +/** + * Represents and error where fetching the FP64 value is not allowed with our current credentials + */ +#define DCGM_FP64_NOT_PERMISSIONED (DCGM_FP64_BLANK + 3.0) + +/** + * Represents and error where fetching the STR value is not allowed with our current credentials + */ +#define DCGM_STR_NOT_PERMISSIONED "<<>>" + +/** + * Macro to check if a INT32 value is blank or not + */ +#define DCGM_INT32_IS_BLANK(val) (((val) >= DCGM_INT32_BLANK) ? 1 : 0) + +/** + * Macro to check if a INT64 value is blank or not + */ +#define DCGM_INT64_IS_BLANK(val) (((val) >= DCGM_INT64_BLANK) ? 
1 : 0) + +/** + * Macro to check if a FP64 value is blank or not + */ +#define DCGM_FP64_IS_BLANK(val) (((val) >= DCGM_FP64_BLANK ? 1 : 0)) + +/** + * Macro to check if a STR value is blank or not + * Works on (char *). Looks for <<< at first position and >>> inside string + */ +#define DCGM_STR_IS_BLANK(val) (val == strstr(val, "<<<") && strstr(val, ">>>")) + +#endif // DCGM_BLANK_VALUES + +/** + * Max number of GPUs supported by DCGM + */ +#define DCGM_MAX_NUM_DEVICES 32 /* DCGM 2.0 and newer = 32. DCGM 1.8 and older = 16. */ + +/** + * Number of NvLink links per GPU supported by DCGM + * 18 for Hopper, 12 for Ampere, 6 for Volta, and 4 for Pascal + */ +#define DCGM_NVLINK_MAX_LINKS_PER_GPU 18 + +/** + * Number of nvlink errors supported by DCGM + * @see NVML_NVLINK_ERROR_COUNT + * + * NVML_NVLINK_ERROR_DL_ECC_DATA not currently supported + */ +#define DCGM_NVLINK_ERROR_COUNT 4 + +/** + * Number of nvlink error types: @see NVML_NVLINK_ERROR_COUNT + * TODO: update with refactor of ampere-next nvlink APIs (JIRA DCGM-2628) + */ +#define DCGM_HEALTH_WATCH_NVLINK_ERROR_NUM_FIELDS 4 + +/** + * Maximum NvLink links pre-Ampere + */ +#define DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 6 + +/** + * Maximum NvLink links pre-Hopper + */ +#define DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2 12 + +/** + * Max number of NvSwitches supported by DCGM + **/ +#define DCGM_MAX_NUM_SWITCHES 12 + +/** + * Number of NvLink links per NvSwitch supported by DCGM + */ +#define DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH 64 + +/** + * Number of Lines per NvSwitch NvLink supported by DCGM + */ +#define DCGM_LANE_MAX_LANES_PER_NVSWICH_LINK 4 + +/** + * Maximum number of vGPU instances per physical GPU + */ +#define DCGM_MAX_VGPU_INSTANCES_PER_PGPU 32 + +/** + * Max number of CPU nodes + **/ +#define DCGM_MAX_NUM_CPUS 8 + +/** + * Max number of CPUs + **/ +#define DCGM_MAX_NUM_CPU_CORES 1024 + +/** + * Max length of the DCGM string field + */ +#define DCGM_MAX_STR_LENGTH 256 + +/** + * Default maximum age of 
samples kept (usec) + */ +#define DCGM_MAX_AGE_USEC_DEFAULT 30000000 + +/** + * Max number of clocks supported for a device + */ +#define DCGM_MAX_CLOCKS 256 + +/** + * Max limit on the number of groups supported by DCGM + */ +#define DCGM_MAX_NUM_GROUPS 64 + +/** + * Max number of active FBC sessions + */ +#define DCGM_MAX_FBC_SESSIONS 256 + +/** + * Represents the size of a buffer that holds a vGPU type Name or vGPU class type or name of process running on vGPU + * instance. + */ +#define DCGM_VGPU_NAME_BUFFER_SIZE 64 + +/** + * Represents the size of a buffer that holds a vGPU license string + */ +#define DCGM_GRID_LICENSE_BUFFER_SIZE 128 + +/** + * Default compute mode -- multiple contexts per device + */ +#define DCGM_CONFIG_COMPUTEMODE_DEFAULT 0 + +/** + * Compute-prohibited mode -- no contexts per device + */ +#define DCGM_CONFIG_COMPUTEMODE_PROHIBITED 1 + +/** + * Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time + */ +#define DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS 2 + +/** + * Default Port Number for DCGM Host Engine + */ +#define DCGM_HE_PORT_NUMBER 5555 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Operation mode for DCGM + * + * DCGM can run in auto-mode where it runs additional threads in the background to collect + * any metrics of interest and auto manages any operations needed for policy management. + * + * DCGM can also operate in manual-mode where it's execution is controlled by the user. In + * this mode, the user has to periodically call APIs such as \ref dcgmPolicyTrigger and + * \ref dcgmUpdateAllFields which tells DCGM to wake up and perform data collection and + * operations needed for policy management. + */ +typedef enum dcgmOperationMode_enum +{ + DCGM_OPERATION_MODE_AUTO = 1, + DCGM_OPERATION_MODE_MANUAL = 2 +} dcgmOperationMode_t; + +/** + * When more than one value is returned from a query, which order should it be returned in? 
+ */ +typedef enum dcgmOrder_enum +{ + DCGM_ORDER_ASCENDING = 1, //!< Data with earliest (lowest) timestamps returned first + DCGM_ORDER_DESCENDING = 2 //!< Data with latest (highest) timestamps returned first +} dcgmOrder_t; + +/** + * Return values for DCGM API calls. + */ +typedef enum dcgmReturn_enum +{ + DCGM_ST_OK = 0, //!< Success + DCGM_ST_BADPARAM = -1, //!< A bad parameter was passed to a function + DCGM_ST_GENERIC_ERROR = -3, //!< A generic, unspecified error + DCGM_ST_MEMORY = -4, //!< An out of memory error occurred + DCGM_ST_NOT_CONFIGURED = -5, //!< Setting not configured + DCGM_ST_NOT_SUPPORTED = -6, //!< Feature not supported + DCGM_ST_INIT_ERROR = -7, //!< DCGM Init error + DCGM_ST_NVML_ERROR = -8, //!< When NVML returns error + DCGM_ST_PENDING = -9, //!< Object is in pending state of something else + DCGM_ST_UNINITIALIZED = -10, //!< Object is in undefined state + DCGM_ST_TIMEOUT = -11, //!< Requested operation timed out + DCGM_ST_VER_MISMATCH = -12, //!< Version mismatch between received and understood API + DCGM_ST_UNKNOWN_FIELD = -13, //!< Unknown field id + DCGM_ST_NO_DATA = -14, //!< No data is available + DCGM_ST_STALE_DATA = -15, //!< Data is considered stale + DCGM_ST_NOT_WATCHED = -16, //!< The given field id is not being updated by the cache manager + DCGM_ST_NO_PERMISSION = -17, //!< Do not have permission to perform the desired action + DCGM_ST_GPU_IS_LOST = -18, //!< GPU is no longer reachable + DCGM_ST_RESET_REQUIRED = -19, //!< GPU requires a reset + DCGM_ST_FUNCTION_NOT_FOUND = -20, //!< The function that was requested was not found (bindings only error) + DCGM_ST_CONNECTION_NOT_VALID = -21, //!< The connection to the host engine is not valid any longer + DCGM_ST_GPU_NOT_SUPPORTED = -22, //!< This GPU is not supported by DCGM + DCGM_ST_GROUP_INCOMPATIBLE = -23, //!< The GPUs of the provided group are not compatible with each other for the + //!< requested operation + DCGM_ST_MAX_LIMIT = -24, //!< Max limit reached for the object + 
DCGM_ST_LIBRARY_NOT_FOUND = -25, //!< DCGM library could not be found + DCGM_ST_DUPLICATE_KEY = -26, //!< Duplicate key passed to a function + DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27, //!< GPU is already a part of a sync boost group + DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28, //!< GPU is not a part of a sync boost group + DCGM_ST_REQUIRES_ROOT = -29, //!< This operation cannot be performed when the host engine is running as non-root + DCGM_ST_NVVS_ERROR = -30, //!< DCGM GPU Diagnostic was successfully executed, but reported an error. + DCGM_ST_INSUFFICIENT_SIZE = -31, //!< An input argument is not large enough + DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32, //!< The given field ID is not supported by the API being called + DCGM_ST_MODULE_NOT_LOADED = -33, //!< This request is serviced by a module of DCGM that is not currently loaded + DCGM_ST_IN_USE = -34, //!< The requested operation could not be completed because the affected + //!< resource is in use + DCGM_ST_GROUP_IS_EMPTY = -35, //!< This group is empty and the requested operation is not valid on an empty group + DCGM_ST_PROFILING_NOT_SUPPORTED = -36, //!< Profiling is not supported for this group of GPUs or GPU. + DCGM_ST_PROFILING_LIBRARY_ERROR = -37, //!< The third-party Profiling module returned an unrecoverable error. + DCGM_ST_PROFILING_MULTI_PASS = -38, //!< The requested profiling metrics cannot be collected in a single pass + DCGM_ST_DIAG_ALREADY_RUNNING = -39, //!< A diag instance is already running, cannot run a new diag until + //!< the current one finishes. + DCGM_ST_DIAG_BAD_JSON = -40, //!< The DCGM GPU Diagnostic returned JSON that cannot be parsed + DCGM_ST_DIAG_BAD_LAUNCH = -41, //!< Error while launching the DCGM GPU Diagnostic + DCGM_ST_DIAG_UNUSED = -42, //!< Unused + DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43, //!< A field value met or exceeded the error threshold. 
+ DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44, //!< The installed driver version is insufficient for this API + DCGM_ST_INSTANCE_NOT_FOUND = -45, //!< The specified GPU instance does not exist + DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46, //!< The specified GPU compute instance does not exist + DCGM_ST_CHILD_NOT_KILLED = -47, //!< Couldn't kill a child process within the retries + DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48, //!< Detected an error in a 3rd-party library + DCGM_ST_INSUFFICIENT_RESOURCES = -49, //!< Not enough resources available + DCGM_ST_PLUGIN_EXCEPTION = -50, //!< Exception thrown from a diagnostic plugin + DCGM_ST_NVVS_ISOLATE_ERROR = -51, //!< The diagnostic returned an error that indicates the need for isolation + DCGM_ST_NVVS_BINARY_NOT_FOUND = -52, //!< The NVVS binary was not found in the specified location + DCGM_ST_NVVS_KILLED = -53, //!< The NVVS process was killed by a signal + DCGM_ST_PAUSED = -54, //!< The hostengine and all modules are paused + DCGM_ST_ALREADY_INITIALIZED = -55, //!< The object is already initialized +} dcgmReturn_t; + +const char *errorString(dcgmReturn_t result); + +/** + * Type of GPU groups + */ +typedef enum dcgmGroupType_enum +{ + DCGM_GROUP_DEFAULT = 0, //!< All the GPUs on the node are added to the group + DCGM_GROUP_EMPTY = 1, //!< Creates an empty group + DCGM_GROUP_DEFAULT_NVSWITCHES = 2, //!< All NvSwitches of the node are added to the group + DCGM_GROUP_DEFAULT_INSTANCES = 3, //!< All GPU instances of the node are added to the group + DCGM_GROUP_DEFAULT_COMPUTE_INSTANCES = 4, //!< All compute instances of the node are added to the group + DCGM_GROUP_DEFAULT_EVERYTHING = 5, //!< All entities are added to this default group +} dcgmGroupType_t; + +/** + * Identifies for special DCGM groups + */ +#define DCGM_GROUP_ALL_GPUS 0x7fffffff +#define DCGM_GROUP_ALL_NVSWITCHES 0x7ffffffe +#define DCGM_GROUP_ALL_INSTANCES 0x7ffffffd +#define DCGM_GROUP_ALL_COMPUTE_INSTANCES 0x7ffffffc +#define DCGM_GROUP_ALL_ENTITIES 
0x7ffffffb + +/** + * Maximum number of entities per entity group + */ +#define DCGM_GROUP_MAX_ENTITIES 64 + +/** + * Simplified chip architecture. Note that these are made to match nvmlChipArchitecture_t and thus + * do not start at 0. + */ +typedef enum dcgmChipArchitecture_enum +{ + DCGM_CHIP_ARCH_OLDER = 1, //!< All GPUs older than Kepler + DCGM_CHIP_ARCH_KEPLER = 2, //!< All Kepler-architecture parts + DCGM_CHIP_ARCH_MAXWELL = 3, //!< All Maxwell-architecture parts + DCGM_CHIP_ARCH_PASCAL = 4, //!< All Pascal-architecture parts + DCGM_CHIP_ARCH_VOLTA = 5, //!< All Volta-architecture parts + DCGM_CHIP_ARCH_TURING = 6, //!< All Turing-architecture parts + DCGM_CHIP_ARCH_AMPERE = 7, //!< All Ampere-architecture parts + DCGM_CHIP_ARCH_ADA = 8, //!< All Ada-architecture parts + DCGM_CHIP_ARCH_HOPPER = 9, //!< All Hopper-architecture parts + + DCGM_CHIP_ARCH_COUNT, //!< Keep this second to last, exclude unknown + + DCGM_CHIP_ARCH_UNKNOWN = 0xffffffff //!< Anything else, presumably something newer +} dcgmChipArchitecture_t; + +/** + * Represents the type of configuration to be fetched from the GPUs + */ +typedef enum dcgmConfigType_enum +{ + DCGM_CONFIG_TARGET_STATE = 0, //!< The target configuration values to be applied + DCGM_CONFIG_CURRENT_STATE = 1, //!< The current configuration state +} dcgmConfigType_t; + +/** + * Represents the power cap for each member of the group. 
+ */ +typedef enum dcgmConfigPowerLimitType_enum +{ + DCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0, //!< Represents the power cap to be applied for each member of the group + DCGM_CONFIG_POWER_BUDGET_GROUP = 1, //!< Represents the power budget for the entire group +} dcgmConfigPowerLimitType_t; + +/** @} */ + + +/***************************************************************************************************/ +/** @defgroup dcgmStructs Structure definitions + * @{ + */ +/***************************************************************************************************/ +typedef uintptr_t dcgmHandle_t; //!< Identifier for DCGM Handle +typedef uintptr_t dcgmGpuGrp_t; //!< Identifier for a group of GPUs. A group can have one or more GPUs +typedef uintptr_t dcgmFieldGrp_t; //!< Identifier for a group of fields. +typedef uintptr_t dcgmStatus_t; //!< Identifier for list of status codes + +/** + * DCGM Logging Severities. These match up with plog severities defined in Severity.h + * Each level includes all of the levels above it. For instance, level 4 includes 3,2, and 1 as well + */ +typedef enum +{ + DcgmLoggingSeverityUnspecified = -1, /*!< Don't care/inherit from the environment */ + DcgmLoggingSeverityNone = 0, /*!< No logging */ + DcgmLoggingSeverityFatal = 1, /*!< Fatal Errors */ + DcgmLoggingSeverityError = 2, /*!< Errors */ + DcgmLoggingSeverityWarning = 3, /*!< Warnings */ + DcgmLoggingSeverityInfo = 4, /*!< Informative */ + DcgmLoggingSeverityDebug = 5, /*!< Debug information (will generate large logs) */ + DcgmLoggingSeverityVerbose = 6 /*!< Verbose debugging information */ +} DcgmLoggingSeverity_t; + +/** + * Represents a link object. type should be one of DCGM_FE_GPU or + * DCGM_FE_SWITCH; gpuId or switchID is the associated gpu or switch; and index + * is the link index, 0-based, with TX (even) coming before RX (odd). 
+ */ +#pragma pack(push, 1) +typedef struct dcgm_link_s +{ + union + { + struct + { + dcgm_field_entity_group_t type : 8; /*!< Entity Group */ + uint8_t index : 8; /*!< Link Index Tx before Rx */ + union + { + dcgm_field_eid_t gpuId : 16; /*!< Physical GPU ID */ + dcgm_field_eid_t switchId : 16; /*!< Physical Switch ID */ + }; + } parsed; /*!< Broken out Link identifier GPU/SW:[GPU|SW]:Index */ + dcgm_field_eid_t raw; /*!< Raw Link ID */ + }; +} dcgm_link_t; +#pragma pack(pop) + +/** + * Connection options for dcgmConnect_v2 (v1) + * + * NOTE: This version is deprecated. use dcgmConnectV2Params_v2 + */ +typedef struct +{ + unsigned int version; /*!< Version number. Use dcgmConnectV2Params_version */ + unsigned int persistAfterDisconnect; /*!< Whether to persist DCGM state modified by this connection + once the connection is terminated. Normally, all field + watches created by a connection are removed once a + connection goes away. + 1 = do not clean up after this connection. + 0 = clean up after this connection */ +} dcgmConnectV2Params_v1; + +/** + * Version 1 for \ref dcgmConnectV2Params_v1 + */ +#define dcgmConnectV2Params_version1 MAKE_DCGM_VERSION(dcgmConnectV2Params_v1, 1) + +/** + * Connection options for dcgmConnect_v2 (v2) + */ +typedef struct +{ + unsigned int version; /*!< Version number. Use dcgmConnectV2Params_version */ + unsigned int persistAfterDisconnect; /*!< Whether to persist DCGM state modified by this connection once the + connection is terminated. Normally, all field watches created by a + connection are removed once a connection goes away. 1 = do not clean up + after this connection. 
0 = clean up after this connection */ + unsigned int timeoutMs; /*!< When attempting to connect to the specified host engine, how long should + we wait in milliseconds before giving up */ + unsigned int addressIsUnixSocket; /*!< Whether or not the passed-in address is a unix socket filename (1) or a + TCP/IP address (0) */ +} dcgmConnectV2Params_v2; + +/** + * Typedef for \ref dcgmConnectV2Params_v2 + */ +typedef dcgmConnectV2Params_v2 dcgmConnectV2Params_t; + +/** + * Version 2 for \ref dcgmConnectV2Params_v2 + */ +#define dcgmConnectV2Params_version2 MAKE_DCGM_VERSION(dcgmConnectV2Params_v2, 2) + +/** + * Latest version for \ref dcgmConnectV2Params_t + */ +#define dcgmConnectV2Params_version dcgmConnectV2Params_version2 + +/** + * Typedef for \ref dcgmHostengineHealth_v1 + */ +typedef struct +{ + unsigned int version; //!< The version of this request + unsigned int overallHealth; //!< 0 to indicate healthy, or a code to indicate the error + // For now, this will always be populated with 0 if the + // hostengine can respond. In the future this will be + // updated to have other options like NVML unresponsive, + // no GPUs on system, etc. 
+} dcgmHostengineHealth_v1; + +/** + * Typedef for \ref dcgmHostengineHealth_t + */ +typedef dcgmHostengineHealth_v1 dcgmHostengineHealth_t; + +#define dcgmHostengineHealth_version1 MAKE_DCGM_VERSION(dcgmHostengineHealth_v1, 1) + +/** + * Latest version for \ref dcgmHostengineHealth_t + */ +#define dcgmHostengineHealth_version dcgmHostengineHealth_version1 + +/** + * Represents a entityGroupId + entityId pair to uniquely identify a given entityId inside a group of entities + * + * Added in DCGM 1.5.0 + */ +typedef struct +{ + dcgm_field_entity_group_t entityGroupId; //!< Entity Group ID entity belongs to + dcgm_field_eid_t entityId; //!< Entity ID of the entity +} dcgmGroupEntityPair_t; + +/** + * Structure to store information for DCGM group + * + * Added in DCGM 1.5.0 + */ +typedef struct +{ + unsigned int version; //!< Version Number (use dcgmGroupInfo_version2) + unsigned int count; //!< count of entityIds returned in \a entityList + char groupName[DCGM_MAX_STR_LENGTH]; //!< Group Name + dcgmGroupEntityPair_t entityList[DCGM_GROUP_MAX_ENTITIES]; //!< List of the entities that are in this group +} dcgmGroupInfo_v2; + +/** + * Typedef for \ref dcgmGroupInfo_v2 + */ +typedef dcgmGroupInfo_v2 dcgmGroupInfo_t; + +/** + * Version 2 for \ref dcgmGroupInfo_v2 + */ +#define dcgmGroupInfo_version2 MAKE_DCGM_VERSION(dcgmGroupInfo_v2, 2) + +/** + * Latest version for \ref dcgmGroupInfo_t + */ +#define dcgmGroupInfo_version dcgmGroupInfo_version2 + +/** + * Enum for the different kinds of MIG profiles + */ +typedef enum +{ + DcgmMigProfileNone = 0, /*!< No profile (for GPUs) */ + DcgmMigProfileGpuInstanceSlice1 = 1, /*!< GPU instance slice 1 */ + DcgmMigProfileGpuInstanceSlice2 = 2, /*!< GPU instance slice 2 */ + DcgmMigProfileGpuInstanceSlice3 = 3, /*!< GPU instance slice 3 */ + DcgmMigProfileGpuInstanceSlice4 = 4, /*!< GPU instance slice 4 */ + DcgmMigProfileGpuInstanceSlice7 = 5, /*!< GPU instance slice 7 */ + DcgmMigProfileGpuInstanceSlice8 = 6, /*!< GPU instance slice 
8 */ + DcgmMigProfileGpuInstanceSlice6 = 7, /*!< GPU instance slice 6 */ + DcgmMigProfileGpuInstanceSlice1Rev1 = 8, /*!< GPU instance slice 1 revision 1 */ + DcgmMigProfileGpuInstanceSlice2Rev1 = 9, /*!< GPU instance slice 2 revision 1 */ + DcgmMigProfileGpuInstanceSlice1Rev2 = 10, /*!< GPU instance slice 1 revision 2 */ + DcgmMigProfileComputeInstanceSlice1 = 30, /*!< compute instance slice 1 */ + DcgmMigProfileComputeInstanceSlice2 = 31, /*!< compute instance slice 2 */ + DcgmMigProfileComputeInstanceSlice3 = 32, /*!< compute instance slice 3 */ + DcgmMigProfileComputeInstanceSlice4 = 33, /*!< compute instance slice 4*/ + DcgmMigProfileComputeInstanceSlice7 = 34, /*!< compute instance slice 7 */ + DcgmMigProfileComputeInstanceSlice8 = 35, /*!< compute instance slice 8 */ + DcgmMigProfileComputeInstanceSlice6 = 36, /*!< compute instance slice 6 */ + DcgmMigProfileComputeInstanceSlice1Rev1 = 37, /*!< compute instance slice 1 revision 1 */ +} dcgmMigProfile_t; + +/** + * Represents a pair of entity pairings to uniquely identify an entity and its place in the hierarchy. + */ +typedef struct +{ + dcgmGroupEntityPair_t entity; //!< Entity id and type for the entity in question + dcgmGroupEntityPair_t parent; //!< Entity id and type for the parent of the entity in question + dcgmMigProfile_t sliceProfile; //!< Entity MIG profile identifier +} dcgmMigHierarchyInfo_t; + +/** + * Provides additional information about location of MIG entities. + */ +typedef struct +{ + char gpuUuid[128]; /*!< GPU UUID */ + unsigned int nvmlGpuIndex; /*!< GPU index from NVML */ + unsigned int nvmlInstanceId; /*!< GPU instance index within GPU. 0 to N. -1 for GPU entities */ + unsigned int nvmlComputeInstanceId; /*!< GPU Compute instance index within GPU instance. 0 to N. -1 for GPU + * Instance and GPU entities */ + unsigned int nvmlMigProfileId; /*!< Unique profile ID for GPU or Compute instances. 
-1 GPU entities + * \see nvmlComputeInstanceProfileInfo_st + * \see nvmlGpuInstanceProfileInfo_st */ + unsigned int nvmlProfileSlices; /*!< Number of slices in the MIG profile */ +} dcgmMigEntityInfo_t; + +typedef struct +{ + dcgmGroupEntityPair_t entity; + dcgmGroupEntityPair_t parent; + dcgmMigEntityInfo_t info; +} dcgmMigHierarchyInfo_v2; + +#define DCGM_MAX_INSTANCES_PER_GPU 8 +// There can never be more compute instances per GPU than instances per GPU because a compute instance is part +// of an instance +#define DCGM_MAX_COMPUTE_INSTANCES_PER_GPU DCGM_MAX_INSTANCES_PER_GPU +// Currently, there cannot be more than 14 instances + compute instances. There are always 7 compute instances +// and never more than 7 instances +#define DCGM_MAX_TOTAL_INSTANCES_PER_GPU 14 +#define DCGM_MAX_HIERARCHY_INFO DCGM_MAX_NUM_DEVICES *DCGM_MAX_TOTAL_INSTANCES_PER_GPU +#define DCGM_MAX_INSTANCES DCGM_MAX_NUM_DEVICES *DCGM_MAX_INSTANCES_PER_GPU +// The maximum compute instances are always the same as the maximum instances because each compute instance is +// part of an instance. 
+#define DCGM_MAX_COMPUTE_INSTANCES DCGM_MAX_INSTANCES + +typedef struct +{ + unsigned int version; + unsigned int count; + dcgmMigHierarchyInfo_v2 entityList[DCGM_MAX_HIERARCHY_INFO]; +} dcgmMigHierarchy_v2; + +#define dcgmMigHierarchy_version2 MAKE_DCGM_VERSION(dcgmMigHierarchy_v2, 2) + +#define dcgmMigHierarchy_version dcgmMigHierarchy_version2 + +/** + * Bitmask indicating which cores are owned by this CPUs + */ +#define DCGM_CPU_CORE_BITMASK_COUNT_V1 (DCGM_MAX_NUM_CPU_CORES / sizeof(uint64_t) / CHAR_BIT) +typedef struct +{ + unsigned int version; + uint64_t bitmask[DCGM_CPU_CORE_BITMASK_COUNT_V1]; +} dcgmCpuHierarchyOwnedCores_v1; + +typedef dcgmCpuHierarchyOwnedCores_v1 dcgmCpuHierarchyOwnedCores_t; + +#define dcgmCpuHierarchyOwnedCores_version1 MAKE_DCGM_VERSION(dcgmCpuHierarchyOwnedCores_v1, 1) + +/** + * Hierarchy of CPUs and their cores + */ +typedef struct +{ + unsigned int version; + unsigned int numCpus; + struct dcgmCpuHierarchyCpu_v1 + { + unsigned int cpuId; + dcgmCpuHierarchyOwnedCores_v1 ownedCores; + } cpus[DCGM_MAX_NUM_CPUS]; +} dcgmCpuHierarchy_v1; + +typedef dcgmCpuHierarchy_v1 dcgmCpuHierarchy_t; + +/** + * Version 1 for dcgmCpuHierarchy_t + */ +#define dcgmCpuHierarchy_version1 MAKE_DCGM_VERSION(dcgmCpuHierarchy_v1, 1) + +/** + * Maximum number of field groups that can exist + */ +#define DCGM_MAX_NUM_FIELD_GROUPS 64 + +/** + * Maximum number of field IDs that can be in a single field group + */ +#define DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP 128 + +/** + * Structure to represent information about a field group + */ +typedef struct +{ + unsigned int version; //!< Version number (dcgmFieldGroupInfo_version) + unsigned int numFieldIds; //!< Number of entries in fieldIds[] that are valid + dcgmFieldGrp_t fieldGroupId; //!< ID of this field group + char fieldGroupName[DCGM_MAX_STR_LENGTH]; //!< Field Group Name + unsigned short fieldIds[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; //!< Field ids that belong to this group +} dcgmFieldGroupInfo_v1; + +typedef 
dcgmFieldGroupInfo_v1 dcgmFieldGroupInfo_t; + +/** + * Version 1 for dcgmFieldGroupInfo_v1 + */ +#define dcgmFieldGroupInfo_version1 MAKE_DCGM_VERSION(dcgmFieldGroupInfo_v1, 1) + +/** + * Latest version for dcgmFieldGroupInfo_t + */ +#define dcgmFieldGroupInfo_version dcgmFieldGroupInfo_version1 + +typedef struct +{ + unsigned int version; //!< Version number (dcgmAllFieldGroupInfo_version) + unsigned int numFieldGroups; //!< Number of entries in fieldGroups[] that are populated + dcgmFieldGroupInfo_t fieldGroups[DCGM_MAX_NUM_FIELD_GROUPS]; //!< Info about each field group +} dcgmAllFieldGroup_v1; + +typedef dcgmAllFieldGroup_v1 dcgmAllFieldGroup_t; + +/** + * Version 1 for dcgmAllFieldGroup_v1 + */ +#define dcgmAllFieldGroup_version1 MAKE_DCGM_VERSION(dcgmAllFieldGroup_v1, 1) + +/** + * Latest version for dcgmAllFieldGroup_t + */ +#define dcgmAllFieldGroup_version dcgmAllFieldGroup_version1 + +/** + * Structure to represent error attributes + */ +typedef struct +{ + unsigned int gpuId; //!< Represents GPU ID + short fieldId; //!< One of DCGM_FI_? + int status; //!< One of DCGM_ST_? +} dcgmErrorInfo_t; + +/** + * Represents a set of memory, SM, and video clocks for a device. 
This can be current values or a target values + * based on context + */ +typedef struct +{ + int version; //!< Version Number (dcgmClockSet_version) + unsigned int memClock; //!< Memory Clock (Memory Clock value OR DCGM_INT32_BLANK to Ignore/Use compatible + //!< value with smClk) + unsigned int smClock; //!< SM Clock (SM Clock value OR DCGM_INT32_BLANK to Ignore/Use compatible value with memClk) +} dcgmClockSet_v1; + +/** + * Typedef for \ref dcgmClockSet_v1 + */ +typedef dcgmClockSet_v1 dcgmClockSet_t; + +/** + * Version 1 for \ref dcgmClockSet_v1 + */ +#define dcgmClockSet_version1 MAKE_DCGM_VERSION(dcgmClockSet_v1, 1) + +/** + * Latest version for \ref dcgmClockSet_t + */ +#define dcgmClockSet_version dcgmClockSet_version1 + +/** + * Represents list of supported clock sets for a device + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmDeviceSupportedClockSets_version) + unsigned int count; //!< Number of supported clocks + dcgmClockSet_t clockSet[DCGM_MAX_CLOCKS]; //!< Valid clock sets for the device. Upto \ref count entries are filled +} dcgmDeviceSupportedClockSets_v1; + +/** + * Typedef for \ref dcgmDeviceSupportedClockSets_v1 + */ +typedef dcgmDeviceSupportedClockSets_v1 dcgmDeviceSupportedClockSets_t; + +/** + * Version 1 for \ref dcgmDeviceSupportedClockSets_v1 + */ +#define dcgmDeviceSupportedClockSets_version1 MAKE_DCGM_VERSION(dcgmDeviceSupportedClockSets_v1, 1) + +/** + * Latest version for \ref dcgmDeviceSupportedClockSets_t + */ +#define dcgmDeviceSupportedClockSets_version dcgmDeviceSupportedClockSets_version1 + +/** + * Represents accounting data for one process + */ +typedef struct +{ + unsigned int version; //!< Version Number. Should match dcgmDevicePidAccountingStats_version + unsigned int pid; //!< Process id of the process these stats are for + unsigned int gpuUtilization; //!< Percent of time over the process's lifetime during which one or more kernels + //!< was executing on the GPU. 
+ //!< Set to DCGM_INT32_NOT_SUPPORTED if is not supported + unsigned int memoryUtilization; //!< Percent of time over the process's lifetime during which global (device) + //!< memory was being read or written. + //!< Set to DCGM_INT32_NOT_SUPPORTED if is not supported + unsigned long long maxMemoryUsage; //!< Maximum total memory in bytes that was ever allocated by the process. + //!< Set to DCGM_INT64_NOT_SUPPORTED if is not supported + unsigned long long startTimestamp; //!< CPU Timestamp in usec representing start time for the process + unsigned long long activeTimeUsec; //!< Amount of time in usec during which the compute context was active. + //!< Note that this does not mean the context was being used. endTimestamp + //!< can be computed as startTimestamp + activeTime +} dcgmDevicePidAccountingStats_v1; + +/** + * Typedef for \ref dcgmDevicePidAccountingStats_v1 + */ +typedef dcgmDevicePidAccountingStats_v1 dcgmDevicePidAccountingStats_t; + +/** + * Version 1 for \ref dcgmDevicePidAccountingStats_v1 + */ +#define dcgmDevicePidAccountingStats_version1 MAKE_DCGM_VERSION(dcgmDevicePidAccountingStats_v1, 1) + +/** + * Latest version for \ref dcgmDevicePidAccountingStats_t + */ +#define dcgmDevicePidAccountingStats_version dcgmDevicePidAccountingStats_version1 + +/** + * Represents thermal information + */ +typedef struct +{ + unsigned int version; //!< Version Number + unsigned int slowdownTemp; //!< Slowdown temperature + unsigned int shutdownTemp; //!< Shutdown temperature +} dcgmDeviceThermals_v1; + +/** + * Typedef for \ref dcgmDeviceThermals_v1 + */ +typedef dcgmDeviceThermals_v1 dcgmDeviceThermals_t; + +/** + * Version 1 for \ref dcgmDeviceThermals_v1 + */ +#define dcgmDeviceThermals_version1 MAKE_DCGM_VERSION(dcgmDeviceThermals_v1, 1) + +/** + * Latest version for \ref dcgmDeviceThermals_t + */ +#define dcgmDeviceThermals_version dcgmDeviceThermals_version1 + +/** + * Represents various power limits + */ +typedef struct +{ + unsigned int version; //!< 
Version Number + unsigned int curPowerLimit; //!< Power management limit associated with this device (in W) + unsigned int defaultPowerLimit; //!< Power management limit effective at device boot (in W) + unsigned int enforcedPowerLimit; //!< Effective power limit that the driver enforces after taking into account + //!< all limiters (in W) + unsigned int minPowerLimit; //!< Minimum power management limit (in W) + unsigned int maxPowerLimit; //!< Maximum power management limit (in W) +} dcgmDevicePowerLimits_v1; + +/** + * Typedef for \ref dcgmDevicePowerLimits_v1 + */ +typedef dcgmDevicePowerLimits_v1 dcgmDevicePowerLimits_t; + +/** + * Version 1 for \ref dcgmDevicePowerLimits_v1 + */ +#define dcgmDevicePowerLimits_version1 MAKE_DCGM_VERSION(dcgmDevicePowerLimits_v1, 1) + +/** + * Latest version for \ref dcgmDevicePowerLimits_t + */ +#define dcgmDevicePowerLimits_version dcgmDevicePowerLimits_version1 + +/** + * Represents device identifiers + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmDeviceIdentifiers_version) + char brandName[DCGM_MAX_STR_LENGTH]; //!< Brand Name + char deviceName[DCGM_MAX_STR_LENGTH]; //!< Name of the device + char pciBusId[DCGM_MAX_STR_LENGTH]; //!< PCI Bus ID + char serial[DCGM_MAX_STR_LENGTH]; //!< Serial for the device + char uuid[DCGM_MAX_STR_LENGTH]; //!< UUID for the device + char vbios[DCGM_MAX_STR_LENGTH]; //!< VBIOS version + char inforomImageVersion[DCGM_MAX_STR_LENGTH]; //!< Inforom Image version + unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id + unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID + char driverVersion[DCGM_MAX_STR_LENGTH]; //!< Driver Version + unsigned int virtualizationMode; //!< Virtualization Mode +} dcgmDeviceIdentifiers_v1; + +/** + * Typedef for \ref dcgmDeviceIdentifiers_v1 + */ +typedef dcgmDeviceIdentifiers_v1 dcgmDeviceIdentifiers_t; + +/** + * Version 1 for \ref dcgmDeviceIdentifiers_v1 + */ +#define 
dcgmDeviceIdentifiers_version1 MAKE_DCGM_VERSION(dcgmDeviceIdentifiers_v1, 1) + +/** + * Latest version for \ref dcgmDeviceIdentifiers_t + */ +#define dcgmDeviceIdentifiers_version dcgmDeviceIdentifiers_version1 + +/** + * Represents device memory and usage + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmDeviceMemoryUsage_version) + unsigned int bar1Total; //!< Total BAR1 size in megabytes + unsigned int fbTotal; //!< Total framebuffer memory in megabytes + unsigned int fbUsed; //!< Used framebuffer memory in megabytes + unsigned int fbFree; //!< Free framebuffer memory in megabytes +} dcgmDeviceMemoryUsage_v1; + +/** + * Typedef for \ref dcgmDeviceMemoryUsage_v1 + */ +typedef dcgmDeviceMemoryUsage_v1 dcgmDeviceMemoryUsage_t; + +/** + * Version 1 for \ref dcgmDeviceMemoryUsage_v1 + */ +#define dcgmDeviceMemoryUsage_version1 MAKE_DCGM_VERSION(dcgmDeviceMemoryUsage_v1, 1) + +/** + * Latest version for \ref dcgmDeviceMemoryUsage_t + */ +#define dcgmDeviceMemoryUsage_version dcgmDeviceMemoryUsage_version1 + +/** + * Represents utilization values for vGPUs running on the device + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmDeviceVgpuUtilInfo_version) + unsigned int vgpuId; //!< vGPU instance ID + unsigned int smUtil; //!< GPU utilization for vGPU + unsigned int memUtil; //!< Memory utilization for vGPU + unsigned int encUtil; //!< Encoder utilization for vGPU + unsigned int decUtil; //!< Decoder utilization for vGPU +} dcgmDeviceVgpuUtilInfo_v1; + +/** + * Typedef for \ref dcgmDeviceVgpuUtilInfo_v1 + */ +typedef dcgmDeviceVgpuUtilInfo_v1 dcgmDeviceVgpuUtilInfo_t; + +/** + * Version 1 for \ref dcgmDeviceVgpuUtilInfo_v1 + */ +#define dcgmDeviceVgpuUtilInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceVgpuUtilInfo_v1, 1) + +/** + * Latest version for \ref dcgmDeviceVgpuUtilInfo_t + */ +#define dcgmDeviceVgpuUtilInfo_version dcgmDeviceVgpuUtilInfo_version1 + +/** + * Represents current encoder statistics for the given device/vGPU 
instance + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmDeviceEncStats_version) + unsigned int sessionCount; //!< Count of active encoder sessions + unsigned int averageFps; //!< Trailing average FPS of all active sessions + unsigned int averageLatency; //!< Encode latency in milliseconds +} dcgmDeviceEncStats_v1; + +/** + * Typedef for \ref dcgmDeviceEncStats_v1 + */ +typedef dcgmDeviceEncStats_v1 dcgmDeviceEncStats_t; + +/** + * Version 1 for \ref dcgmDeviceEncStats_v1 + */ +#define dcgmDeviceEncStats_version1 MAKE_DCGM_VERSION(dcgmDeviceEncStats_v1, 1) + +/** + * Latest version for \ref dcgmDeviceEncStats_t + */ +#define dcgmDeviceEncStats_version dcgmDeviceEncStats_version1 + +/** + * Represents current frame buffer capture sessions statistics for the given device/vGPU instance + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmDeviceFbcStats_version) + unsigned int sessionCount; //!< Count of active FBC sessions + unsigned int averageFps; //!< Moving average new frames captured per second + unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds +} dcgmDeviceFbcStats_v1; + +/** + * Typedef for \ref dcgmDeviceFbcStats_v1 + */ +typedef dcgmDeviceFbcStats_v1 dcgmDeviceFbcStats_t; + +/** + * Version 1 for \ref dcgmDeviceFbcStats_v1 + */ +#define dcgmDeviceFbcStats_version1 MAKE_DCGM_VERSION(dcgmDeviceFbcStats_v1, 1) + +/** + * Latest version for \ref dcgmDeviceEncStats_t + */ +#define dcgmDeviceFbcStats_version dcgmDeviceFbcStats_version1 + +/* + * Represents frame buffer capture session type + */ +typedef enum dcgmFBCSessionType_enum +{ + DCGM_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknown + DCGM_FBC_SESSION_TYPE_TOSYS, //!< FB capture for a system buffer + DCGM_FBC_SESSION_TYPE_CUDA, //!< FB capture for a cuda buffer + DCGM_FBC_SESSION_TYPE_VID, //!< FB capture for a Vid buffer + DCGM_FBC_SESSION_TYPE_HWENC, //!< FB capture for a NVENC HW buffer +} dcgmFBCSessionType_t; + +/** + * 
Represents information about active FBC session on the given device/vGPU instance + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmDeviceFbcSessionInfo_version) + unsigned int sessionId; //!< Unique session ID + unsigned int pid; //!< Owning process ID + unsigned int vgpuId; //!< vGPU instance ID (only valid on vGPU hosts, otherwise zero) + unsigned int displayOrdinal; //!< Display identifier + dcgmFBCSessionType_t sessionType; //!< Type of frame buffer capture session + unsigned int sessionFlags; //!< Session flags + unsigned int hMaxResolution; //!< Max horizontal resolution supported by the capture session + unsigned int vMaxResolution; //!< Max vertical resolution supported by the capture session + unsigned int hResolution; //!< Horizontal resolution requested by caller in capture call + unsigned int vResolution; //!< Vertical resolution requested by caller in capture call + unsigned int averageFps; //!< Moving average new frames captured per second + unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds +} dcgmDeviceFbcSessionInfo_v1; + +/** + * Typedef for \ref dcgmDeviceFbcSessionInfo_v1 + */ +typedef dcgmDeviceFbcSessionInfo_v1 dcgmDeviceFbcSessionInfo_t; + +/** + * Version 1 for \ref dcgmDeviceFbcSessionInfo_v1 + */ +#define dcgmDeviceFbcSessionInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceFbcSessionInfo_v1, 1) + +/** + * Latest version for \ref dcgmDeviceFbcSessionInfo_t + */ +#define dcgmDeviceFbcSessionInfo_version dcgmDeviceFbcSessionInfo_version1 + +/** + * Represents all the active FBC sessions on the given device/vGPU instance + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmDeviceFbcSessions_version) + unsigned int sessionCount; //!< Count of active FBC sessions + dcgmDeviceFbcSessionInfo_t sessionInfo[DCGM_MAX_FBC_SESSIONS]; //!< Info about the active FBC session +} dcgmDeviceFbcSessions_v1; + +/** + * Typedef for \ref dcgmDeviceFbcSessions_v1 + */ +typedef 
dcgmDeviceFbcSessions_v1 dcgmDeviceFbcSessions_t; + +/** + * Version 1 for \ref dcgmDeviceFbcSessions_v1 + */ +#define dcgmDeviceFbcSessions_version1 MAKE_DCGM_VERSION(dcgmDeviceFbcSessions_v1, 1) + +/** + * Latest version for \ref dcgmDeviceFbcSessions_t + */ +#define dcgmDeviceFbcSessions_version dcgmDeviceFbcSessions_version1 + +/* + * Represents type of encoder for capacity can be queried + */ +typedef enum dcgmEncoderQueryType_enum +{ + DCGM_ENCODER_QUERY_H264 = 0, + DCGM_ENCODER_QUERY_HEVC = 1 +} dcgmEncoderType_t; + +/** + * Represents information about active encoder sessions on the given vGPU instance + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmDeviceVgpuEncSessions_version) + union + { + unsigned int vgpuId; //!< vGPU instance ID + unsigned int sessionCount; + } encoderSessionInfo; + unsigned int sessionId; //!< Unique session ID + unsigned int pid; //!< Process ID + dcgmEncoderType_t codecType; //!< Video encoder type + unsigned int hResolution; //!< Current encode horizontal resolution + unsigned int vResolution; //!< Current encode vertical resolution + unsigned int averageFps; //!< Moving average encode frames per second + unsigned int averageLatency; //!< Moving average encode latency in milliseconds +} dcgmDeviceVgpuEncSessions_v1; + +/** + * Typedef for \ref dcgmDeviceVgpuEncSessions_v1 + */ +typedef dcgmDeviceVgpuEncSessions_v1 dcgmDeviceVgpuEncSessions_t; + +/** + * Version 1 for \ref dcgmDeviceVgpuEncSessions_v1 + */ +#define dcgmDeviceVgpuEncSessions_version1 MAKE_DCGM_VERSION(dcgmDeviceVgpuEncSessions_v1, 1) + +/** + * Latest version for \ref dcgmDeviceVgpuEncSessions_t + */ +#define dcgmDeviceVgpuEncSessions_version dcgmDeviceVgpuEncSessions_version1 + +/** + * Represents utilization values for processes running in vGPU VMs using the device + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmDeviceVgpuProcessUtilInfo_version) + union + { + unsigned int vgpuId; //!< vGPU instance ID + unsigned 
int vgpuProcessSamplesCount; //!< Count of processes running in the vGPU VM,for which utilization + //!< rates are being reported in this cycle. + } vgpuProcessUtilInfo; + unsigned int pid; //!< Process ID of the process running in the vGPU VM. + char processName[DCGM_VGPU_NAME_BUFFER_SIZE]; //!< Process Name of process running in the vGPU VM. + unsigned int smUtil; //!< GPU utilization of process running in the vGPU VM. + unsigned int memUtil; //!< Memory utilization of process running in the vGPU VM. + unsigned int encUtil; //!< Encoder utilization of process running in the vGPU VM. + unsigned int decUtil; //!< Decoder utilization of process running in the vGPU VM. +} dcgmDeviceVgpuProcessUtilInfo_v1; + +/** + * Typedef for \ref dcgmDeviceVgpuProcessUtilInfo_v1 + */ +typedef dcgmDeviceVgpuProcessUtilInfo_v1 dcgmDeviceVgpuProcessUtilInfo_t; + +/** + * Version 1 for \ref dcgmDeviceVgpuProcessUtilInfo_v1 + */ +#define dcgmDeviceVgpuProcessUtilInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceVgpuProcessUtilInfo_v1, 1) + +/** + * Latest version for \ref dcgmDeviceVgpuProcessUtilInfo_t + */ +#define dcgmDeviceVgpuProcessUtilInfo_version dcgmDeviceVgpuProcessUtilInfo_version1 + +/** + * Represents static info related to vGPUs supported on the device. 
+ */ +typedef struct +{ + unsigned int version; //!< Version number (dcgmDeviceVgpuTypeInfo_version) + union + { + unsigned int vgpuTypeId; + unsigned int supportedVgpuTypeCount; + } vgpuTypeInfo; //!< vGPU type ID and Supported vGPU type count + char vgpuTypeName[DCGM_VGPU_NAME_BUFFER_SIZE]; //!< vGPU type Name + char vgpuTypeClass[DCGM_VGPU_NAME_BUFFER_SIZE]; //!< Class of vGPU type + char vgpuTypeLicense[DCGM_GRID_LICENSE_BUFFER_SIZE]; //!< license of vGPU type + int deviceId; //!< device ID of vGPU type + int subsystemId; //!< Subsystem ID of vGPU type + int numDisplayHeads; //!< Count of vGPU's supported display heads + int maxInstances; //!< maximum number of vGPU instances creatable on a device for given vGPU type + int frameRateLimit; //!< Frame rate limit value of the vGPU type + int maxResolutionX; //!< vGPU display head's maximum supported resolution in X dimension + int maxResolutionY; //!< vGPU display head's maximum supported resolution in Y dimension + int fbTotal; //!< vGPU Total framebuffer size in megabytes +} dcgmDeviceVgpuTypeInfo_v1; + +/** + * Version 1 for \ref dcgmDeviceVgpuTypeInfo_v1 + */ +#define dcgmDeviceVgpuTypeInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceVgpuTypeInfo_v1, 1) + +typedef struct +{ + unsigned int version; //!< Version number (dcgmDeviceVgpuTypeInfo_version2) + union + { + unsigned int vgpuTypeId; + unsigned int supportedVgpuTypeCount; + } vgpuTypeInfo; //!< vGPU type ID and Supported vGPU type count + char vgpuTypeName[DCGM_VGPU_NAME_BUFFER_SIZE]; //!< vGPU type Name + char vgpuTypeClass[DCGM_VGPU_NAME_BUFFER_SIZE]; //!< Class of vGPU type + char vgpuTypeLicense[DCGM_GRID_LICENSE_BUFFER_SIZE]; //!< license of vGPU type + int deviceId; //!< device ID of vGPU type + int subsystemId; //!< Subsystem ID of vGPU type + int numDisplayHeads; //!< Count of vGPU's supported display heads + int maxInstances; //!< maximum number of vGPU instances creatable on a device for given vGPU type + int frameRateLimit; //!< Frame rate limit value 
of the vGPU type + int maxResolutionX; //!< vGPU display head's maximum supported resolution in X dimension + int maxResolutionY; //!< vGPU display head's maximum supported resolution in Y dimension + int fbTotal; //!< vGPU Total framebuffer size in megabytes + int gpuInstanceProfileId; //!< GPU Instance Profile ID for the given vGPU type +} dcgmDeviceVgpuTypeInfo_v2; + +/** + * Typedef for \ref dcgmDeviceVgpuTypeInfo_v2 + */ +typedef dcgmDeviceVgpuTypeInfo_v2 dcgmDeviceVgpuTypeInfo_t; + +/** + * Version 2 for \ref dcgmDeviceVgpuTypeInfo_v2 + */ +#define dcgmDeviceVgpuTypeInfo_version2 MAKE_DCGM_VERSION(dcgmDeviceVgpuTypeInfo_v2, 2) + +/** + * Latest version for \ref dcgmDeviceVgpuTypeInfo_t + */ +#define dcgmDeviceVgpuTypeInfo_version dcgmDeviceVgpuTypeInfo_version2 + +/** + * Represents the info related to vGPUs supported on the device. + */ +typedef struct +{ + unsigned int version; //!< Version number (dcgmDeviceSupportedVgpuTypeInfo_version) + unsigned long long deviceId; //!< device ID of vGPU type + unsigned long long subsystemId; //!< Subsystem ID of vGPU type + unsigned int numDisplayHeads; //!< Count of vGPU's supported display heads + unsigned int maxInstances; //!< maximum number of vGPU instances creatable on a device for given vGPU type + unsigned int frameRateLimit; //!< Frame rate limit value of the vGPU type + unsigned int maxResolutionX; //!< vGPU display head's maximum supported resolution in X dimension + unsigned int maxResolutionY; //!< vGPU display head's maximum supported resolution in Y dimension + unsigned long long fbTotal; //!< vGPU Total framebuffer size in megabytes + unsigned int gpuInstanceProfileId; //!< GPU Instance Profile ID for the given vGPU type +} dcgmDeviceSupportedVgpuTypeInfo_v1; + +/** + * Typedef for \ref dcgmDeviceSupportedVgpuTypeInfo_v1 + */ +typedef dcgmDeviceSupportedVgpuTypeInfo_v1 dcgmDeviceSupportedVgpuTypeInfo_t; + +/** + * Version 1 for \ref dcgmDeviceSupportedVgpuTypeInfo_v1 + */ +#define 
dcgmDeviceSupportedVgpuTypeInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceSupportedVgpuTypeInfo_v1, 1) + +/** + * Latest version for \ref dcgmDeviceSupportedVgpuTypeInfo_t + */ +#define dcgmDeviceSupportedVgpuTypeInfo_version dcgmDeviceSupportedVgpuTypeInfo_version1 + +typedef struct +{ + unsigned int version; + unsigned int persistenceModeEnabled; + unsigned int migModeEnabled; + unsigned int confidentialComputeMode; +} dcgmDeviceSettings_v2; + +typedef dcgmDeviceSettings_v2 dcgmDeviceSettings_t; + +#define dcgmDeviceSettings_version2 MAKE_DCGM_VERSION(dcgmDeviceSettings_v2, 2) + +#define dcgmDeviceSettings_version dcgmDeviceSettings_version2 + +typedef struct +{ + unsigned int version; //!< Version number (dcgmDeviceAttributes_version) + dcgmDeviceSupportedClockSets_t clockSets; //!< Supported clocks for the device + dcgmDeviceThermals_t thermalSettings; //!< Thermal settings for the device + dcgmDevicePowerLimits_t powerLimits; //!< Various power limits for the device + dcgmDeviceIdentifiers_t identifiers; //!< Identifiers for the device + dcgmDeviceMemoryUsage_t memoryUsage; //!< Memory usage info for the device + dcgmDeviceSettings_v2 settings; //!< Basic device settings +} dcgmDeviceAttributes_v3; + +/** + * Typedef for \ref dcgmDeviceAttributes_v3 + */ +typedef dcgmDeviceAttributes_v3 dcgmDeviceAttributes_t; + +/** + * Version 3 for \ref dcgmDeviceAttributes_v3 + */ +#define dcgmDeviceAttributes_version3 MAKE_DCGM_VERSION(dcgmDeviceAttributes_v3, 3) + +/** + * Latest version for \ref dcgmDeviceAttributes_t + */ +#define dcgmDeviceAttributes_version dcgmDeviceAttributes_version3 + +/** + * Structure to represent attributes info for a MIG device + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmDeviceMigAttributesInfo_version) + unsigned int gpuInstanceId; //!< GPU instance ID + unsigned int computeInstanceId; //!< Compute instance ID + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int sharedCopyEngineCount; 
//!< Shared Copy Engine count + unsigned int sharedDecoderCount; //!< Shared Decoder Engine count + unsigned int sharedEncoderCount; //!< Shared Encoder Engine count + unsigned int sharedJpegCount; //!< Shared JPEG Engine count + unsigned int sharedOfaCount; //!< Shared OFA Engine count + unsigned int gpuInstanceSliceCount; //!< GPU instance slice count + unsigned int computeInstanceSliceCount; //!< Compute instance slice count + unsigned long long memorySizeMB; //!< Device memory size (in MiB) +} dcgmDeviceMigAttributesInfo_v1; + +/** + * Typedef for \ref dcgmDeviceMigAttributesInfo_v1 + */ +typedef dcgmDeviceMigAttributesInfo_v1 dcgmDeviceMigAttributesInfo_t; + +/** + * Version 1 for \ref dcgmDeviceMigAttributesInfo_v1 + */ +#define dcgmDeviceMigAttributesInfo_version1 MAKE_DCGM_VERSION(dcgmDeviceMigAttributesInfo_v1, 1) + +/** + * Latest version for \ref dcgmDeviceMigAttributesInfo_t + */ +#define dcgmDeviceMigAttributesInfo_version dcgmDeviceMigAttributesInfo_version1 + +/** + * Structure to represent attributes for a MIG device + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmDeviceMigAttributes_version) + unsigned int migDevicesCount; //!< Count of MIG devices + dcgmDeviceMigAttributesInfo_v1 migAttributesInfo; //!< MIG attributes information +} dcgmDeviceMigAttributes_v1; + +/** + * Typedef for \ref dcgmDeviceMigAttributes_v1 + */ +typedef dcgmDeviceMigAttributes_v1 dcgmDeviceMigAttributes_t; + +/** + * Version 1 for \ref dcgmDeviceMigAttributes_v1 + */ +#define dcgmDeviceMigAttributes_version1 MAKE_DCGM_VERSION(dcgmDeviceMigAttributes_v1, 1) + +/** + * Latest version for \ref dcgmDeviceMigAttributes_t + */ +#define dcgmDeviceMigAttributes_version dcgmDeviceMigAttributes_version1 + +/** + * Structure to represent GPU instance profile information + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmGpuInstanceProfileInfo_version) + unsigned int id; //!< Unique profile ID within the device + unsigned int 
isP2pSupported; //!< Peer-to-Peer support + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< GPU instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int copyEngineCount; //!< Copy Engine count + unsigned int decoderCount; //!< Decoder Engine count + unsigned int encoderCount; //!< Encoder Engine count + unsigned int jpegCount; //!< JPEG Engine count + unsigned int ofaCount; //!< OFA Engine count + unsigned long long memorySizeMB; //!< Memory size in MBytes +} dcgmGpuInstanceProfileInfo_v1; + +/** + * Typedef for \ref dcgmGpuInstanceProfileInfo_v1 + */ +typedef dcgmGpuInstanceProfileInfo_v1 dcgmGpuInstanceProfileInfo_t; + +/** + * Version 1 for \ref dcgmGpuInstanceProfileInfo_v1 + */ +#define dcgmGpuInstanceProfileInfo_version1 MAKE_DCGM_VERSION(dcgmGpuInstanceProfileInfo_v1, 1) + +/** + * Latest version for \ref dcgmGpuInstanceProfileInfo_t + */ +#define dcgmGpuInstanceProfileInfo_version dcgmGpuInstanceProfileInfo_version1 + +/** + * Structure to represent GPU instance profiles + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmGpuInstanceProfiles_version) + unsigned int profileCount; //!< Profile count + dcgmGpuInstanceProfileInfo_v1 profileInfo; //!< GPU instance profile information +} dcgmGpuInstanceProfiles_v1; + +/** + * Typedef for \ref dcgmGpuInstanceProfiles_v1 + */ +typedef dcgmGpuInstanceProfiles_v1 dcgmGpuInstanceProfiles_t; + +/** + * Version 1 for \ref dcgmGpuInstanceProfiles_v1 + */ +#define dcgmGpuInstanceProfiles_version1 MAKE_DCGM_VERSION(dcgmGpuInstanceProfiles_v1, 1) + +/** + * Latest version for \ref dcgmGpuInstanceProfiles_t + */ +#define dcgmGpuInstanceProfiles_version dcgmGpuInstanceProfiles_version1 + +/** + * Structure to represent Compute instance profile information + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmComputeInstanceProfileInfo_version) + unsigned int gpuInstanceId; //!< GPU instance ID + unsigned int 
id; //!< Unique profile ID within the GPU instance + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< Compute instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count + unsigned int sharedDecoderCount; //!< Shared Decoder Engine count + unsigned int sharedEncoderCount; //!< Shared Encoder Engine count + unsigned int sharedJpegCount; //!< Shared JPEG Engine count + unsigned int sharedOfaCount; //!< Shared OFA Engine count +} dcgmComputeInstanceProfileInfo_v1; + +/** + * Typedef for \ref dcgmComputeInstanceProfileInfo_v1 + */ +typedef dcgmComputeInstanceProfileInfo_v1 dcgmComputeInstanceProfileInfo_t; + +/** + * Version 1 for \ref dcgmComputeInstanceProfileInfo_v1 + */ +#define dcgmComputeInstanceProfileInfo_version1 MAKE_DCGM_VERSION(dcgmComputeInstanceProfileInfo_v1, 1) + +/** + * Latest version for \ref dcgmComputeInstanceProfileInfo_t + */ +#define dcgmComputeInstanceProfileInfo_version dcgmComputeInstanceProfileInfo_version1 + +/** + * Structure to represent Compute instance profiles + */ +typedef struct +{ + unsigned int version; //!< Version Number (dcgmComputeInstanceProfiles_version) + unsigned int profileCount; //!< Profile count + dcgmComputeInstanceProfileInfo_v1 profileInfo; //!< Compute instance profile information +} dcgmComputeInstanceProfiles_v1; + +/** + * Typedef for \ref dcgmComputeInstanceProfiles_v1 + */ +typedef dcgmComputeInstanceProfiles_v1 dcgmComputeInstanceProfiles_t; + +/** + * Version 1 for \ref dcgmComputeInstanceProfiles_v1 + */ +#define dcgmComputeInstanceProfiles_version1 MAKE_DCGM_VERSION(dcgmComputeInstanceProfiles_v1, 1) + +/** + * Latest version for \ref dcgmComputeInstanceProfiles_t + */ +#define dcgmComputeInstanceProfiles_version dcgmComputeInstanceProfiles_version1 + +/** + * Maximum number of vGPU types per physical GPU + */ +#define DCGM_MAX_VGPU_TYPES_PER_PGPU 32 + +/** + * Represents the size 
of a buffer that holds string related to attributes specific to vGPU instance + */ +#define DCGM_DEVICE_UUID_BUFFER_SIZE 80 + +/** + * Used to represent Performance state settings + */ +typedef struct +{ + unsigned int syncBoost; //!< Sync Boost Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored). Note that + //!< using this setting may result in lower clocks than targetClocks + dcgmClockSet_t targetClocks; //!< Target clocks. Set smClock and memClock to DCGM_INT32_BLANK to ignore/use + //!< compatible values. For GPUs > Maxwell, setting this implies autoBoost=0 +} dcgmConfigPerfStateSettings_t; + +/** + * Used to represents the power capping limit for each GPU in the group or to represent the power + * budget for the entire group + */ +typedef struct +{ + dcgmConfigPowerLimitType_t type; //!< Flag to represent power cap for each GPU or power budget for the group of GPUs + unsigned int val; //!< Power Limit in Watts (Set a value OR DCGM_INT32_BLANK to Ignore) +} dcgmConfigPowerLimit_t; + +/** + * Structure to represent default and target configuration for a device + */ +typedef struct +{ + unsigned int version; //!< Version number (dcgmConfig_version) + unsigned int gpuId; //!< GPU ID + unsigned int eccMode; //!< ECC Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored) + unsigned int computeMode; //!< Compute Mode (One of DCGM_CONFIG_COMPUTEMODE_? OR DCGM_INT32_BLANK to Ignore) + dcgmConfigPerfStateSettings_t perfState; //!< Performance State Settings (clocks / boost mode) + dcgmConfigPowerLimit_t powerLimit; //!< Power Limits +} dcgmConfig_v1; + +/** + * Typedef for \ref dcgmConfig_v1 + */ +typedef dcgmConfig_v1 dcgmConfig_t; + +/** + * Version 1 for \ref dcgmConfig_v1 + */ +#define dcgmConfig_version1 MAKE_DCGM_VERSION(dcgmConfig_v1, 1) + +/** + * Latest version for \ref dcgmConfig_t + */ +#define dcgmConfig_version dcgmConfig_version1 + +/** + * Represents a callback to receive updates from asynchronous functions. 
+ * Currently the only implemented callback function is dcgmPolicyRegister + * and the void * data will be a pointer to dcgmPolicyCallbackResponse_t. + * Ex. + * dcgmPolicyCallbackResponse_t *callbackResponse = (dcgmPolicyCallbackResponse_t *) userData; + * + */ +typedef int (*fpRecvUpdates)(void *userData); + +/*Remove from doxygen documentation + * + * Define the structure that contains specific policy information + */ +typedef struct +{ + // version must always be first + unsigned int version; //!< Version number (dcgmPolicyViolation_version) + + unsigned int notifyOnEccDbe; //!< true/false notification on ECC Double Bit Errors + unsigned int notifyOnPciEvent; //!< true/false notification on PCI Events + unsigned int notifyOnMaxRetiredPages; //!< number of retired pages to occur before notification +} dcgmPolicyViolation_v1; + +/*Remove from doxygen documentation + * + * Represents the versioning for the dcgmPolicyViolation_v1 structure + */ + +/* + * Typedef for \ref dcgmPolicyViolation_v1 + */ +typedef dcgmPolicyViolation_v1 dcgmPolicyViolation_t; + +/* + * Version 1 for \ref dcgmPolicyViolation_v1 + */ +#define dcgmPolicyViolation_version1 MAKE_DCGM_VERSION(dcgmPolicyViolation_v1, 1) + +/* + * Latest version for \ref dcgmPolicyViolation_t + */ +#define dcgmPolicyViolation_version dcgmPolicyViolation_version1 + +/** + * Enumeration for policy conditions. + * When used as part of dcgmPolicy_t these have corresponding parameters to + * allow them to be switched on/off or set specific violation thresholds + */ +typedef enum dcgmPolicyConditionIdx_enum +{ + // These are sequential rather than bitwise. 
+ DCGM_POLICY_COND_IDX_DBE = 0, //!< Double bit errors -- boolean in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_IDX_PCI, //!< PCI events/errors -- boolean in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_IDX_MAX_PAGES_RETIRED, //!< Maximum number of retired pages -- number + //!< required in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_IDX_THERMAL, //!< Thermal violation -- number required in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_IDX_POWER, //!< Power violation -- number required in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_IDX_NVLINK, //!< NVLINK errors -- boolean in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_IDX_XID //!< XID errors -- number required in dcgmPolicyConditionParams_t +} dcgmPolicyConditionIdx_t; + +#define DCGM_POLICY_COND_IDX_MAX 7 +#define DCGM_POLICY_COND_MAX DCGM_POLICY_COND_IDX_MAX + +/** + * Bitmask enumeration for policy conditions. + * When used as part of dcgmPolicy_t these have corresponding parameters to + * allow them to be switched on/off or set specific violation thresholds + */ +typedef enum dcgmPolicyCondition_enum +{ + // These are bitwise rather than sequential. + DCGM_POLICY_COND_DBE = 0x1, //!< Double bit errors -- boolean in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_PCI = 0x2, //!< PCI events/errors -- boolean in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_MAX_PAGES_RETIRED = 0x4, //!< Maximum number of retired pages -- number + //!< required in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_THERMAL = 0x8, //!< Thermal violation -- number required in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_POWER = 0x10, //!< Power violation -- number required in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_NVLINK = 0x20, //!< NVLINK errors -- boolean in dcgmPolicyConditionParams_t + DCGM_POLICY_COND_XID = 0x40, //!< XID errors -- number required in dcgmPolicyConditionParams_t +} dcgmPolicyCondition_t; + +/** + * Structure for policy condition parameters. 
+ * This structure contains a tag that represents the type of the value being passed + * as well as a "val" which is a union of the possible value types. For example, + * to pass a true boolean: tag = BOOL, val.boolean = 1. + */ +typedef struct dcgmPolicyConditionParams_st +{ + enum + { + BOOL, + LLONG + } tag; + union + { + unsigned int boolean; + unsigned long long llval; + } val; +} dcgmPolicyConditionParams_t; + +/** + * Enumeration for policy modes + */ +typedef enum dcgmPolicyMode_enum +{ + DCGM_POLICY_MODE_AUTOMATED = 0, //!< automatic mode + DCGM_POLICY_MODE_MANUAL = 1, //!< manual mode +} dcgmPolicyMode_t; + +/** + * Enumeration for policy isolation modes + */ +typedef enum dcgmPolicyIsolation_enum +{ + DCGM_POLICY_ISOLATION_NONE = 0, //!< no isolation of GPUs on error +} dcgmPolicyIsolation_t; + +/** + * Enumeration for policy actions + */ +typedef enum dcgmPolicyAction_enum +{ + DCGM_POLICY_ACTION_NONE = 0, //!< no action + DCGM_POLICY_ACTION_GPURESET = 1, //!< Deprecated - perform a GPU reset on violation +} dcgmPolicyAction_t; + +/** + * Enumeration for policy validation actions + */ +typedef enum dcgmPolicyValidation_enum +{ + DCGM_POLICY_VALID_NONE = 0, //!< no validation after an action is performed + DCGM_POLICY_VALID_SV_SHORT = 1, //!< run a short System Validation on the system after failure + DCGM_POLICY_VALID_SV_MED = 2, //!< run a medium System Validation test after failure + DCGM_POLICY_VALID_SV_LONG = 3, //!< run a extensive System Validation test after failure + DCGM_POLICY_VALID_SV_XLONG = 4, //!< run a more extensive System Validation test after failure +} dcgmPolicyValidation_t; + +/** + * Enumeration for policy failure responses + */ +typedef enum dcgmPolicyFailureResp_enum +{ + DCGM_POLICY_FAILURE_NONE = 0, //!< on failure of validation perform no action +} dcgmPolicyFailureResp_t; + +/** + * Structure to fill when a user queries for policy violations + */ +typedef struct +{ + unsigned int gpuId; //!< gpu ID + unsigned int 
violationOccurred; //!< a violation based on the bit values in \ref dcgmPolicyCondition_t +} dcgmPolicyViolationNotify_t; + +/** + * Define the structure that specifies a policy to be enforced for a GPU + */ +typedef struct +{ + // version must always be first + unsigned int version; //!< version number (dcgmPolicy_version) + + dcgmPolicyCondition_t condition; //!< Condition(s) to access \ref dcgmPolicyCondition_t + dcgmPolicyMode_t mode; //!< Mode of operation \ref dcgmPolicyMode_t + dcgmPolicyIsolation_t isolation; //!< Isolation level after a policy violation \ref dcgmPolicyIsolation_t + dcgmPolicyAction_t action; //!< Action to perform after a policy violation \ref dcgmPolicyAction_t action + dcgmPolicyValidation_t validation; //!< Validation to perform after action is taken \ref dcgmPolicyValidation_t + dcgmPolicyFailureResp_t response; //!< Failure to validation response \ref dcgmPolicyFailureResp_t + dcgmPolicyConditionParams_t parms[DCGM_POLICY_COND_MAX]; //!< Parameters for the \a condition fields +} dcgmPolicy_v1; + +/** + * Typedef for \ref dcgmPolicy_v1 + */ +typedef dcgmPolicy_v1 dcgmPolicy_t; + +/** + * Version 1 for \ref dcgmPolicy_v1 + */ +#define dcgmPolicy_version1 MAKE_DCGM_VERSION(dcgmPolicy_v1, 1) + +/** + * Latest version for \ref dcgmPolicy_t + */ +#define dcgmPolicy_version dcgmPolicy_version1 + + +/** + * Define the ECC DBE return structure + */ +typedef struct +{ + long long timestamp; //!< timestamp of the error + enum + { + L1, + L2, + DEVICE, + REGISTER, + TEXTURE + } location; //!< location of the error + unsigned int numerrors; //!< number of errors +} dcgmPolicyConditionDbe_t; + +/** + * Define the PCI replay error return structure + */ +typedef struct +{ + long long timestamp; //!< timestamp of the error + unsigned int counter; //!< value of the PCIe replay counter +} dcgmPolicyConditionPci_t; + +/** + * Define the maximum pending retired pages limit return structure + */ +typedef struct +{ + long long timestamp; //!< timestamp of 
the error + unsigned int sbepages; //!< number of pending pages due to SBE + unsigned int dbepages; //!< number of pending pages due to DBE +} dcgmPolicyConditionMpr_t; + +/** + * Define the thermal policy violations return structure + */ +typedef struct +{ + long long timestamp; //!< timestamp of the error + unsigned int thermalViolation; //!< Temperature reached that violated policy +} dcgmPolicyConditionThermal_t; + +/** + * Define the power policy violations return structure + */ +typedef struct +{ + long long timestamp; //!< timestamp of the error + unsigned int powerViolation; //!< Power value reached that violated policy +} dcgmPolicyConditionPower_t; + +/** + * Define the nvlink policy violations return structure + */ +typedef struct +{ + long long timestamp; //!< timestamp of the error + unsigned short fieldId; //!< Nvlink counter field ID that violated policy + unsigned int counter; //!< Nvlink counter value that violated policy +} dcgmPolicyConditionNvlink_t; + +/** + * Define the xid policy violations return structure + */ +typedef struct +{ + long long timestamp; //!< Timestamp of the error + unsigned int errnum; //!< The XID error number +} dcgmPolicyConditionXID_t; + + +/** + * Define the structure that is given to the callback function + */ +typedef struct +{ + // version must always be first + unsigned int version; //!< version number (dcgmPolicyCallbackResponse_version) + + dcgmPolicyCondition_t condition; //!< Condition that was violated + union + { + dcgmPolicyConditionDbe_t dbe; //!< ECC DBE return structure + dcgmPolicyConditionPci_t pci; //!< PCI replay error return structure + dcgmPolicyConditionMpr_t mpr; //!< Max retired pages limit return structure + dcgmPolicyConditionThermal_t thermal; //!< Thermal policy violations return structure + dcgmPolicyConditionPower_t power; //!< Power policy violations return structure + dcgmPolicyConditionNvlink_t nvlink; //!< Nvlink policy violations return structure + dcgmPolicyConditionXID_t xid; //!< XID 
policy violations return structure + } val; +} dcgmPolicyCallbackResponse_v1; + + +/** + * Typedef for \ref dcgmPolicyCallbackResponse_v1 + */ +typedef dcgmPolicyCallbackResponse_v1 dcgmPolicyCallbackResponse_t; + +/** + * Version 1 for \ref dcgmPolicyCallbackResponse_v1 + */ +#define dcgmPolicyCallbackResponse_version1 MAKE_DCGM_VERSION(dcgmPolicyCallbackResponse_v1, 1) + +/** + * Latest version for \ref dcgmPolicyCallbackResponse_t + */ +#define dcgmPolicyCallbackResponse_version dcgmPolicyCallbackResponse_version1 + +/** + * Set above size of largest blob entry. Currently this is dcgmDeviceVgpuTypeInfo_v1 + */ +#define DCGM_MAX_BLOB_LENGTH 4096 + +/** + * This structure is used to represent value for the field to be queried. + */ +typedef struct +{ + // version must always be first + unsigned int version; //!< version number (dcgmFieldValue_version1) + + unsigned short fieldId; //!< One of DCGM_FI_? + unsigned short fieldType; //!< One of DCGM_FT_? + int status; //!< Status for the querying the field. DCGM_ST_OK or one of DCGM_ST_? + int64_t ts; //!< Timestamp in usec since 1970 + union + { + int64_t i64; //!< Int64 value + double dbl; //!< Double value + char str[DCGM_MAX_STR_LENGTH]; //!< NULL terminated string + char blob[DCGM_MAX_BLOB_LENGTH]; //!< Binary blob + } value; //!< Value +} dcgmFieldValue_v1; + +/** + * Version 1 for \ref dcgmFieldValue_v1 + */ +#define dcgmFieldValue_version1 MAKE_DCGM_VERSION(dcgmFieldValue_v1, 1) + +/** + * This structure is used to represent value for the field to be queried. + */ +typedef struct +{ + // version must always be first + unsigned int version; //!< version number (dcgmFieldValue_version2) + dcgm_field_entity_group_t entityGroupId; //!< Entity group this field value's entity belongs to + dcgm_field_eid_t entityId; //!< Entity this field value belongs to + unsigned short fieldId; //!< One of DCGM_FI_? + unsigned short fieldType; //!< One of DCGM_FT_? + int status; //!< Status for the querying the field. 
DCGM_ST_OK or one of DCGM_ST_? + unsigned int unused; //!< Unused for now to align ts to an 8-byte boundary. + int64_t ts; //!< Timestamp in usec since 1970 + union + { + int64_t i64; //!< Int64 value + double dbl; //!< Double value + char str[DCGM_MAX_STR_LENGTH]; //!< NULL terminated string + char blob[DCGM_MAX_BLOB_LENGTH]; //!< Binary blob + } value; //!< Value +} dcgmFieldValue_v2; + +/** + * Version 2 for \ref dcgmFieldValue_v2 + */ +#define dcgmFieldValue_version2 MAKE_DCGM_VERSION(dcgmFieldValue_v2, 2) + +/** + * Field value flags used by \ref dcgmEntitiesGetLatestValues + * + * Retrieve live data from the driver rather than cached data. + * Warning: Setting this flag will result in multiple calls to the NVIDIA driver that will be much slower than + * retrieving a cached value. + */ +#define DCGM_FV_FLAG_LIVE_DATA 0x00000001 + +/** + * User callback function for processing one or more field updates. This callback will + * be invoked one or more times per field until all of the expected field values have been + * enumerated. It is up to the callee to detect when the field id changes + * + * @param gpuId IN: GPU ID of the GPU this field value set belongs to + * @param values IN: Field values. These values must be copied as they will be destroyed as soon as this + * call returns. + * @param numValues IN: Number of entries that are valid in values[] + * @param userData IN: User data pointer passed to the update function that generated this callback + * + * @returns + * 0 if OK + * <0 if enumeration should stop. This allows to callee to abort field value enumeration. + * + */ +typedef int (*dcgmFieldValueEnumeration_f)(unsigned int gpuId, + dcgmFieldValue_v1 *values, + int numValues, + void *userData); + +/** + * User callback function for processing one or more field updates. This callback will + * be invoked one or more times per field until all of the expected field values have been + * enumerated. 
It is up to the callee to detect when the field id changes + * + * @param entityGroupId IN: entityGroup of the entity this field value set belongs to + * @param entityId IN: Entity this field value set belongs to + * @param values IN: Field values. These values must be copied as they will be destroyed as soon as this + * call returns. + * @param numValues IN: Number of entries that are valid in values[] + * @param userData IN: User data pointer passed to the update function that generated this callback + * + * @returns + * 0 if OK + * <0 if enumeration should stop. This allows to callee to abort field value enumeration. + * + */ +typedef int (*dcgmFieldValueEntityEnumeration_f)(dcgm_field_entity_group_t entityGroupId, + dcgm_field_eid_t entityId, + dcgmFieldValue_v1 *values, + int numValues, + void *userData); + + +/** + * Summary of time series data in int64 format. + * + * Each value will either be set or be a BLANK value. + * Check for blank with the DCGM_INT64_IS_BLANK() macro. + * \sa See dcgmvalue.h for the actual values of BLANK values + */ +typedef struct +{ + long long minValue; //!< Minimum value of the samples looked at + long long maxValue; //!< Maximum value of the samples looked at + long long average; //!< Simple average of the samples looked at. Blank values are ignored for this calculation +} dcgmStatSummaryInt64_t; + +/** + * Same as dcgmStatSummaryInt64_t, but with 32-bit integer values + */ +typedef struct +{ + int minValue; //!< Minimum value of the samples looked at + int maxValue; //!< Maximum value of the samples looked at + int average; //!< Simple average of the samples looked at. Blank values are ignored for this calculation +} dcgmStatSummaryInt32_t; + +/** + * Summary of time series data in double-precision format. + * Each value will either be set or be a BLANK value. + * Check for blank with the DCGM_FP64_IS_BLANK() macro. 
+ * \sa See dcgmvalue.h for the actual values of BLANK values + */ +typedef struct +{ + double minValue; //!< Minimum value of the samples looked at + double maxValue; //!< Maximum value of the samples looked at + double average; //!< Simple average of the samples looked at. Blank values are ignored for this calculation +} dcgmStatSummaryFp64_t; + +/** + * Systems structure used to enable or disable health watch systems + */ +typedef enum dcgmHealthSystems_enum +{ + DCGM_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches (must have 1m of data before query) + DCGM_HEALTH_WATCH_NVLINK = 0x2, //!< NVLINK system watches + DCGM_HEALTH_WATCH_PMU = 0x4, //!< Power management unit watches + DCGM_HEALTH_WATCH_MCU = 0x8, //!< Micro-controller unit watches + DCGM_HEALTH_WATCH_MEM = 0x10, //!< Memory watches + DCGM_HEALTH_WATCH_SM = 0x20, //!< Streaming multiprocessor watches + DCGM_HEALTH_WATCH_INFOROM = 0x40, //!< Inforom watches + DCGM_HEALTH_WATCH_THERMAL = 0x80, //!< Temperature watches (must have 1m of data before query) + DCGM_HEALTH_WATCH_POWER = 0x100, //!< Power watches (must have 1m of data before query) + DCGM_HEALTH_WATCH_DRIVER = 0x200, //!< Driver-related watches + DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL = 0x400, //!< Non-fatal errors in NvSwitch + DCGM_HEALTH_WATCH_NVSWITCH_FATAL = 0x800, //!< Fatal errors in NvSwitch + + // ... 
+ DCGM_HEALTH_WATCH_ALL = 0xFFFFFFFF //!< All watches enabled +} dcgmHealthSystems_t; + +#define DCGM_HEALTH_WATCH_COUNT_V1 10 /*!< For iterating through the dcgmHealthSystems_v1 enum */ +#define DCGM_HEALTH_WATCH_COUNT_V2 12 /*!< For iterating through the dcgmHealthSystems_v2 enum */ + +/** + * Health Watch test results + */ +typedef enum dcgmHealthWatchResult_enum +{ + DCGM_HEALTH_RESULT_PASS = 0, //!< All results within this system are reporting normal + DCGM_HEALTH_RESULT_WARN = 10, //!< A warning has been issued, refer to the response for more information + DCGM_HEALTH_RESULT_FAIL = 20, //!< A failure has been issued, refer to the response for more information +} dcgmHealthWatchResults_t; + +typedef struct +{ + char msg[1024]; + unsigned int code; +} dcgmDiagErrorDetail_t; + +#define DCGM_ERR_MSG_LENGTH 512 +/** + * Error details + * + * Since DCGM 3.3 + */ +typedef struct +{ + char msg[DCGM_ERR_MSG_LENGTH]; + int gpuId; + unsigned int code; + unsigned int category; //!< See dcgmErrorCategory_t + unsigned int severity; //!< See dcgmErrorSeverity_t +} dcgmDiagErrorDetail_v2; + +#define DCGM_HEALTH_WATCH_MAX_INCIDENTS DCGM_GROUP_MAX_ENTITIES + +typedef struct +{ + dcgmHealthSystems_t system; //!< system to which this information belongs + dcgmHealthWatchResults_t health; //!< health diagnosis of this incident + dcgmDiagErrorDetail_t error; //!< Information about the error(s) and their error codes + dcgmGroupEntityPair_t entityInfo; //!< identify which entity has this error +} dcgmIncidentInfo_t; + +/** + * Health response structure version 4 - Simply list the incidents instead of reporting by entity + * + * Since DCGM 2.0 + */ +typedef struct +{ + unsigned int version; //!< The version number of this struct + dcgmHealthWatchResults_t overallHealth; //!< The overall health of this entire host + unsigned int incidentCount; //!< The number of health incidents reported in this struct + dcgmIncidentInfo_t incidents[DCGM_HEALTH_WATCH_MAX_INCIDENTS]; //!< Report of the 
errors detected +} dcgmHealthResponse_v4; + +/** + * Version 4 for \ref dcgmHealthResponse_v4 + */ +#define dcgmHealthResponse_version4 MAKE_DCGM_VERSION(dcgmHealthResponse_v4, 4) + +/** + * Latest version for \ref dcgmHealthResponse_t + */ +#define dcgmHealthResponse_version dcgmHealthResponse_version4 + +/** + * Typedef for \ref dcgmHealthResponse_v4 + */ +typedef dcgmHealthResponse_v4 dcgmHealthResponse_t; + +/** + * Structure used to set health watches via the dcgmHealthSet_v2 API + */ +typedef struct +{ + unsigned int version; /*!< Version of this struct. Should be dcgmHealthSet_version2 */ + dcgmGpuGrp_t groupId; /*!< Group ID representing collection of one or more entities. Look + at \ref dcgmGroupCreate for details on creating the group. + Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS + to perform operation on all the GPUs or \a DCGM_GROUP_ALL_NVSWITCHES + to perform operation on all the NvSwitches. */ + dcgmHealthSystems_t systems; /*!< An enum representing systems that should be enabled for health + checks logically OR'd together. Refer to \ref dcgmHealthSystems_t + for details. */ + long long updateInterval; /*!< How often to query the underlying health information from the + NVIDIA driver in usec. This should be the same as how often you call + dcgmHealthCheck */ + double maxKeepAge; /*!< How long to keep data cached for this field in seconds. 
This should + be at least your maximum time between calling dcgmHealthCheck */ +} dcgmHealthSetParams_v2; + +/** + * Version 2 for \ref dcgmHealthSet_v2 + */ +#define dcgmHealthSetParams_version2 MAKE_DCGM_VERSION(dcgmHealthSetParams_v2, 2) + + +#define DCGM_MAX_PID_INFO_NUM 16 +/** + * per process utilization rates + */ +typedef struct +{ + unsigned int pid; + double smUtil; + double memUtil; +} dcgmProcessUtilInfo_t; + +/** + *Internal structure used to get the PID and the corresponding utilization rate + */ +typedef struct +{ + double util; + unsigned int pid; +} dcgmProcessUtilSample_t; + +/** + * Info corresponding to single PID + */ +typedef struct +{ + unsigned int gpuId; //!< ID of the GPU this pertains to. GPU_ID_INVALID = summary information for multiple GPUs + + /* All of the following are during the process's lifetime */ + + long long energyConsumed; //!< Energy consumed by the gpu in milli-watt/seconds + dcgmStatSummaryInt64_t pcieRxBandwidth; //!< PCI-E bytes read from the GPU + dcgmStatSummaryInt64_t pcieTxBandwidth; //!< PCI-E bytes written to the GPU + long long pcieReplays; //!< Count of PCI-E replays that occurred + long long startTime; //!< Process start time in microseconds since 1970 + long long endTime; //!< Process end time in microseconds since 1970 or reported as 0 if the process is not completed + dcgmProcessUtilInfo_t processUtilization; //!< Process SM and Memory Utilization (in percent) + dcgmStatSummaryInt32_t smUtilization; //!< GPU SM Utilization in percent + dcgmStatSummaryInt32_t memoryUtilization; //!< GPU Memory Utilization in percent + unsigned int eccSingleBit; //!< Deprecated - Count of ECC single bit errors that occurred + unsigned int eccDoubleBit; //!< Count of ECC double bit errors that occurred + dcgmStatSummaryInt32_t memoryClock; //!< Memory clock in MHz + dcgmStatSummaryInt32_t smClock; //!< SM clock in MHz + + int numXidCriticalErrors; //!< Number of valid entries in xidCriticalErrorsTs + long long 
xidCriticalErrorsTs[10]; //!< Timestamps of the critical XID errors that occurred + + int numOtherComputePids; //!< Count of otherComputePids entries that are valid + unsigned int otherComputePids[DCGM_MAX_PID_INFO_NUM]; //!< Other compute processes that ran. 0=no process + + int numOtherGraphicsPids; //!< Count of otherGraphicsPids entries that are valid + unsigned int otherGraphicsPids[DCGM_MAX_PID_INFO_NUM]; //!< Other graphics processes that ran. 0=no process + + long long maxGpuMemoryUsed; //!< Maximum amount of GPU memory that was used in bytes + + long long powerViolationTime; //!< Number of microseconds we were at reduced clocks due to power violation + long long thermalViolationTime; //!< Number of microseconds we were at reduced clocks due to thermal violation + long long reliabilityViolationTime; //!< Amount of microseconds we were at reduced clocks + //!< due to the reliability limit + long long boardLimitViolationTime; //!< Amount of microseconds we were at reduced clocks due to being at the + //!< board's max voltage + long long lowUtilizationTime; //!< Amount of microseconds we were at reduced clocks due to low utilization + long long syncBoostTime; //!< Amount of microseconds we were at reduced clocks due to sync boost + dcgmHealthWatchResults_t overallHealth; //!< The overall health of the system. 
\ref dcgmHealthWatchResults_t + unsigned int incidentCount; + struct + { + dcgmHealthSystems_t system; //!< system to which this information belongs + dcgmHealthWatchResults_t health; //!< health of the specified system on this GPU + } systems[DCGM_HEALTH_WATCH_COUNT_V1]; +} dcgmPidSingleInfo_t; + +/** + * To store process statistics + */ +typedef struct +{ + unsigned int version; //!< Version of this message (dcgmPidInfo_version) + unsigned int pid; //!< PID of the process + unsigned int unused; + int numGpus; //!< Number of GPUs that are valid in GPUs + dcgmPidSingleInfo_t summary; //!< Summary information for all GPUs listed in gpus[] + dcgmPidSingleInfo_t gpus[DCGM_MAX_NUM_DEVICES]; //!< Per-GPU information for this PID +} dcgmPidInfo_v2; + +/** + * Typedef for \ref dcgmPidInfo_v2 + */ +typedef dcgmPidInfo_v2 dcgmPidInfo_t; + +/** + * Version 2 for \ref dcgmPidInfo_v2 + */ +#define dcgmPidInfo_version2 MAKE_DCGM_VERSION(dcgmPidInfo_v2, 2) + +/** + * Latest version for \ref dcgmPidInfo_t + */ +#define dcgmPidInfo_version dcgmPidInfo_version2 + +/** + * Info corresponding to the job on a GPU + */ +typedef struct +{ + unsigned int gpuId; //!< ID of the GPU this pertains to. 
GPU_ID_INVALID = summary information for multiple GPUs + + /* All of the following are during the job's lifetime */ + + long long energyConsumed; //!< Energy consumed in milli-watt/seconds + dcgmStatSummaryFp64_t powerUsage; //!< Power usage Min/Max/Avg in watts + dcgmStatSummaryInt64_t pcieRxBandwidth; //!< PCI-E bytes read from the GPU + dcgmStatSummaryInt64_t pcieTxBandwidth; //!< PCI-E bytes written to the GPU + long long pcieReplays; //!< Count of PCI-E replays that occurred + long long startTime; //!< User provided job start time in microseconds since 1970 + long long endTime; //!< User provided job end time in microseconds since 1970 + dcgmStatSummaryInt32_t smUtilization; //!< GPU SM Utilization in percent + dcgmStatSummaryInt32_t memoryUtilization; //!< GPU Memory Utilization in percent + unsigned int eccSingleBit; //!< Deprecated - Count of ECC single bit errors that occurred + unsigned int eccDoubleBit; //!< Count of ECC double bit errors that occurred + dcgmStatSummaryInt32_t memoryClock; //!< Memory clock in MHz + dcgmStatSummaryInt32_t smClock; //!< SM clock in MHz + + int numXidCriticalErrors; //!< Number of valid entries in xidCriticalErrorsTs + long long xidCriticalErrorsTs[10]; //!< Timestamps of the critical XID errors that occurred + + int numComputePids; //!< Count of computePids entries that are valid + dcgmProcessUtilInfo_t computePidInfo[DCGM_MAX_PID_INFO_NUM]; //!< List of compute processes that ran during the job + //!< 0=no process + + int numGraphicsPids; //!< Count of graphicsPids entries that are valid + dcgmProcessUtilInfo_t graphicsPidInfo[DCGM_MAX_PID_INFO_NUM]; //!< List of compute processes that ran during the job + //!< 0=no process + + long long maxGpuMemoryUsed; //!< Maximum amount of GPU memory that was used in bytes + + long long powerViolationTime; //!< Number of microseconds we were at reduced clocks due to power violation + long long thermalViolationTime; //!< Number of microseconds we were at reduced clocks due to thermal 
violation + long long reliabilityViolationTime; //!< Amount of microseconds we were at reduced clocks + //!< due to the reliability limit + long long boardLimitViolationTime; //!< Amount of microseconds we were at reduced clocks + //!< due to being at the board's max voltage + long long lowUtilizationTime; //!< Amount of microseconds we were at reduced clocks due to low utilization + long long syncBoostTime; //!< Amount of microseconds we were at reduced clocks due to sync boost + dcgmHealthWatchResults_t overallHealth; //!< The overall health of the system. \ref dcgmHealthWatchResults_t + unsigned int incidentCount; + struct + { + dcgmHealthSystems_t system; //!< system to which this information belongs + dcgmHealthWatchResults_t health; //!< health of the specified system on this GPU + } systems[DCGM_HEALTH_WATCH_COUNT_V1]; +} dcgmGpuUsageInfo_t; + + +/** + * To store job statistics + * The following fields are not applicable in the summary info: + * - pcieRxBandwidth (Min/Max) + * - pcieTxBandwidth (Min/Max) + * - smUtilization (Min/Max) + * - memoryUtilization (Min/Max) + * - memoryClock (Min/Max) + * - smClock (Min/Max) + * - processSamples + * + * The average value in the above fields (in the summary) is the + * average of the averages of respective fields from all GPUs + */ +typedef struct +{ + unsigned int version; //!< Version of this message (dcgmPidInfo_version) + int numGpus; //!< Number of GPUs that are valid in gpus[] + dcgmGpuUsageInfo_t summary; //!< Summary information for all GPUs listed in gpus[] + dcgmGpuUsageInfo_t gpus[DCGM_MAX_NUM_DEVICES]; //!< Per-GPU information for this PID +} dcgmJobInfo_v3; + +/** + * Typedef for \ref dcgmJobInfo_v3 + */ +typedef dcgmJobInfo_v3 dcgmJobInfo_t; + +/** + * Version 3 for \ref dcgmJobInfo_v3 + */ +#define dcgmJobInfo_version3 MAKE_DCGM_VERSION(dcgmJobInfo_v3, 3) + +/** + * Latest version for \ref dcgmJobInfo_t + */ +#define dcgmJobInfo_version dcgmJobInfo_version3 + + +/** + * Running process information for 
a compute or graphics process + */ +typedef struct +{ + unsigned int version; //!< Version of this message (dcgmRunningProcess_version) + unsigned int pid; //!< PID of the process + unsigned long long memoryUsed; //!< GPU memory used by this process in bytes. +} dcgmRunningProcess_v1; + +/** + * Typedef for \ref dcgmRunningProcess_v1 + */ +typedef dcgmRunningProcess_v1 dcgmRunningProcess_t; + +/** + * Version 1 for \ref dcgmRunningProcess_v1 + */ +#define dcgmRunningProcess_version1 MAKE_DCGM_VERSION(dcgmRunningProcess_v1, 1) + +/** + * Latest version for \ref dcgmRunningProcess_t + */ +#define dcgmRunningProcess_version dcgmRunningProcess_version1 + +/** + * Enumeration for diagnostic levels + */ +typedef enum +{ + DCGM_DIAG_LVL_INVALID = 0, //!< Uninitialized + DCGM_DIAG_LVL_SHORT = 10, //!< run a very basic health check on the system + DCGM_DIAG_LVL_MED = 20, //!< run a medium-length diagnostic (a few minutes) + DCGM_DIAG_LVL_LONG = 30, //!< run a extensive diagnostic (several minutes) + DCGM_DIAG_LVL_XLONG = 40, //!< run a very extensive diagnostic (many minutes) +} dcgmDiagnosticLevel_t; + +/** + * Diagnostic test results + */ +typedef enum dcgmDiagResult_enum +{ + DCGM_DIAG_RESULT_PASS = 0, //!< This test passed as diagnostics + DCGM_DIAG_RESULT_SKIP = 1, //!< This test was skipped + DCGM_DIAG_RESULT_WARN = 2, //!< This test passed with warnings + DCGM_DIAG_RESULT_FAIL = 3, //!< This test failed the diagnostics + DCGM_DIAG_RESULT_NOT_RUN = 4, //!< This test wasn't executed +} dcgmDiagResult_t; + +typedef struct +{ + dcgmDiagResult_t status; //!< The result of the test + dcgmDiagErrorDetail_t error; //!< The error message and error code, if any + char info[1024]; //!< Information details returned from the test, if any +} dcgmDiagTestResult_v2; + +#define DCGM_MAX_ERRORS 5 +typedef struct +{ + dcgmDiagResult_t status; //!< The result of the test + dcgmDiagErrorDetail_v2 error[DCGM_MAX_ERRORS]; //!< The error message and error code, if any + char 
info[DCGM_ERR_MSG_LENGTH]; //!< Information details returned from the test, if any +} dcgmDiagTestResult_v3; + +/** + * Diagnostic per gpu tests - fixed indices for dcgmDiagResponsePerGpu_t.results[] + */ +typedef enum dcgmPerGpuTestIndices_enum +{ + DCGM_MEMORY_INDEX = 0, //!< Memory test index + DCGM_DIAGNOSTIC_INDEX = 1, //!< Diagnostic test index + DCGM_PCI_INDEX = 2, //!< PCIe test index + DCGM_SM_STRESS_INDEX = 3, //!< SM Stress test index + DCGM_TARGETED_STRESS_INDEX = 4, //!< Targeted Stress test index + DCGM_TARGETED_POWER_INDEX = 5, //!< Targeted Power test index + DCGM_MEMORY_BANDWIDTH_INDEX = 6, //!< Memory bandwidth test index + DCGM_MEMTEST_INDEX = 7, //!< Memtest test index + DCGM_PULSE_TEST_INDEX = 8, //!< Pulse test index + DCGM_EUD_TEST_INDEX = 9, //!< EUD test index + // Remaining tests are included for convenience but have different execution rules + // See DCGM_PER_GPU_TEST_COUNT + DCGM_UNUSED2_TEST_INDEX = 10, + DCGM_UNUSED3_TEST_INDEX = 11, + DCGM_UNUSED4_TEST_INDEX = 12, + DCGM_UNUSED5_TEST_INDEX = 13, + DCGM_SOFTWARE_INDEX = 14, //!< Software test index + DCGM_CONTEXT_CREATE_INDEX = 15, //!< Context create test index + DCGM_UNKNOWN_INDEX = 16 //!< Unknown test +} dcgmPerGpuTestIndices_t; + +// TODO: transition these to dcgm_deprecated.h +#define DCGM_SM_PERF_INDEX DCGM_SM_STRESS_INDEX +#define DCGM_TARGETED_PERF_INDEX DCGM_TARGETED_PERF_INDEX + +// Number of diag tests +// NOTE: does not include software and context_create which have different execution rules +#define DCGM_PER_GPU_TEST_COUNT_V8 13 +#define DCGM_PER_GPU_TEST_COUNT_V7 9 + +/** + * Per GPU diagnostics result structure + */ +typedef struct +{ + unsigned int gpuId; //!< ID for the GPU this information pertains + unsigned int hwDiagnosticReturn; //!< Per GPU hardware diagnostic test return code + dcgmDiagTestResult_v2 results[DCGM_PER_GPU_TEST_COUNT_V8]; //!< Array with a result for each per-gpu test +} dcgmDiagResponsePerGpu_v4; + +typedef struct +{ + unsigned int gpuId; //!< ID 
for the GPU this information pertains + unsigned int hwDiagnosticReturn; //!< Per GPU hardware diagnostic test return code + dcgmDiagTestResult_v3 results[DCGM_PER_GPU_TEST_COUNT_V8]; //!< Array with a result for each per-gpu test +} dcgmDiagResponsePerGpu_v5; + +/** + * Per gpu response structure v3 + * + * Since DCGM 2.4 + */ +typedef struct +{ + unsigned int gpuId; //!< ID for the GPU this information pertains + unsigned int hwDiagnosticReturn; //!< Per GPU hardware diagnostic test return code + dcgmDiagTestResult_v2 results[DCGM_PER_GPU_TEST_COUNT_V7]; //!< Array with a result for each per-gpu test +} dcgmDiagResponsePerGpu_v3; + + +#define DCGM_SWTEST_COUNT 10 +#define LEVEL_ONE_MAX_RESULTS 16 + +typedef enum dcgmSoftwareTest_enum +{ + DCGM_SWTEST_DENYLIST = 0, //!< test for presence of drivers on the denylist (e.g. nouveau) + DCGM_SWTEST_NVML_LIBRARY = 1, //!< test for presence (and version) of NVML lib + DCGM_SWTEST_CUDA_MAIN_LIBRARY = 2, //!< test for presence (and version) of CUDA lib + DCGM_SWTEST_CUDA_RUNTIME_LIBRARY = 3, //!< test for presence (and version) of CUDA RT lib + DCGM_SWTEST_PERMISSIONS = 4, //!< test for character device permissions + DCGM_SWTEST_PERSISTENCE_MODE = 5, //!< test for persistence mode enabled + DCGM_SWTEST_ENVIRONMENT = 6, //!< test for CUDA environment vars that may slow tests + DCGM_SWTEST_PAGE_RETIREMENT = 7, //!< test for pending frame buffer page retirement + DCGM_SWTEST_GRAPHICS_PROCESSES = 8, //!< test for graphics processes running + DCGM_SWTEST_INFOROM = 9, //!< test for inforom corruption +} dcgmSoftwareTest_t; + +#define DCGM_DEVICE_ID_LEN 5 +#define DCGM_VERSION_LEN 12 + +/** + * Global diagnostics result structure v9 + * + * Since DCGM 3.3 + */ +typedef struct +{ + unsigned int version; //!< version number (dcgmDiagResult_version) + unsigned int gpuCount; //!< number of valid per GPU results + unsigned int levelOneTestCount; //!< number of valid levelOne results + + dcgmDiagTestResult_v3 
levelOneResults[LEVEL_ONE_MAX_RESULTS]; //!< Basic, system-wide test results. + dcgmDiagResponsePerGpu_v5 perGpuResponses[DCGM_MAX_NUM_DEVICES]; //!< per GPU test results + dcgmDiagErrorDetail_v2 systemError; //!< System-wide error reported from NVVS + char devIds[DCGM_MAX_NUM_DEVICES][DCGM_DEVICE_ID_LEN]; //!< The SKU device id for each GPU + char devSerials[DCGM_MAX_NUM_DEVICES][DCGM_MAX_STR_LENGTH]; //!< Serial for the device + char dcgmVersion[DCGM_VERSION_LEN]; //!< A string representing DCGM's version + char driverVersion[DCGM_MAX_STR_LENGTH]; //!< A string representing the driver version + char _unused[596]; //!< No longer used +} dcgmDiagResponse_v9; + +/** + * Global diagnostics result structure v8 + * + * Since DCGM 3.0 + */ +typedef struct +{ + unsigned int version; //!< version number (dcgmDiagResult_version) + unsigned int gpuCount; //!< number of valid per GPU results + unsigned int levelOneTestCount; //!< number of valid levelOne results + + dcgmDiagTestResult_v2 levelOneResults[LEVEL_ONE_MAX_RESULTS]; //!< Basic, system-wide test results. + dcgmDiagResponsePerGpu_v4 perGpuResponses[DCGM_MAX_NUM_DEVICES]; //!< per GPU test results + dcgmDiagErrorDetail_t systemError; //!< System-wide error reported from NVVS + char devIds[DCGM_MAX_NUM_DEVICES][DCGM_DEVICE_ID_LEN]; //!< The SKU device id for each GPU + char dcgmVersion[DCGM_VERSION_LEN]; //!< A string representing DCGM's version + char driverVersion[DCGM_MAX_STR_LENGTH]; //!< A string representing the driver version + char _unused[596]; //!< No longer used +} dcgmDiagResponse_v8; + +/** + * Global diagnostics result structure v7 + * + * Since DCGM 2.4 + */ +typedef struct +{ + unsigned int version; //!< version number (dcgmDiagResult_version) + unsigned int gpuCount; //!< number of valid per GPU results + unsigned int levelOneTestCount; //!< number of valid levelOne results + + dcgmDiagTestResult_v2 levelOneResults[LEVEL_ONE_MAX_RESULTS]; //!< Basic, system-wide test results. 
+ dcgmDiagResponsePerGpu_v3 perGpuResponses[DCGM_MAX_NUM_DEVICES]; //!< per GPU test results + dcgmDiagErrorDetail_t systemError; //!< System-wide error reported from NVVS + char _unused[1024]; //!< No longer used +} dcgmDiagResponse_v7; + +/** + * Typedef for \ref dcgmDiagResponse_v9 + */ +typedef dcgmDiagResponse_v9 dcgmDiagResponse_t; + +/** + * Version 9 for \ref dcgmDiagResponse_v9 + */ +#define dcgmDiagResponse_version9 MAKE_DCGM_VERSION(dcgmDiagResponse_v9, 9) + +/** + * Version 8 for \ref dcgmDiagResponse_v8 + */ +#define dcgmDiagResponse_version8 MAKE_DCGM_VERSION(dcgmDiagResponse_v8, 8) + +/** + * Version 7 for \ref dcgmDiagResponse_v7 + */ +#define dcgmDiagResponse_version7 MAKE_DCGM_VERSION(dcgmDiagResponse_v7, 7) + +/** + * Latest version for \ref dcgmDiagResponse_t + */ +#define dcgmDiagResponse_version dcgmDiagResponse_version9 + +/** + * Represents level relationships within a system between two GPUs + * The enums are spaced to allow for future relationships. + * These match the definitions in nvml.h + */ +typedef enum dcgmGpuLevel_enum +{ + DCGM_TOPOLOGY_UNINITIALIZED = 0x0, + + /** \name PCI connectivity states */ + /**@{*/ + DCGM_TOPOLOGY_BOARD = 0x1, //!< multi-GPU board + DCGM_TOPOLOGY_SINGLE = 0x2, //!< all devices that only need traverse a single PCIe switch + DCGM_TOPOLOGY_MULTIPLE = 0x4, //!< all devices that need not traverse a host bridge + DCGM_TOPOLOGY_HOSTBRIDGE = 0x8, //!< all devices that are connected to the same host bridge + DCGM_TOPOLOGY_CPU = 0x10, //!< all devices that are connected to the same CPU but possibly multiple host bridges + DCGM_TOPOLOGY_SYSTEM = 0x20, //!< all devices in the system + /**@}*/ + + /** \name NVLINK connectivity states */ + /**@{*/ + DCGM_TOPOLOGY_NVLINK1 = 0x0100, //!< GPUs connected via a single NVLINK link + DCGM_TOPOLOGY_NVLINK2 = 0x0200, //!< GPUs connected via two NVLINK links + DCGM_TOPOLOGY_NVLINK3 = 0x0400, //!< GPUs connected via three NVLINK links + DCGM_TOPOLOGY_NVLINK4 = 0x0800, //!< GPUs 
connected via four NVLINK links + DCGM_TOPOLOGY_NVLINK5 = 0x1000, //!< GPUs connected via five NVLINK links + DCGM_TOPOLOGY_NVLINK6 = 0x2000, //!< GPUs connected via six NVLINK links + DCGM_TOPOLOGY_NVLINK7 = 0x4000, //!< GPUs connected via seven NVLINK links + DCGM_TOPOLOGY_NVLINK8 = 0x8000, //!< GPUs connected via eight NVLINK links + DCGM_TOPOLOGY_NVLINK9 = 0x10000, //!< GPUs connected via nine NVLINK links + DCGM_TOPOLOGY_NVLINK10 = 0x20000, //!< GPUs connected via ten NVLINK links + DCGM_TOPOLOGY_NVLINK11 = 0x40000, //!< GPUs connected via eleven NVLINK links + DCGM_TOPOLOGY_NVLINK12 = 0x80000, //!< GPUs connected via twelve NVLINK links + DCGM_TOPOLOGY_NVLINK13 = 0x100000, //!< GPUs connected via twelve NVLINK links + DCGM_TOPOLOGY_NVLINK14 = 0x200000, //!< GPUs connected via twelve NVLINK links + DCGM_TOPOLOGY_NVLINK15 = 0x400000, //!< GPUs connected via twelve NVLINK links + DCGM_TOPOLOGY_NVLINK16 = 0x800000, //!< GPUs connected via twelve NVLINK links + DCGM_TOPOLOGY_NVLINK17 = 0x1000000, //!< GPUs connected via twelve NVLINK links + DCGM_TOPOLOGY_NVLINK18 = 0x2000000, //!< GPUs connected via twelve NVLINK links + /**@}*/ +} dcgmGpuTopologyLevel_t; + +// the PCI paths are the lower 8 bits of the path information +#define DCGM_TOPOLOGY_PATH_PCI(x) (dcgmGpuTopologyLevel_t)((unsigned int)(x)&0xFF) + +// the NVLINK paths are the upper 24 bits of the path information +#define DCGM_TOPOLOGY_PATH_NVLINK(x) (dcgmGpuTopologyLevel_t)((unsigned int)(x)&0xFFFFFF00) + +#define DCGM_AFFINITY_BITMASK_ARRAY_SIZE 8 + +/** + * Device topology information + */ +typedef struct +{ + unsigned int version; //!< version number (dcgmDeviceTopology_version) + + unsigned long cpuAffinityMask[DCGM_AFFINITY_BITMASK_ARRAY_SIZE]; //!< affinity mask for the specified GPU + //!< a 1 represents affinity to the CPU in that + //!< bit position supports up to 256 cores + unsigned int numGpus; //!< number of valid entries in gpuPaths + + struct + { + unsigned int gpuId; //!< gpuId to which the 
path represents + dcgmGpuTopologyLevel_t path; //!< path to the gpuId from this GPU. Note that this is a bit-mask + //!< of DCGM_TOPOLOGY_* values and can contain both PCIe topology + //!< and NvLink topology where applicable. For instance: + //!< 0x210 = DCGM_TOPOLOGY_CPU | DCGM_TOPOLOGY_NVLINK2 + //!< Use the macros DCGM_TOPOLOGY_PATH_NVLINK and + //!< DCGM_TOPOLOGY_PATH_PCI to mask the NvLink and PCI paths, respectively. + unsigned int localNvLinkIds; //!< bits representing the local links connected to gpuId + //!< e.g. if this field == 3, links 0 and 1 are connected, + //!< field is only valid if NVLINKS actually exist between GPUs + } gpuPaths[DCGM_MAX_NUM_DEVICES - 1]; +} dcgmDeviceTopology_v1; + +/** + * Typedef for \ref dcgmDeviceTopology_v1 + */ +typedef dcgmDeviceTopology_v1 dcgmDeviceTopology_t; + +/** + * Version 1 for \ref dcgmDeviceTopology_v1 + */ +#define dcgmDeviceTopology_version1 MAKE_DCGM_VERSION(dcgmDeviceTopology_v1, 1) + +/** + * Latest version for \ref dcgmDeviceTopology_t + */ +#define dcgmDeviceTopology_version dcgmDeviceTopology_version1 + +/** + * Group topology information + */ +typedef struct +{ + unsigned int version; //!< version number (dcgmGroupTopology_version) + + unsigned long + groupCpuAffinityMask[DCGM_AFFINITY_BITMASK_ARRAY_SIZE]; //!< the CPU affinity mask for all GPUs in the group + //!< a 1 represents affinity to the CPU in that bit + //!< position supports up to 256 cores + unsigned int numaOptimalFlag; //!< a zero value indicates that 1 or more GPUs + //!< in the group have a different CPU affinity and thus + //!< may not be optimal for certain algorithms + dcgmGpuTopologyLevel_t slowestPath; //!< the slowest path amongst GPUs in the group +} dcgmGroupTopology_v1; + +/** + * Typedef for \ref dcgmGroupTopology_v1 + */ +typedef dcgmGroupTopology_v1 dcgmGroupTopology_t; + +/** + * Version 1 for \ref dcgmGroupTopology_v1 + */ +#define dcgmGroupTopology_version1 MAKE_DCGM_VERSION(dcgmGroupTopology_v1, 1) + +/** + * Latest 
version for \ref dcgmGroupTopology_t + */ +#define dcgmGroupTopology_version dcgmGroupTopology_version1 + +/** + * DCGM Memory usage information + */ +typedef struct +{ + unsigned int version; //!< version number (dcgmIntrospectMemory_version) + long long bytesUsed; //!< number of bytes +} dcgmIntrospectMemory_v1; + +/** + * Typedef for \ref dcgmIntrospectMemory_t + */ +typedef dcgmIntrospectMemory_v1 dcgmIntrospectMemory_t; + +/** + * Version 1 for \ref dcgmIntrospectMemory_t + */ +#define dcgmIntrospectMemory_version1 MAKE_DCGM_VERSION(dcgmIntrospectMemory_v1, 1) + +/** + * Latest version for \ref dcgmIntrospectMemory_t + */ +#define dcgmIntrospectMemory_version dcgmIntrospectMemory_version1 + +/** + * DCGM CPU Utilization information. Multiply values by 100 to get them in %. + */ +typedef struct +{ + unsigned int version; //!< version number (dcgmMetadataCpuUtil_version) + double total; //!< fraction of device's CPU resources that were used + double kernel; //!< fraction of device's CPU resources that were used in kernel mode + double user; //!< fraction of device's CPU resources that were used in user mode +} dcgmIntrospectCpuUtil_v1; + +/** + * Typedef for \ref dcgmIntrospectCpuUtil_t + */ +typedef dcgmIntrospectCpuUtil_v1 dcgmIntrospectCpuUtil_t; + +/** + * Version 1 for \ref dcgmIntrospectCpuUtil_t + */ +#define dcgmIntrospectCpuUtil_version1 MAKE_DCGM_VERSION(dcgmIntrospectCpuUtil_v1, 1) + +/** + * Latest version for \ref dcgmIntrospectCpuUtil_t + */ +#define dcgmIntrospectCpuUtil_version dcgmIntrospectCpuUtil_version1 + +#define DCGM_MAX_CONFIG_FILE_LEN 10000 +#define DCGM_MAX_TEST_NAMES 20 +#define DCGM_MAX_TEST_NAMES_LEN 50 +#define DCGM_MAX_TEST_PARMS 100 +#define DCGM_MAX_TEST_PARMS_LEN 100 +#define DCGM_GPU_LIST_LEN 50 +#define DCGM_FILE_LEN 30 +#define DCGM_PATH_LEN 128 +#define DCGM_THROTTLE_MASK_LEN 50 + +/** + * Flags options for running the GPU diagnostic + * @{ + * + */ + +#define DCGM_HOME_DIR_VAR_NAME "DCGM_HOME_DIR" + +/** + * Output in 
verbose mode; include information as well as warnings + */ +#define DCGM_RUN_FLAGS_VERBOSE 0x0001 + +/** + * Output stats only on failure + */ +#define DCGM_RUN_FLAGS_STATSONFAIL 0x0002 + +/** + * UNUSED Train DCGM diagnostic and output a configuration file with golden values + */ +#define DCGM_RUN_FLAGS_TRAIN 0x0004 + +/** + * UNUSED Ignore warnings against training the diagnostic and train anyway + */ +#define DCGM_RUN_FLAGS_FORCE_TRAIN 0x0008 + +/** + * Enable fail early checks for the Targeted Stress, Targeted Power, SM Stress, and Diagnostic tests + */ +#define DCGM_RUN_FLAGS_FAIL_EARLY 0x0010 + +/** + * @} + */ + +/* + * Run diagnostic structure v7 + */ +typedef struct +{ + unsigned int version; //!< version of this message + unsigned int flags; //!< flags specifying binary options for running it. See DCGM_RUN_FLAGS_* + unsigned int debugLevel; //!< 0-5 for the debug level the GPU diagnostic will use for logging. + dcgmGpuGrp_t groupId; //!< group of GPUs to verify. Cannot be specified together with gpuList. + dcgmPolicyValidation_t validate; //!< 0-3 for which tests to run. Optional. + char testNames[DCGM_MAX_TEST_NAMES][DCGM_MAX_TEST_NAMES_LEN]; //!< Specified list of test names. Optional. + char testParms[DCGM_MAX_TEST_PARMS][DCGM_MAX_TEST_PARMS_LEN]; //!< Parameters to set for specified tests + //!< in the format: + //!< testName.parameterName=parameterValue. Optional. + char fakeGpuList[DCGM_GPU_LIST_LEN]; //!< Comma-separated list of GPUs. Cannot be specified with the groupId. + char gpuList[DCGM_GPU_LIST_LEN]; //!< Comma-separated list of GPUs. Cannot be specified with the groupId. 
+ char debugLogFile[DCGM_PATH_LEN]; //!< Alternate name for the debug log file that should be used + char statsPath[DCGM_PATH_LEN]; //!< Path that the plugin's statistics files should be written to + char configFileContents[DCGM_MAX_CONFIG_FILE_LEN]; //!< Contents of nvvs config file (likely yaml) + char throttleMask[DCGM_THROTTLE_MASK_LEN]; //!< Throttle reasons to ignore as either integer mask or csv list of + //!< reasons + char pluginPath[DCGM_PATH_LEN]; //!< Custom path to the diagnostic plugins - No longer supported as of 2.2.9 + + unsigned int currentIteration; //!< The current iteration that will be executed + unsigned int totalIterations; //!< The total iterations that will be executed + unsigned int _unusedInt1; //!< No longer used + char _unusedBuf[DCGM_PATH_LEN]; //!< No longer used + unsigned int failCheckInterval; //!< How often the fail early checks should occur when enabled. +} dcgmRunDiag_v7; + +/** + * Version 7 for \ref dcgmRunDiag_t + */ +#define dcgmRunDiag_version7 MAKE_DCGM_VERSION(dcgmRunDiag_v7, 7) + +/** + * Flags for dcgmGetEntityGroupEntities's flags parameter + * + * Only return entities that are supported by DCGM. + * This mimics the behavior of dcgmGetAllSupportedDevices(). + */ +#define DCGM_GEGE_FLAG_ONLY_SUPPORTED 0x00000001 + +/** + * Identifies a GPU NVLink error type returned by DCGM_FI_DEV_GPU_NVLINK_ERRORS + */ +typedef enum dcgmGpuNVLinkErrorType_enum +{ + DCGM_GPU_NVLINK_ERROR_RECOVERY_REQUIRED = 1, //!< NVLink link recovery error occurred + DCGM_GPU_NVLINK_ERROR_FATAL, //!< NVLink link fatal error occurred +} dcgmGpuNVLinkErrorType_t; + +/** Topology hints for dcgmSelectGpusByTopology() + * @{ + */ + +/** No hints specified */ +#define DCGM_TOPO_HINT_F_NONE 0x00000000 + +/** Ignore the health of the GPUs when picking GPUs for job + * execution. By default, only healthy GPUs are considered. 
+ */ +#define DCGM_TOPO_HINT_F_IGNOREHEALTH 0x00000001 + +/** + * @} + */ + + +typedef struct +{ + unsigned int version; //!< version of this message + uint64_t inputGpuIds; //!< bit-mask of the GPU ids to choose from + uint32_t numGpus; //!< the number of GPUs that DCGM should choose + uint64_t hintFlags; //!< Hints to ignore certain factors for the scheduling hint +} dcgmTopoSchedHint_v1; + +typedef dcgmTopoSchedHint_v1 dcgmTopoSchedHint_t; + +#define dcgmTopoSchedHint_version1 MAKE_DCGM_VERSION(dcgmTopoSchedHint_v1, 1) + +/** + * NvLink link states + */ +typedef enum dcgmNvLinkLinkState_enum +{ + DcgmNvLinkLinkStateNotSupported = 0, //!< NvLink is unsupported by this GPU (Default for GPUs) + DcgmNvLinkLinkStateDisabled = 1, //!< NvLink is supported for this link but this link is disabled + //!< (Default for NvSwitches) + DcgmNvLinkLinkStateDown = 2, //!< This NvLink link is down (inactive) + DcgmNvLinkLinkStateUp = 3 //!< This NvLink link is up (active) +} dcgmNvLinkLinkState_t; + +/** + * State of NvLink links for a GPU + */ +typedef struct +{ + dcgm_field_eid_t entityId; //!< Entity ID of the GPU (gpuId) + dcgmNvLinkLinkState_t linkState[DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1]; //!< Per-GPU link states +} dcgmNvLinkGpuLinkStatus_v1; + +typedef struct +{ + dcgm_field_eid_t entityId; //!< Entity ID of the GPU (gpuId) + dcgmNvLinkLinkState_t linkState[DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2]; //!< Per-GPU link states +} dcgmNvLinkGpuLinkStatus_v2; + + +typedef struct +{ + dcgm_field_eid_t entityId; //!< Entity ID of the GPU (gpuId) + dcgmNvLinkLinkState_t linkState[DCGM_NVLINK_MAX_LINKS_PER_GPU]; //!< Per-GPU link states +} dcgmNvLinkGpuLinkStatus_v3; + +/** + * State of NvLink links for a NvSwitch + */ +typedef struct +{ + dcgm_field_eid_t entityId; //!< Entity ID of the NvSwitch (physicalId) + dcgmNvLinkLinkState_t linkState[DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH]; //!< Per-NvSwitch link states +} dcgmNvLinkNvSwitchLinkStatus_t; + +/** + * Status of all of the NvLinks 
in a given system + */ +typedef struct +{ + unsigned int version; //!< Version of this request. Should be dcgmNvLinkStatus_version1 + unsigned int numGpus; //!< Number of entries in gpus[] that are populated + dcgmNvLinkGpuLinkStatus_v3 gpus[DCGM_MAX_NUM_DEVICES]; //!< Per-GPU NvLink link statuses + unsigned int numNvSwitches; //!< Number of entries in nvSwitches[] that are populated + dcgmNvLinkNvSwitchLinkStatus_t nvSwitches[DCGM_MAX_NUM_SWITCHES]; //!< Per-NvSwitch link statuses +} dcgmNvLinkStatus_v3; + +typedef dcgmNvLinkStatus_v3 dcgmNvLinkStatus_t; + +/** + * Version 3 of dcgmNvLinkStatus + */ +#define dcgmNvLinkStatus_version3 MAKE_DCGM_VERSION(dcgmNvLinkStatus_v3, 3) + +/* Bitmask values for dcgmGetFieldIdSummary - Sync with DcgmcmSummaryType_t */ +#define DCGM_SUMMARY_MIN 0x00000001 +#define DCGM_SUMMARY_MAX 0x00000002 +#define DCGM_SUMMARY_AVG 0x00000004 +#define DCGM_SUMMARY_SUM 0x00000008 +#define DCGM_SUMMARY_COUNT 0x00000010 +#define DCGM_SUMMARY_INTEGRAL 0x00000020 +#define DCGM_SUMMARY_DIFF 0x00000040 +#define DCGM_SUMMARY_SIZE 7 + +/* dcgmSummaryResponse_t is part of dcgmFieldSummaryRequest, so it uses dcgmFieldSummaryRequest's version. */ + +typedef struct +{ + unsigned int fieldType; //!< type of field that is summarized (int64 or fp64) + unsigned int summaryCount; //!< the number of populated summaries in \ref values + union + { + int64_t i64; + double fp64; + } values[DCGM_SUMMARY_SIZE]; //!< array for storing the values of each summary. The summaries are stored + //!< in order. For example, if MIN AND MAX are requested, then 0 will be MIN + //!< and 1 will be MAX. 
If AVG and DIFF were requested, then AVG would be 0 + //!< and 1 would be DIFF +} dcgmSummaryResponse_t; + +typedef struct +{ + unsigned int version; //!< version of this message - dcgmFieldSummaryRequest_v1 + unsigned short fieldId; //!< field id to be summarized + dcgm_field_entity_group_t entityGroupId; //!< the type of entity whose field we're getting + dcgm_field_eid_t entityId; //!< ordinal id for this entity + uint32_t summaryTypeMask; //!< bit-mask of DCGM_SUMMARY_*, the requested summaries + uint64_t startTime; //!< start time for the interval being summarized. 0 means to use + //!< any data before. + uint64_t endTime; //!< end time for the interval being summarized. 0 means to use + //!< any data after. + dcgmSummaryResponse_t response; //!< response data for this request +} dcgmFieldSummaryRequest_v1; + +typedef dcgmFieldSummaryRequest_v1 dcgmFieldSummaryRequest_t; + +#define dcgmFieldSummaryRequest_version1 MAKE_DCGM_VERSION(dcgmFieldSummaryRequest_v1, 1) + +/** + * Module IDs + */ +typedef enum +{ + DcgmModuleIdCore = 0, //!< Core DCGM - always loaded + DcgmModuleIdNvSwitch = 1, //!< NvSwitch Module + DcgmModuleIdVGPU = 2, //!< VGPU Module + DcgmModuleIdIntrospect = 3, //!< Introspection Module + DcgmModuleIdHealth = 4, //!< Health Module + DcgmModuleIdPolicy = 5, //!< Policy Module + DcgmModuleIdConfig = 6, //!< Config Module + DcgmModuleIdDiag = 7, //!< GPU Diagnostic Module + DcgmModuleIdProfiling = 8, //!< Profiling Module + DcgmModuleIdSysmon = 9, //!< System Monitoring Module + + DcgmModuleIdCount //!< Always last. 1 greater than largest value above +} dcgmModuleId_t; + +/** + * Module Status. Modules are lazy loaded, so they will be in status DcgmModuleStatusNotLoaded + * until they are used. One modules are used, they will move to another status. 
+ */ +typedef enum +{ + DcgmModuleStatusNotLoaded = 0, //!< Module has not been loaded yet + DcgmModuleStatusDenylisted = 1, //!< Module is on the denylist; can't be loaded + DcgmModuleStatusFailed = 2, //!< Loading the module failed + DcgmModuleStatusLoaded = 3, //!< Module has been loaded + DcgmModuleStatusUnloaded = 4, //!< Module has been unloaded, happens during shutdown + DcgmModuleStatusPaused = 5, /*!< Module has been paused. This is a temporary state that will + move to DcgmModuleStatusLoaded once the module is resumed. + This status implies that the module is loaded. */ +} dcgmModuleStatus_t; + +/** + * Status of all of the modules of the host engine + */ +typedef struct +{ + dcgmModuleId_t id; //!< ID of this module + dcgmModuleStatus_t status; //!< Status of this module +} dcgmModuleGetStatusesModule_t; + +/* This is larger than DcgmModuleIdCount so we can add modules without versioning this request */ +#define DCGM_MODULE_STATUSES_CAPACITY 16 + +typedef struct +{ + unsigned int version; //!< Version of this request. Should be dcgmModuleGetStatuses_version1 + unsigned int numStatuses; //!< Number of entries in statuses[] that are populated + dcgmModuleGetStatusesModule_t statuses[DCGM_MODULE_STATUSES_CAPACITY]; //!< Per-module status information +} dcgmModuleGetStatuses_v1; + +/** + * Version 1 of dcgmModuleGetStatuses + */ +#define dcgmModuleGetStatuses_version1 MAKE_DCGM_VERSION(dcgmModuleGetStatuses_v1, 1) +#define dcgmModuleGetStatuses_version dcgmModuleGetStatuses_version1 +typedef dcgmModuleGetStatuses_v1 dcgmModuleGetStatuses_t; + +/** + * Options for dcgmStartEmbedded_v2 + * + * Added in DCGM 2.0.0 + */ +typedef struct +{ + unsigned int version; /*!< Version number. Use dcgmStartEmbeddedV2Params_version1 */ + dcgmOperationMode_t opMode; /*!< IN: Collect data automatically or manually when asked by the user. */ + dcgmHandle_t dcgmHandle; /*!< OUT: DCGM Handle to use for API calls */ + const char *logFile; /*!< IN: File that DCGM should log to. 
NULL = do not log. '-' = stdout */ + DcgmLoggingSeverity_t severity; /*!< IN: Severity at which DCGM should log to logFile */ + unsigned int denyListCount; /*!< IN: Number of modules in denyList[] */ + unsigned int denyList[DcgmModuleIdCount]; /* IN: IDs of modules to add to the denylist */ +} dcgmStartEmbeddedV2Params_v1; + +/** + * Version 1 for \ref dcgmStartEmbeddedV2Params_v1 + */ +#define dcgmStartEmbeddedV2Params_version1 MAKE_DCGM_VERSION(dcgmStartEmbeddedV2Params_v1, 1) + +/** + * Options for dcgmStartEmbeddedV2Params_v2 + * + * Added in DCGM 2.4.0, renamed members in 3.0.0 + */ +typedef struct +{ + unsigned int version; /*!< Version number. Use dcgmStartEmbeddedV2Params_version2 */ + dcgmOperationMode_t opMode; /*!< IN: Collect data automatically or manually when asked by the user. */ + dcgmHandle_t dcgmHandle; /*!< OUT: DCGM Handle to use for API calls */ + const char *logFile; /*!< IN: File that DCGM should log to. NULL = do not log. '-' = stdout */ + DcgmLoggingSeverity_t severity; /*!< IN: Severity at which DCGM should log to logFile */ + unsigned int denyListCount; /*!< IN: Number of modules to be added to the denylist in denyList[] */ + const char *serviceAccount; /*!< IN: Service account for unprivileged processes */ + unsigned int denyList[DcgmModuleIdCount]; /*!< IN: IDs of modules to be added to the denylist */ +} dcgmStartEmbeddedV2Params_v2; + +/** + * Version 2 for \ref dcgmStartEmbeddedV2Params + */ +#define dcgmStartEmbeddedV2Params_version2 MAKE_DCGM_VERSION(dcgmStartEmbeddedV2Params_v2, 2) + +/** + * Maximum number of metric ID groups that can exist in DCGM + */ +#define DCGM_PROF_MAX_NUM_GROUPS_V2 10 + +/** + * Maximum number of field IDs that can be in a single DCGM profiling metric group + */ +#define DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2 64 + +/** + * Structure to return all of the profiling metric groups that are available for the given groupId. + */ +typedef struct +{ + unsigned short majorId; //!< Major ID of this metric group. 
Metric groups with the same majorId cannot be + //!< watched concurrently with other metric groups with the same majorId + unsigned short minorId; //!< Minor ID of this metric group. This distinguishes metric groups within the same + //!< major metric group from each other + unsigned int numFieldIds; //!< Number of field IDs that are populated in fieldIds[] + unsigned short fieldIds[DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2]; //!< DCGM Field IDs that are part of this profiling + //!< group. See DCGM_FI_PROF_* definitions in + //!< dcgm_fields.h for details. +} dcgmProfMetricGroupInfo_v2; + +typedef struct +{ + /** \name Input parameters + * @{ + */ + unsigned int version; //!< Version of this request. Should be dcgmProfGetMetricGroups_version + unsigned int unused; //!< Not used for now. Set to 0 + unsigned int gpuId; //!< GPU ID we should get the metric groups for. + /** + * @} + */ + + /** \name Output + * @{ + */ + unsigned int numMetricGroups; //!< Number of entries in metricGroups[] that are populated + dcgmProfMetricGroupInfo_v2 metricGroups[DCGM_PROF_MAX_NUM_GROUPS_V2]; //!< Info for each metric group + /** + * @} + */ +} dcgmProfGetMetricGroups_v3; + +/** + * Version 3 of dcgmProfGetMetricGroups_t. See dcgm_structs_24.h for v2 + */ +#define dcgmProfGetMetricGroups_version3 MAKE_DCGM_VERSION(dcgmProfGetMetricGroups_v3, 3) +#define dcgmProfGetMetricGroups_version dcgmProfGetMetricGroups_version3 +typedef dcgmProfGetMetricGroups_v3 dcgmProfGetMetricGroups_t; + +/** + * Structure to pass to dcgmProfWatchFields() when watching profiling metrics + */ +typedef struct +{ + unsigned int version; //!< Version of this request. Should be dcgmProfWatchFields_version + dcgmGpuGrp_t groupId; //!< Group ID representing collection of one or more GPUs. Look at \ref dcgmGroupCreate + //!< for details on creating the group. Alternatively, pass in the group id as \a + //!< DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. 
The GPUs of the group + //!< must all be identical or DCGM_ST_GROUP_INCOMPATIBLE will be returned by this API. + unsigned int numFieldIds; //!< Number of field IDs that are being passed in fieldIds[] + unsigned short fieldIds[64]; //!< DCGM_FI_PROF_? field IDs to watch + long long updateFreq; //!< How often to update this field in usec. Note that profiling metrics may need to be + //!< sampled more frequently than this value. See + //!< dcgmProfMetricGroupInfo_t.minUpdateFreqUsec of the metric group matching + //!< metricGroupTag to see what this minimum is. If minUpdateFreqUsec < updateFreq + //!< then samples will be aggregated to updateFreq intervals in DCGM's internal cache. + double maxKeepAge; //!< How long to keep data for every fieldId in seconds + int maxKeepSamples; //!< Maximum number of samples to keep for each fieldId. 0=no limit + unsigned int flags; //!< For future use. Set to 0 for now. +} dcgmProfWatchFields_v2; + +/** + * Version 2 of dcgmProfWatchFields_v2 + */ +#define dcgmProfWatchFields_version2 MAKE_DCGM_VERSION(dcgmProfWatchFields_v2, 2) +#define dcgmProfWatchFields_version dcgmProfWatchFields_version2 +typedef dcgmProfWatchFields_v2 dcgmProfWatchFields_t; + +/** + * Structure to pass to dcgmProfUnwatchFields when unwatching profiling metrics + */ +typedef struct +{ + unsigned int version; //!< Version of this request. Should be dcgmProfUnwatchFields_version + dcgmGpuGrp_t groupId; //!< Group ID representing collection of one or more GPUs. Look at + //!< \ref dcgmGroupCreate for details on creating the group. + //!< Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS + //!< to perform operation on all the GPUs. The GPUs of the group must all be + //!< identical or DCGM_ST_GROUP_INCOMPATIBLE will be returned by this API. + unsigned int flags; //!< For future use. Set to 0 for now. 
+} dcgmProfUnwatchFields_v1; + +/** + * Version 1 of dcgmProfUnwatchFields_v1 + */ +#define dcgmProfUnwatchFields_version1 MAKE_DCGM_VERSION(dcgmProfUnwatchFields_v1, 1) +#define dcgmProfUnwatchFields_version dcgmProfUnwatchFields_version1 +typedef dcgmProfUnwatchFields_v1 dcgmProfUnwatchFields_t; + +/** + * Version 1 of dcgmSettingsSetLoggingSeverity_t + */ +typedef struct +{ + int targetLogger; + DcgmLoggingSeverity_t targetSeverity; +} dcgmSettingsSetLoggingSeverity_v1; + + +#define dcgmSettingsSetLoggingSeverity_version1 MAKE_DCGM_VERSION(dcgmSettingsSetLoggingSeverity_v1, 1) +#define dcgmSettingsSetLoggingSeverity_version dcgmSettingsSetLoggingSeverity_version1 +typedef dcgmSettingsSetLoggingSeverity_v1 dcgmSettingsSetLoggingSeverity_t; + +/** + * Structure to describe the DCGM build environment ver 2.0 + */ +typedef struct +{ + unsigned int version; // + * Every pair is separated by a colon char (:). Only the very first colon is considered as a separation.
+ * Values can contain colon chars. Values and Keys cannot contain semicolon chars.
+ * Usually defined keys are: + *

+ * version : DCGM Version.
+ * arch : Target DCGM Architecture.
+ * buildid : Build ID. Usually a sequential number.
+ * commit : Commit ID (Usually a git commit hash).
+ * author : Author of the commit above.
+ * branch : Branch (Usually a git branch that was used for the build).
+ * buildtype : Build Type.
+ * builddate : Date of the build.
+ * buildplatform : Platform where the build was made.
+ *

+ * Any or all keys may be absent.
+ * This values are for reference only are not supposed to participate in some complicated logic.
+ */ + char rawBuildInfoString[DCGM_MAX_STR_LENGTH * 2]; +} dcgmVersionInfo_v2; + +/** + * Version 2 of the dcgmVersionInfo_v2 + */ +#define dcgmVersionInfo_version2 MAKE_DCGM_VERSION(dcgmVersionInfo_v2, 2) + +#define dcgmVersionInfo_version dcgmVersionInfo_version2 +typedef dcgmVersionInfo_v2 dcgmVersionInfo_t; + +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif /* DCGM_STRUCTS_H */ diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_structs_internal.h b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_structs_internal.h new file mode 100644 index 0000000000..a224635be9 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_structs_internal.h @@ -0,0 +1,906 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * File: dcgm_structs_internal.h + */ + +#ifndef DCGM_STRUCTS_INTERNAL_H +#define DCGM_STRUCTS_INTERNAL_H + +/* Make sure that dcgm_structs.h is loaded first. This file depends on it */ +#include "dcgm_agent.h" +#include "dcgm_structs.h" +#include "dcgm_test_structs.h" +#include + +#ifdef INJECTION_LIBRARY_AVAILABLE +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * The following is a compile time assertion. It makes use of the + * restriction that you cannot have an array with a negative size. + * If the expression resolves to 0, then the index to the array is + * defined as -1, and a compile time error is generated. 
Note that + * all three macros are needed because of the way the preprocessor + * evaluates the directives. Also note that the line number is + * embedded in the name of the array so that the array name is unique + * and we can have multiple calls to the assert with the same msg. + * + * Usage would be like this: + * DCGM_CASSERT(DCGM_VGPU_NAME_BUFFER_SIZE == NVML_VGPU_NAME_BUFFER_SIZE, DCGM_VGPU_NAME_BUFFER_SIZE); + * + */ +#define _DCGM_CASSERT_SYMBOL_INNER(line, msg) COMPILE_TIME_ASSERT_DETECTED_AT_LINE_##line##__##msg +#define _DCGM_CASSERT_SYMBOL(line, msg) _DCGM_CASSERT_SYMBOL_INNER(line, msg) +#define DCGM_CASSERT(expression, msg) \ + __attribute__((unused)) typedef char _DCGM_CASSERT_SYMBOL(__LINE__, msg)[((expression) ? 1 : -1)] + +/** + * Max length of the DCGM string field + */ +#define DCGM_MAX_STR_LENGTH 256 + +typedef struct +{ + unsigned int gpuId; /* DCGM GPU ID */ + char uuid[DCGM_MAX_STR_LENGTH]; /* UUID String */ +} dcgmGpuInfo_t; + +/* Below is a test API simply to make sure versioning is working correctly + */ + +typedef struct +{ + // version must always be first + unsigned int version; + + unsigned int a; +} dcgmVersionTest_v1; + +typedef struct +{ + // version must always be first + unsigned int version; + + unsigned int a; + unsigned int b; +} dcgmVersionTest_v2; + +typedef dcgmVersionTest_v2 dcgmVersionTest_t; +#define dcgmVersionTest_version1 MAKE_DCGM_VERSION(dcgmVersionTest_v1, 1) +#define dcgmVersionTest_version2 MAKE_DCGM_VERSION(dcgmVersionTest_v2, 2) +#define dcgmVersionTest_version3 MAKE_DCGM_VERSION(dcgmVersionTest_v2, 3) +#define dcgmVersionTest_version dcgmVersionTest_version2 + +/** + * Represents a command to save or load a JSON file to/from the DcgmCacheManager + */ + +typedef enum dcgmStatsFileType_enum +{ + DCGM_STATS_FILE_TYPE_JSON = 0 /* JSON */ +} dcgmStatsFileType_t; + +typedef struct +{ + // version must always be first + unsigned int version; + + dcgmStatsFileType_t fileType; /* File type to save to/load from */ + 
char filename[256]; /* Filename to save to/load from */ +} dcgmCacheManagerSave_v1_t; + +#define dcgmCacheManagerSave_version1 MAKE_DCGM_VERSION(dcgmCacheManagerSave_v1_t, 1) +#define dcgmCacheManagerSave_version dcgmCacheManagerSave_version1 + +typedef dcgmCacheManagerSave_v1_t dcgmCacheManagerSave_t; + +/* Same message contents for now */ +typedef dcgmCacheManagerSave_v1_t dcgmCacheManagerLoad_v1_t; + +typedef dcgmCacheManagerLoad_v1_t dcgmCacheManagerLoad_t; + +#define dcgmCacheManagerLoad_version1 MAKE_DCGM_VERSION(dcgmCacheManagerLoad_v1_t, 1) +#define dcgmCacheManagerLoad_version dcgmCacheManagerLoad_version1 + +#define dcgmWatchFieldValue_version1 1 +#define dcgmWatchFieldValue_version dcgmWatchFieldValue_version1 + +#define dcgmUpdateAllFields_version1 1 +#define dcgmUpdateAllFields_version dcgmUpdateAllFields_version1 + +#define dcgmGetMultipleValuesForField_version1 1 +#define dcgmGetMultipleValuesForField_version dcgmGetMultipleValuesForField_version1 + +#define dcgmUnwatchFieldValue_version1 1 +#define dcgmUnwatchFieldValue_version dcgmUnwatchFieldValue_version1 + +/** + * This structure is used to represent a field value to be injected into + * the cache manager + */ +typedef dcgmFieldValue_v1 dcgmInjectFieldValue_v1; +typedef dcgmInjectFieldValue_v1 dcgmInjectFieldValue_t; +#define dcgmInjectFieldValue_version1 MAKE_DCGM_VERSION(dcgmInjectFieldValue_v1, 1) +#define dcgmInjectFieldValue_version dcgmInjectFieldValue_version1 + +#define dcgmGetMultipleValuesForFieldResponse_version1 1 +#define dcgmGetMultipleValuesForFieldResponse_version dcgmGetMultipleValuesForFieldResponse_version1 + +/* Underlying structure for the GET_MULTIPLE_LATEST_VALUES request */ +typedef struct +{ + unsigned int version; /* Set this to dcgmGetMultipleLatestValues_version1 */ + dcgmGpuGrp_t groupId; /* Entity group to retrieve values for. This is only + looked at if entitiesCount is 0 */ + unsigned int entitiesCount; /* Number of entities provided in entities[]. 
This + should only be provided if you aren't also setting + entityGroupId */ + dcgmGroupEntityPair_t entities[DCGM_GROUP_MAX_ENTITIES]; /* Entities to retrieve values for. + Only looked at if entitiesCount > 0 */ + dcgmFieldGrp_t fieldGroupId; /* Field group to retrive values for. This is onlu looked + at if fieldIdCount is 0 */ + unsigned int fieldIdCount; /* Number of field IDs in fieldIds[] that are valid. This + should only be set if fieldGroupId is not set */ + unsigned short fieldIds[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; /* Field IDs for which values should + be retrieved. only looked at if fieldIdCount is > 0 */ + unsigned int flags; /* Mask of DCGM_FV_FLAG_? #defines that affect this + request */ + +} dcgmGetMultipleLatestValues_v1, dcgmGetMultipleLatestValues_t; + +#define dcgmGetMultipleLatestValues_version1 MAKE_DCGM_VERSION(dcgmGetMultipleLatestValues_v1, 1) +#define dcgmGetMultipleLatestValues_version dcgmGetMultipleLatestValues_version1 + +/* Represents cached record metadata */ + +/* Represents a unique watcher of an entity in DCGM */ + +/* Watcher types. 
Each watcher type's watches are tracked separately within subsystems */ +typedef enum +{ + DcgmWatcherTypeClient = 0, /* Embedded or remote client via external APIs */ + DcgmWatcherTypeHostEngine = 1, /* Watcher is DcgmHostEngineHandler */ + DcgmWatcherTypeHealthWatch = 2, /* Watcher is DcgmHealthWatch */ + DcgmWatcherTypePolicyManager = 3, /* Watcher is DcgmPolicyMgr */ + DcgmWatcherTypeCacheManager = 4, /* Watcher is DcgmCacheManager */ + DcgmWatcherTypeConfigManager = 5, /* Watcher is DcgmConfigMgr */ + DcgmWatcherTypeNvSwitchManager = 6, /* Watcher is NvSwitchManager */ + + DcgmWatcherTypeCount /* Should always be last */ +} DcgmWatcherType_t; + + +/* ID of a remote client connection within the host engine */ +typedef unsigned int dcgm_connection_id_t; + +/* Special constant for not connected */ +#define DCGM_CONNECTION_ID_NONE ((dcgm_connection_id_t)0) + +/* Cache Manager Info flags */ +#define DCGM_CMI_F_WATCHED 0x00000001 /* Is this field being watched? */ + +/* This structure mirrors the DcgmWatcher object */ +typedef struct dcgm_cm_field_info_watcher_t +{ + DcgmWatcherType_t watcherType; /* Type of watcher. See DcgmWatcherType_t */ + dcgm_connection_id_t connectionId; /* Connection ID of the watcher */ + long long monitorIntervalUsec; /* How often this field should be sampled */ + long long maxAgeUsec; /* Maximum time to cache samples of this + field. If 0, the class default is used */ +} dcgm_cm_field_info_watcher_t, *dcgm_cm_field_info_watcher_p; + +/** + * Number of watchers to show for each field + */ +#define DCGM_CM_FIELD_INFO_NUM_WATCHERS 10 + +typedef struct dcgmCacheManagerFieldInfo_v4_t +{ + unsigned int version; /* Version. Check against dcgmCacheManagerInfo_version */ + unsigned int flags; /* Bitmask of DCGM_CMI_F_? 
#defines that apply to this field */ + unsigned int entityId; /* ordinal id for this entity */ + unsigned int entityGroupId; /* the type of entity, see dcgm_field_entity_group_t */ + unsigned short fieldId; /* Field ID of this field */ + short lastStatus; /* Last nvml status returned for this field when taking a sample */ + long long oldestTimestamp; /* Timestamp of the oldest record. 0=no records or single + non-time series record */ + long long newestTimestamp; /* Timestamp of the newest record. 0=no records or + single non-time series record */ + long long monitorIntervalUsec; /* How often is this field updated in usec */ + long long maxAgeUsec; /* How often is this field updated */ + long long execTimeUsec; /* Cumulative time spent updating this + field since the cache manager started */ + long long fetchCount; /* Number of times that this field has been + fetched from the driver */ + int numSamples; /* Number of samples currently cached for this field */ + int numWatchers; /* Number of watchers that are valid in watchers[] */ + dcgm_cm_field_info_watcher_t watchers[DCGM_CM_FIELD_INFO_NUM_WATCHERS]; /* Who are the first 10 + watchers of this field? */ +} dcgmCacheManagerFieldInfo_v4_t, *dcgmCacheManagerFieldInfo_v4_p; + +#define dcgmCacheManagerFieldInfo_version4 MAKE_DCGM_VERSION(dcgmCacheManagerFieldInfo_v4_t, 4) + +/** + * The maximum number of topology elements possible given DCGM_MAX_NUM_DEVICES + * calculated using arithmetic sequence formula + * (DCGM_MAX_NUM_DEVICES - 1) * (1 + (DCGM_MAX_NUM_DEVICES-2)/2) + */ +#define DCGM_TOPOLOGY_MAX_ELEMENTS 496 + +/** + * Topology element structure + */ +typedef struct +{ + unsigned int dcgmGpuA; //!< GPU A + unsigned int dcgmGpuB; //!< GPU B + unsigned int AtoBNvLinkIds; //!< bits representing the links connected from GPU A to GPU B + //!< e.g. 
if this field == 3, links 0 and 1 are connected, + //!< field is only valid if NVLINKS actually exist between GPUs + unsigned int BtoANvLinkIds; //!< bits representing the links connected from GPU B to GPU A + //!< e.g. if this field == 3, links 0 and 1 are connected, + //!< field is only valid if NVLINKS actually exist between GPUs + dcgmGpuTopologyLevel_t path; //!< path between A and B +} dcgmTopologyElement_t; + +/** + * Topology results structure + */ +typedef struct +{ + unsigned int version; //!< version number (dcgmTopology_version) + unsigned int numElements; //!< number of valid dcgmTopologyElement_t elements + + dcgmTopologyElement_t element[DCGM_TOPOLOGY_MAX_ELEMENTS]; +} dcgmTopology_v1; + +/** + * Typedef for \ref dcgmTopology_v1 + */ +typedef dcgmTopology_v1 dcgmTopology_t; + +/** + * Version 1 for \ref dcgmTopology_v1 + */ +#define dcgmTopology_version1 MAKE_DCGM_VERSION(dcgmTopology_v1, 1) + +/** + * Latest version for \ref dcgmTopology_t + */ +#define dcgmTopology_version dcgmTopology_version1 + +typedef struct +{ + unsigned int numGpus; + struct + { + unsigned int dcgmGpuId; + unsigned long bitmask[DCGM_AFFINITY_BITMASK_ARRAY_SIZE]; + } affinityMasks[DCGM_MAX_NUM_DEVICES]; +} dcgmAffinity_t; + + +typedef struct +{ + unsigned int version; //!< IN: Version number (dcgmCreateFakeEntities_version) + unsigned int numToCreate; //!< IN: Number of fake entities to create + dcgmMigHierarchyInfo_t entityList[DCGM_MAX_HIERARCHY_INFO]; //!< IN: specifies who to create and the parent +} dcgmCreateFakeEntities_v2; + +typedef dcgmCreateFakeEntities_v2 dcgmCreateFakeEntities_t; + +/** + * Version 2 for \ref dcgmCreateFakeEntities_t + */ +#define dcgmCreateFakeEntities_version2 MAKE_DCGM_VERSION(dcgmCreateFakeEntities_v2, 2) + +/** + * Latest version for \ref dcgmCreateFakeEntities_t + */ +#define dcgmCreateFakeEntities_version dcgmCreateFakeEntities_version2 + + +/* Field watch predefined groups */ +typedef enum +{ + DCGM_WATCH_PREDEF_INVALID = 0, + 
DCGM_WATCH_PREDEF_PID, /*!< PID stats */ + DCGM_WATCH_PREDEF_JOB, /*!< Job stats */ +} dcgmWatchPredefinedType_t; + +typedef struct +{ + unsigned int version; + dcgmWatchPredefinedType_t watchPredefType; /*!< Which type of predefined watch are we adding? */ + + dcgmGpuGrp_t groupId; /*!< GPU group to watch fields for */ + long long updateFreq; /*!< How often to update the fields in usec */ + double maxKeepAge; /*!< How long to keep values for the fields in seconds */ + int maxKeepSamples; /*!< Maximum number of samples we should keep at a time */ +} dcgmWatchPredefined_v1; + +typedef dcgmWatchPredefined_v1 dcgmWatchPredefined_t; + +/** + * Version 1 for \ref dcgmWatchPredefined_t + */ +#define dcgmWatchPredefined_version1 MAKE_DCGM_VERSION(dcgmWatchPredefined_v1, 1) + +/** + * Latest version for \ref dcgmWatchPredefined_t + */ +#define dcgmWatchPredefined_version dcgmWatchPredefined_version1 + +/** + * Request to set a NvLink link state for an entity + */ +typedef struct +{ + unsigned int version; /*!< Version. Should be dcgmSetNvLinkLinkState_version1 */ + dcgm_field_entity_group_t entityGroupId; /*!< Entity group of the entity to set the link state of */ + dcgm_field_eid_t entityId; /*!< ID of the entity to set the link state of */ + unsigned int linkId; /*!< Link (or portId) of the link to set the state of */ + dcgmNvLinkLinkState_t linkState; /*!< State to set the link to */ + unsigned int unused; /*!< Not used for now. Set to 0 */ +} dcgmSetNvLinkLinkState_v1; + +#define dcgmSetNvLinkLinkState_version1 MAKE_DCGM_VERSION(dcgmSetNvLinkLinkState_v1, 1) + + +/** + * Request to add a module ID to the denylist + */ +typedef struct +{ + unsigned int version; /*!< Version. 
Should be dcgmModuleDenylist_version */ + dcgmModuleId_t moduleId; /*!< Module to add to the denylist */ +} dcgmModuleDenylist_v1; + +#define dcgmModuleDenylist_version1 MAKE_DCGM_VERSION(dcgmModuleDenylist_v1, 1) + + +/** + * Counter to use for NvLink + */ +#define DCGMCM_NVLINK_COUNTER_BYTES 0 + +/** + * The Brand of the GPU. These are 1:1 with NVML_BRAND_*. There's a DCGM_CASSERT() below that tests that + */ +typedef enum dcgmGpuBrandType_enum +{ + DCGM_GPU_BRAND_UNKNOWN = 0, + DCGM_GPU_BRAND_QUADRO = 1, + DCGM_GPU_BRAND_TESLA = 2, + DCGM_GPU_BRAND_NVS = 3, + DCGM_GPU_BRAND_GRID = 4, + DCGM_GPU_BRAND_GEFORCE = 5, + DCGM_GPU_BRAND_TITAN = 6, + /* The following are new as of r460 TRD2's nvml.h */ + DCGM_BRAND_NVIDIA_VAPPS = 7, // NVIDIA Virtual Applications + DCGM_BRAND_NVIDIA_VPC = 8, // NVIDIA Virtual PC + DCGM_BRAND_NVIDIA_VCS = 9, // NVIDIA Virtual Compute Server + DCGM_BRAND_NVIDIA_VWS = 10, // NVIDIA RTX Virtual Workstation + DCGM_BRAND_NVIDIA_VGAMING = 11, // NVIDIA vGaming + DCGM_BRAND_QUADRO_RTX = 12, + DCGM_BRAND_NVIDIA_RTX = 13, + DCGM_BRAND_NVIDIA = 14, + DCGM_BRAND_GEFORCE_RTX = 15, + DCGM_BRAND_TITAN_RTX = 16, + // Keep this last + DCGM_GPU_BRAND_COUNT +} dcgmGpuBrandType_t; + +/*****************************************************************************/ +typedef enum dcgmEntityStatusType_enum +{ + DcgmEntityStatusUnknown = 0, /* Entity has not been referenced yet */ + DcgmEntityStatusOk, /* Entity is known and OK */ + DcgmEntityStatusUnsupported, /* Entity is unsupported by DCGM */ + DcgmEntityStatusInaccessible, /* Entity is inaccessible, usually due to cgroups */ + DcgmEntityStatusLost, /* Entity has been lost. 
Usually set from NVML + returning NVML_ERROR_GPU_IS_LOST */ + DcgmEntityStatusFake, /* Entity is a fake, injection-only entity for testing */ + DcgmEntityStatusDisabled, /* Don't collect values from this GPU */ + DcgmEntityStatusDetached /* Entity is detached, not good for any uses */ +} DcgmEntityStatus_t; + +/** + * Making these internal so that client apps must be explicit with struct versions. + */ + +/** + * Typedef for \ref dcgmRunDiag_t + */ +typedef dcgmRunDiag_v7 dcgmRunDiag_t; + +/** + * Latest version for \ref dcgmRunDiag_t + */ +#define dcgmRunDiag_version dcgmRunDiag_version7 + +/** + * Version 1 of dcgmCreateGroup_t + */ + +typedef struct +{ + dcgmGroupType_t groupType; //!< Type of group to create + char groupName[1024]; //!< Name to give new group + dcgmGpuGrp_t newGroupId; //!< On success, the ID of the newly created group + dcgmReturn_t cmdRet; //!< Error code generated when creating new group +} dcgmCreateGroup_v1; + +/** + * Version 1 of dcgmRemoveEntity_t + */ + +typedef struct +{ + unsigned int groupId; //!< IN: Group id from which entity should be removed + unsigned int entityGroupId; //!< IN: Entity group that entity belongs to + unsigned int entityId; //!< IN: Entity id to remove + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmAddRemoveEntity_v1; + +/** + * Version 1 of dcgmGroupDestroy_t + */ + +typedef struct +{ + unsigned int groupId; //!< IN: Group to remove + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGroupDestroy_v1; + +/** + * Version 1 of dcgmGetEntityGroupEntities_t + */ + +typedef struct +{ + unsigned int entityGroup; //!< IN: Entity of group to list entities + unsigned int entities[DCGM_GROUP_MAX_ENTITIES]; //!< OUT: Array of entities for entityGroup + unsigned int numEntities; //!< IN/OUT: Upon calling, this should be the number of + // entities that entityList[] can hold. Upon + // return, this will contain the number of + // entities actually saved to entityList. 
+ unsigned int flags; //!< IN: Flags to modify the behavior of this request. + // See DCGM_GEGE_FLAG_* + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetEntityGroupEntities_v1; + +/** + * Version 1 of dcgmGroupGetAllIds_t + */ + +typedef struct +{ + unsigned int groupIds[DCGM_MAX_NUM_GROUPS]; //!< OUT: List of group ids + unsigned int numGroups; //!< OUT: Number of group ids in the list + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGroupGetAllIds_v1; + +/** + * Version 1 of dcgmGroupGetInfo_t + */ + +typedef struct +{ + unsigned int groupId; //!< IN: Group ID for which information to be fetched + dcgmGroupInfo_t groupInfo; //!< OUT: Group Information + long long timestamp; //!< OUT: Timestamp of information + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGroupGetInfo_v1; + +#define SAMPLES_BUFFER_SIZE_V1 16384 + +/** + * Version 1 of dcgmEntitiesGetLatestValues_t + */ +typedef struct +{ + unsigned int groupId; //!< IN: Optional group id for information to be fetched + dcgmGroupEntityPair_t entities[DCGM_GROUP_MAX_ENTITIES]; //!< IN: List of entities to get values for + unsigned int entitiesCount; //!< IN: Number of entries in entities[] + unsigned int fieldGroupId; //!< IN: Optional fieldGroupId that will be resolved by the host engine. + //!< This is ignored if fieldIdList[] is provided + unsigned short fieldIdList[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; //!< IN: Field IDs to return data for + unsigned int fieldIdCount; //!< IN: Number of field IDs in fieldIdList[] array. + unsigned int flags; //!< IN: Optional flags that affect how this request is processed. 
+ unsigned int cmdRet; //!< OUT: Error code generated + unsigned int bufferSize; //!< OUT: Length of populated buffer + char buffer[SAMPLES_BUFFER_SIZE_V1]; //!< OUT: this field is last, and can be truncated for speed */ +} dcgmEntitiesGetLatestValues_v1; + +#define SAMPLES_BUFFER_SIZE_V2 4186112 // 4MB - 8k for header + +/** + * Version 2 of dcgmEntitiesGetLatestValues_t + */ +typedef struct +{ + unsigned int groupId; //!< IN: Optional group id for information to be fetched + dcgmGroupEntityPair_t entities[DCGM_GROUP_MAX_ENTITIES]; //!< IN: List of entities to get values for + unsigned int entitiesCount; //!< IN: Number of entries in entities[] + unsigned int fieldGroupId; //!< IN: Optional fieldGroupId that will be resolved by the host engine. + //!< This is ignored if fieldIdList[] is provided + unsigned short fieldIdList[DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; //!< IN: Field IDs to return data for + unsigned int fieldIdCount; //!< IN: Number of field IDs in fieldIdList[] array. + unsigned int flags; //!< IN: Optional flags that affect how this request is processed. 
+ unsigned int cmdRet; //!< OUT: Error code generated + unsigned int bufferSize; //!< OUT: Length of populated buffer + char buffer[SAMPLES_BUFFER_SIZE_V2]; //!< OUT: this field is last, and can be truncated for speed */ +} dcgmEntitiesGetLatestValues_v2; + +/** + * Version 1 of dcgmGetMultipleValuesForField + */ +typedef struct +{ + unsigned int entityGroupId; //!< IN: Optional group id for information to be fetched + unsigned int entityId; //!< IN: Optional entity id for information to be fetched + unsigned int fieldId; //!< IN: Field id to fetch + long long startTs; //!< IN: Starting timestamp + long long endTs; //!< IN: End timestamp + unsigned int order; //!< IN: Order for output data, see dcgmOrder_t + unsigned int count; //!< IN: Number of values to retrieve (may be limited by size of buffer) + unsigned int cmdRet; //!< OUT: Error code generated + unsigned int bufferSize; //!< OUT: Length of populated buffer + char buffer[SAMPLES_BUFFER_SIZE_V1]; //!< OUT:: this field is last, and can be truncated for speed */ +} dcgmGetMultipleValuesForField_v1; + +/** + * Version 2 of dcgmGetMultipleValuesForField + */ +typedef struct +{ + unsigned int entityGroupId; //!< IN: Optional group id for information to be fetched + unsigned int entityId; //!< IN: Optional entity id for information to be fetched + unsigned int fieldId; //!< IN: Field id to fetch + long long startTs; //!< IN: Starting timestamp + long long endTs; //!< IN: End timestamp + unsigned int order; //!< IN: Order for output data, see dcgmOrder_t + unsigned int count; //!< IN: Number of values to retrieve (may be limited by size of buffer) + unsigned int cmdRet; //!< OUT: Error code generated + unsigned int bufferSize; //!< OUT: Length of populated buffer + char buffer[SAMPLES_BUFFER_SIZE_V2]; //!< OUT:: this field is last, and can be truncated for speed */ +} dcgmGetMultipleValuesForField_v2; + +/** + * Version 1 of dcgmJobCmd_t + */ + +typedef struct +{ + unsigned int groupId; //!< IN: optional group id + 
char jobId[64]; //!< IN: job id + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmJobCmd_v1; + +/** + * Version 1 of dcgmJobGetStats_t + */ + +typedef struct +{ + char jobId[64]; //!< IN: job id + dcgmJobInfo_t jobStats; //!< OUT: job stats + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmJobGetStats_v1; + +/** + * Version 1 of dcgmWatchFieldValue_t (DCGM 2.x) + */ +typedef struct +{ + int gpuId; //!< IN: GPU ID to watch field on + unsigned int entityGroupId; //!< IN: Optional entity group id + unsigned short fieldId; //!< IN: Field ID to watch + long long updateFreq; //!< IN: How often to update this field in usec + double maxKeepAge; //!< IN: How long to keep data for this field in seconds + int maxKeepSamples; //!< IN: Maximum number of samples to keep. 0=no limit + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmWatchFieldValue_v1; + +/** + * Version 2 of dcgmWatchFieldValue_t (DCGM 3.x+) + */ +typedef struct +{ + unsigned int entityId; //!< IN: entityId (gpuId for GPUs) to watch field on + unsigned int entityGroupId; //!< IN: Optional entity group id + unsigned short fieldId; //!< IN: Field ID to watch + unsigned char unused[6]; //!< IN: Unused. Aligns next member to 8-byte boundary + long long updateFreq; //!< IN: How often to update this field in usec + double maxKeepAge; //!< IN: How long to keep data for this field in seconds + int maxKeepSamples; //!< IN: Maximum number of samples to keep. 0=no limit + int updateOnFirstWatcher; //!< IN: Should we do an UpdateAllFields() automatically if we are the first watcher? + //!< 1=yes. 0=no. + int wereFirstWatcher; //!< OUT: Returns 1 if we were the first watcher. 0 if not */ + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmWatchFieldValue_v2; + +/** + * Version 1 of dcgmUpdateAllFields_v1 + */ +typedef struct +{ + int waitForUpdate; //!< IN: Whether or not to wait for the update loop to complete before returning to the + // caller 1=wait. 0=do not wait. 
+ unsigned int cmdRet; //!< OUT: Error code generated +} dcgmUpdateAllFields_v1; + +/** + * Version 1 of dcgmUnwatchFieldValue_t + */ +typedef struct +{ + int gpuId; //!< IN: GPU ID to watch field on + unsigned int entityGroupId; //!< IN: Optional entity group id + unsigned short fieldId; //!< IN: Field id to unwatch + int clearCache; //!< IN: Whether or not to clear all cached data for + // the field after the watch is removed + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmUnwatchFieldValue_v1; + +/** + * Version 1 of dcgmInjectFieldValue_t + */ +typedef struct +{ + unsigned int entityGroupId; //!< IN: entity group id + unsigned int entityId; //!< IN: entity id + dcgmFieldValue_v1 fieldValue; //!< IN: field value to insert + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmInjectFieldValueMsg_v1; + +#define dcgmInjectFieldValueMsg_version1 MAKE_DCGM_VERSION(dcgmInjectFieldValueMsg_v1, 1) +#define dcgmInjectFieldValueMsg_version dcgmInjectFieldValueMsg_version1 +typedef dcgmInjectFieldValueMsg_v1 dcgmInjectFieldValueMsg_t; + +/** + * Version 2 of dcgmGetCacheManagerFieldInfo_t + */ +typedef struct +{ + dcgmCacheManagerFieldInfo_v4_t + fieldInfo; //!< IN/OUT: Structure to populate. fieldInfo->gpuId and fieldInfo->fieldId must + // be populated on calling for this call to work + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetCacheManagerFieldInfo_v2; + +typedef struct +{ + unsigned int groupId; //!< IN: Group ID representing collection of one or more entities + unsigned int fieldGroupId; //!< IN: Fields to watch. + long long updateFreq; //!< IN: How often to update this field in usec + double maxKeepAge; //!< IN: How long to keep data for this field in seconds + int maxKeepSamples; //!< IN: Maximum number of samples to keep. 
0=no limit + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmWatchFields_v1; + +#define dcgmWatchFields_version1 1 +#define dcgmWatchFields_version dcgmWatchFields_version1 + +typedef struct +{ + unsigned int groupId; //!< IN: Group ID representing collection of one or more entities + dcgmTopology_t topology; //!< OUT: populated struct + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetTopologyMsg_v1; + +typedef struct +{ + unsigned int groupId; //!< IN: Group ID representing collection of one or more entities + dcgmAffinity_t affinity; //!< OUT: populated struct + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetTopologyAffinityMsg_v1; + +typedef struct +{ + uint64_t inputGpus; //!< IN: bitmask of available gpus + uint32_t numGpus; //!< IN: number of gpus needed + uint64_t flags; //!< IN: Hints to ignore certain factors for the scheduling hint + uint64_t outputGpus; //!< OUT: bitmask of selected gpus + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmSelectGpusByTopologyMsg_v1; + +typedef struct +{ + int supported; //!< IN: boolean to ONLY include Ids of supported GPUs + unsigned int devices[DCGM_MAX_NUM_DEVICES]; //!< OUT: GPU Ids present on the system. 
+ int count; //!< OUT: Number of devices returned in "devices" + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetAllDevicesMsg_v1; + +typedef struct +{ + int persistAfterDisconnect; //!< IN: boolean whether to persist groups, etc after client is disconnected + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmClientLogin_v1; + +typedef struct +{ + dcgmFieldGroupInfo_t fg; //!< IN/OUT: field group info populated on success + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmFieldGroupOp_v1; + +typedef struct +{ + unsigned int groupId; //!< IN: group id for query + dcgmPidInfo_t pidInfo; //!< IN/OUT: pid info populated on success + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmPidGetInfo_v1; + +typedef struct +{ + dcgmFieldSummaryRequest_t fsr; //!< IN/OUT: field summary populated on success + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetFieldSummary_v1; + +typedef struct +{ + dcgmNvLinkStatus_v3 ls; //!< IN/OUT: nvlink status populated on success + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetNvLinkStatus_v2; + +typedef struct +{ + dcgmCreateFakeEntities_v2 fe; //!< IN/OUT: fake entity info, populated on success + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgCreateFakeEntities_v1; + +typedef struct +{ + dcgmWatchPredefined_t wpf; //!< IN: watch info + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmWatchPredefinedFields_v1; + +typedef struct +{ + unsigned int moduleId; //!< IN: Module to add to the denylist + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgModuleDenylist_v1; + +typedef struct +{ + dcgmModuleGetStatuses_t st; //!< IN/OUT: module status + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgModuleStatus_v1; + +typedef struct +{ + unsigned int overallHealth; //!< IN/OUT: hostengine health + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgHostEngineHealth_v1; + +typedef struct +{ + dcgmAllFieldGroup_t fg; //!< 
IN/OUT: hostengine health + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmGetAllFieldGroup_v1; + +typedef struct +{ + dcgmMigHierarchy_v2 data; //!< OUT: populated on success + + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgGetGpuInstanceHierarchy_v1; + +typedef struct +{ + unsigned int index; //!< IN: the index of the GPU to create + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgNvmlCreateInjectionGpu_v1; + +#ifdef INJECTION_LIBRARY_AVAILABLE +#define DCGM_MAX_EXTRA_KEYS 4 +typedef struct +{ + unsigned int gpuId; //!< IN: the DCGM gpu id of the device being injected + char key[DCGM_MAX_STR_LENGTH]; //!< IN: The key for the NVML injected value + injectNvmlVal_t extraKeys[DCGM_MAX_EXTRA_KEYS]; //!< IN: extra keys, optional + unsigned int extraKeyCount; //!< IN: the number of extra keys + injectNvmlVal_t value; //!< IN: the NVML value being injected + unsigned int cmdRet; //!< OUT: Error code generated +} dcgmMsgNvmlInjectDevice_v1; +#endif + +/** + * Verify that DCGM definitions that are copies of NVML ones match up with their NVML counterparts + */ +DCGM_CASSERT(DCGM_VGPU_NAME_BUFFER_SIZE == NVML_VGPU_NAME_BUFFER_SIZE, NVML_VGPU_NAME_BUFFER_SIZE); +DCGM_CASSERT(DCGM_GRID_LICENSE_BUFFER_SIZE == NVML_GRID_LICENSE_BUFFER_SIZE, NVML_GRID_LICENSE_BUFFER_SIZE); +DCGM_CASSERT(DCGM_DEVICE_UUID_BUFFER_SIZE == NVML_DEVICE_UUID_BUFFER_SIZE, NVML_DEVICE_UUID_BUFFER_SIZE); +DCGM_CASSERT(DCGM_NVLINK_MAX_LINKS_PER_GPU == NVML_NVLINK_MAX_LINKS, NVML_NVLINK_MAX_LINKS); +DCGM_CASSERT((int)DCGM_GPU_BRAND_COUNT == (int)NVML_BRAND_COUNT, NVML_BRAND_COUNT); + +DCGM_CASSERT((int)DCGM_GPU_VIRTUALIZATION_MODE_NONE == (int)NVML_GPU_VIRTUALIZATION_MODE_NONE, + NVML_GPU_VIRTUALIZATION_MODE_NONE); +DCGM_CASSERT((int)DCGM_GPU_VIRTUALIZATION_MODE_PASSTHROUGH == (int)NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH, + NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH); +DCGM_CASSERT((int)DCGM_GPU_VIRTUALIZATION_MODE_VGPU == (int)NVML_GPU_VIRTUALIZATION_MODE_VGPU, + 
NVML_GPU_VIRTUALIZATION_MODE_VGPU); +DCGM_CASSERT((int)DCGM_GPU_VIRTUALIZATION_MODE_HOST_VGPU == (int)NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU, + NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU); +DCGM_CASSERT((int)DCGM_GPU_VIRTUALIZATION_MODE_HOST_VSGA == (int)NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA, + NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA); + +/** + * Verify correct version of APIs that use a versioned structure + */ + +DCGM_CASSERT(dcgmPidInfo_version == (long)0x02004528, 1); +DCGM_CASSERT(dcgmConfig_version == (long)16777256, 1); +DCGM_CASSERT(dcgmConnectV2Params_version1 == (long)16777224, 1); +DCGM_CASSERT(dcgmConnectV2Params_version == (long)0x02000010, 1); +DCGM_CASSERT(dcgmCpuHierarchyOwnedCores_version1 == (long)0x1000088, 1); +DCGM_CASSERT(dcgmCpuHierarchy_version1 == (long)0x1000488, 1); +DCGM_CASSERT(dcgmFieldGroupInfo_version == (long)16777744, 1); +DCGM_CASSERT(dcgmAllFieldGroup_version == (long)16811016, 1); +DCGM_CASSERT(dcgmDeviceAttributes_version3 == (long)0x3001464, 1); +DCGM_CASSERT(dcgmDeviceAttributes_version == (long)0x3001464, 1); +DCGM_CASSERT(dcgmHealthResponse_version4 == (long)0x0401050C, 1); +DCGM_CASSERT(dcgmIntrospectMemory_version == (long)16777232, 1); +DCGM_CASSERT(dcgmIntrospectCpuUtil_version == (long)16777248, 1); +DCGM_CASSERT(dcgmJobInfo_version == (long)0x030098A8, 1); +DCGM_CASSERT(dcgmPolicy_version == (long)16777360, 1); +DCGM_CASSERT(dcgmPolicyCallbackResponse_version == (long)16777240, 1); +DCGM_CASSERT(dcgmDiagResponse_version7 == (long)0x07099290, 1); +DCGM_CASSERT(dcgmDiagResponse_version8 == (long)0x80d9690, 8); +DCGM_CASSERT(dcgmDiagResponse_version9 == (long)0x914f4dc, 9); +DCGM_CASSERT(dcgmDiagResponse_version == (long)0x914f4dc, 9); +DCGM_CASSERT(dcgmRunDiag_version7 == (long)0x70054D0, 1); +DCGM_CASSERT(dcgmVgpuDeviceAttributes_version6 == (long)16787744, 1); +DCGM_CASSERT(dcgmVgpuDeviceAttributes_version7 == (long)117451168, 1); +DCGM_CASSERT(dcgmVgpuDeviceAttributes_version == (long)117451168, 1); 
+DCGM_CASSERT(dcgmVgpuInstanceAttributes_version == (long)16777556, 1); +DCGM_CASSERT(dcgmVgpuConfig_version == (long)16777256, 1); +DCGM_CASSERT(dcgmModuleGetStatuses_version == (long)0x01000088, 1); +DCGM_CASSERT(dcgmModuleDenylist_version1 == (long)0x01000008, 1); +DCGM_CASSERT(dcgmSettingsSetLoggingSeverity_version1 == (long)0x01000008, 1); +DCGM_CASSERT(dcgmVersionInfo_version == (long)0x2000204, 1); +DCGM_CASSERT(dcgmStartEmbeddedV2Params_version1 == (long)0x01000048, 1); +DCGM_CASSERT(dcgmStartEmbeddedV2Params_version2 == (long)0x02000050, 2); +DCGM_CASSERT(dcgmInjectFieldValue_version1 == (long)0x1001018, 1); +DCGM_CASSERT(dcgmInjectFieldValue_version == (long)0x1001018, 1); +DCGM_CASSERT(dcgmNvLinkStatus_version3 == (long)0x30015bc, 3); + +#ifndef DCGM_ARRAY_CAPACITY +#ifdef __cplusplus +#define DCGM_ARRAY_CAPACITY(a) std::extent::value +static_assert(NVML_COMPUTE_INSTANCE_PROFILE_COUNT == 0x08); +static_assert(NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 == 0x09); +#endif +#endif + +#ifndef DCGM_ARRAY_CAPACITY +#define DCGM_ARRAY_CAPACITY(a) (sizeof(a) / sizeof(a[0])) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* DCGM_STRUCTS_H */ diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_test_apis.h b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_test_apis.h new file mode 100644 index 0000000000..6b8bf78a8d --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_test_apis.h @@ -0,0 +1,451 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * File: dcgm_test_apis.h + */ + +#ifndef DCGM_AGENT_INTERNAL_H +#define DCGM_AGENT_INTERNAL_H + +#include "dcgm_api_export.h" +#include "dcgm_structs.h" +#include "dcgm_structs_internal.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/***************************************************************************** + *****************************************************************************/ +/***************************************************************************** + * DCGM Test Methods, only used for testing, not officially supported + *****************************************************************************/ +/***************************************************************************** + *****************************************************************************/ + +#define DCGM_EMBEDDED_HANDLE 0x7fffffff + +/** + * This method starts the Host Engine Server + * + * @param portNumber IN: TCP port to listen on. This is only used if isTcp == 1. + * @param socketPath IN: This is the path passed to bind() when creating the socket + * For isConnectionTCP == 1, this is the bind address. "" or NULL = All interfaces + * For isConnectionTCP == 0, this is the path to the domain socket to use + * @param isConnectionTCP IN: Whether to listen on a TCP/IP socket (1) or a unix domain socket (0) + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmEngineRun(unsigned short portNumber, + char const *socketPath, + unsigned int isConnectionTCP); + +/** + * This method is used to get values corresponding to the fields. + * @return + * - \ref DCGM_ST_SUCCESS On Success. Even when the API returns success, check for + * individual status inside each field. + * Look at values[index].status. The field values will be + * populated only when status in each field is DCGM_ST_SUCCESS + * - DCGM_ST_? 
In case of error + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetLatestValuesForFields(dcgmHandle_t pDcgmHandle, + int gpuId, + unsigned short fieldIds[], + unsigned int count, + dcgmFieldValue_v1 values[]); + +/** + * This method is used to get multiple values for a single field + * + * @return + * - \ref DCGM_ST_SUCCESS on success. + * - DCGM_ST_? error code on failure + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetMultipleValuesForField(dcgmHandle_t pDcgmHandle, + int gpuId, + unsigned short fieldId, + int *count, + long long startTs, + long long endTs, + dcgmOrder_t order, + dcgmFieldValue_v1 values[]); + +/** + * Request updates for all field values that have updated since a given timestamp + * + * @param groupId IN: Group ID representing a collection of one or more GPUs + * Refer to \ref dcgmEngineGroupCreate for details on creating a group + * @param sinceTimestamp IN: Timestamp to request values since in usec since 1970. This will + * be returned in nextSinceTimestamp for subsequent calls + * 0 = request all data + * @param fieldIds IN: Fields to return data for + * @param numFieldIds IN: Number of entries in fieldIds + * @param nextSinceTimestamp OUT: Timestamp to use for sinceTimestamp on next call to this function + * @param enumCB IN: Callback to invoke for every field value update. Note that + * multiple updates can be returned in each invocation + * @param userData IN: User data pointer to pass to the userData field of enumCB. 
+ */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetFieldValuesSince(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + long long sinceTimestamp, + unsigned short *fieldIds, + int numFieldIds, + long long *nextSinceTimestamp, + dcgmFieldValueEnumeration_f enumCB, + void *userData); + +/** + * This method is used to tell the cache manager to watch a field value + * + * @param gpuId GPU ID to watch field on + * @param fieldId Field ID to watch + * @param updateFreq How often to update this field in usec + * @param maxKeepAge How long to keep data for this field in seconds + * @param maxKeepSamples Maximum number of samples to keep. 0=no limit + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if \a gpuId, \a fieldId, \a updateFreq, \a maxKeepAge, + * or \a maxKeepSamples are invalid + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmWatchFieldValue(dcgmHandle_t pDcgmHandle, + int gpuId, + unsigned short fieldId, + long long updateFreq, + double maxKeepAge, + int maxKeepSamples); + +/** + * This method is used to tell the cache manager to unwatch a field value + * + * @param gpuId GPU ID to watch field on + * @param fieldId Field ID to watch + * @param clearCache Whether or not to clear all cached data for + * the field after the watch is removed + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if \a gpuId, \a fieldId, or \a clearCache is invalid + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmUnwatchFieldValue(dcgmHandle_t pDcgmHandle, + int gpuId, + unsigned short fieldId, + int clearCache); + +/*************************************************************************/ +/** + * Used to set vGPU configuration for the group of one or more GPUs identified by \a groupId. + * + * The configuration settings specified in \a pDeviceConfig are applied to all the GPUs in the + * group. 
Since DCGM groups are a logical grouping of GPUs, the configuration settings Set for a group + * stay intact for the individual GPUs even after the group is destroyed. + * + * If the user wishes to ignore the configuration of one or more properties in the input + * \a pDeviceConfig then the property should be specified as one of \a DCGM_INT32_BLANK, + * \a DCGM_INT64_BLANK, \a DCGM_FP64_BLANK or \a DCGM_STR_BLANK based on the data type of the + * property to be ignored. + * + * If any of the properties fail to be configured for any of the GPUs in the group then the API + * returns an error. The status handle \a statusHandle should be further evaluated to access error + * attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST + * to access the error attributes. + * + * @param pDcgmHandle IN: DCGM Handle + * + * @param groupId IN: Group ID representing collection of one or more GPUs. Look + * at \ref dcgmGroupCreate for details on creating the group. + * applied for all the GPU in the group represented by + * \a groupId. The caller must populate the version field of + * \a pDeviceConfig. + * @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as + * NULL if the detailed error information is not needed. + * Look at \ref dcgmStatusCreate for details on creating + * status handle. + * + * @return + * - \ref DCGM_ST_OK if the configuration has been successfully set. + * - \ref DCGM_ST_BADPARAM if any of \a groupId or \a pDeviceConfig is invalid. + * - \ref DCGM_ST_VER_MISMATCH if \a pDeviceConfig has the incorrect version. + * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. 
+ */ +dcgmReturn_t DCGM_PUBLIC_API dcgmVgpuConfigSet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmVgpuConfig_t *pDeviceConfig, + dcgmStatus_t statusHandle); + +/*************************************************************************/ +/** + * Used to get vGPU configuration for all the GPUs present in the group. + * + * This API can get the most recent target or desired configuration set by \ref dcgmConfigSet. + * Set type as \a DCGM_CONFIG_TARGET_STATE to get target configuration. The target configuration + * properties are maintained by DCGM and are automatically enforced after a GPU reset or + * reinitialization is completed. + * + * The method can also be used to get the actual configuration state for the GPUs in the group. + * Set type as \a DCGM_CONFIG_CURRENT_STATE to get the actually configuration state. Ideally, the + * actual configuration state will be exact same as the target configuration state. + * + * If any of the property in the target configuration is unknown then the property value in the + * output is populated as one of DCGM_INT32_BLANK, DCGM_INT64_BLANK, DCGM_FP64_BLANK or + * DCGM_STR_BLANK based on the data type of the property. + * + * If any of the property in the current configuration state is not supported then the property + * value in the output is populated as one of DCGM_INT32_NOT_SUPPORTED, DCGM_INT64_NOT_SUPPORTED, + * DCGM_FP64_NOT_SUPPORTED or DCGM_STR_NOT_SUPPORTED based on the data type of the property. + * + * If any of the properties can't be fetched for any of the GPUs in the group then the API returns + * an error. The status handle \a statusHandle should be further evaluated to access error + * attributes for the failed operations. Please refer to status management APIs at \ref DCGMAPI_ST + * to access the error attributes. + * + * @param pDcgmHandle IN: DCGM Handle + * @param groupId IN: Group ID representing collection of one or more GPUs. Look + * at \ref dcgmGroupCreate for details on creating the group. 
+ * @param type IN: Type of configuration values to be fetched. + * @param count IN: The number of entries that \a deviceConfigList array can store. + * @param deviceConfigList OUT: Pointer to memory to hold requested configuration + * corresponding to all the GPUs in the group (\a groupId). The + * size of the memory must be greater than or equal to hold + * output information for the number of GPUs present in the + * group (\a groupId). + * @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as + * NULL if the detailed error information is not needed. + * Look at \ref dcgmStatusCreate for details on creating + * status handle. + * + * @return + * - \ref DCGM_ST_OK if the configuration has been successfully fetched. + * - \ref DCGM_ST_BADPARAM if any of \a groupId, \a type, \a count, + * or \a deviceConfigList is invalid. + * - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. + * - \ref DCGM_ST_VER_MISMATCH if \a deviceConfigList has the incorrect version. + * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmVgpuConfigGet(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmConfigType_t type, + int count, + dcgmVgpuConfig_t deviceConfigList[], + dcgmStatus_t statusHandle); + +/*************************************************************************/ +/** + * Used to enforce previously set vGPU configuration for all the GPUs present in the group. + * + * This API provides a mechanism to the users to manually enforce the configuration at any point of + * time. The configuration can only be enforced if it's already configured using the API \ref + * dcgmConfigSet. + * + * If any of the properties can't be enforced for any of the GPUs in the group then the API returns + * an error. The status handle \a statusHandle should be further evaluated to access error + * attributes for the failed operations. 
Please refer to status management APIs at \ref DCGMAPI_ST + * to access the error attributes. + * + * @param pDcgmHandle IN: DCGM Handle + * + * @param groupId IN: Group ID representing collection of one or more GPUs. Look at + * \ref dcgmGroupCreate for details on creating the group. + * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS + * to perform operation on all the GPUs. + * @param statusHandle IN/OUT: Resulting error status for multiple operations. Pass it as + * NULL if the detailed error information is not needed. + * Look at \ref dcgmStatusCreate for details on creating + * status handle. + * @return + * - \ref DCGM_ST_OK if the configuration has been successfully enforced. + * - \ref DCGM_ST_BADPARAM if \a groupId is invalid. + * - \ref DCGM_ST_NOT_CONFIGURED if the target configuration is not already set. + * - \ref DCGM_ST_GENERIC_ERROR if an unknown error has occurred. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmVgpuConfigEnforce(dcgmHandle_t pDcgmHandle, + dcgmGpuGrp_t groupId, + dcgmStatus_t statusHandle); + +/*************************************************************************/ +/** + * Gets vGPU device attributes corresponding to the \a gpuId. If operation is not successful for any of + * the requested fields then the field is populated with one of DCGM_BLANK_VALUES defined in + * dcgm_structs.h. + * + * @param pDcgmHandle IN: DCGM Handle + * @param gpuId IN: GPU Id corresponding to which the attributes + * should be fetched + * @param pDcgmVgpuAttr IN/OUT: vGPU Device attributes corresponding to \a gpuId.
+ * .version should be set to \ref dcgmVgpuDeviceAttributes_version before this call. + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_VER_MISMATCH if version is not set or is invalid. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetVgpuDeviceAttributes(dcgmHandle_t pDcgmHandle, + unsigned int gpuId, + dcgmVgpuDeviceAttributes_t *pDcgmVgpuAttr); + +/*************************************************************************/ +/** + * Gets vGPU attributes corresponding to the \a vgpuId. If operation is not successful for any of + * the requested fields then the field is populated with one of DCGM_BLANK_VALUES defined in + * dcgm_structs.h. + * + * @param pDcgmHandle IN: DCGM Handle + * @param vgpuId IN: vGPU Id corresponding to which the attributes should be fetched + * @param pDcgmVgpuInstanceAttr IN/OUT: vGPU attributes corresponding to \a vgpuId.
.version should be set to + * \ref dcgmVgpuInstanceAttributes_version before this call. + * + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_VER_MISMATCH if .version is not set or is invalid. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetVgpuInstanceAttributes(dcgmHandle_t pDcgmHandle, + unsigned int vgpuId, + dcgmVgpuInstanceAttributes_t *pDcgmVgpuInstanceAttr); + +/** + * Stop a diagnostic if there is one currently running. + * + * @param pDcgmHandle IN: DCGM Handle + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a provided parameter is invalid or missing + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmStopDiagnostic(dcgmHandle_t pDcgmHandle); + +/** + * This method injects a sample into the cache manager + * + * @param gpuId + * @param dcgmInjectFieldValue + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmInjectFieldValue(dcgmHandle_t pDcgmHandle, + unsigned int gpuId, + dcgmInjectFieldValue_t *dcgmInjectFieldValue); + +/** + * This method retries the state of a field within the cache manager + * + * @param fieldInfo Structure to populate. 
fieldInfo->gpuId and fieldInfo->fieldId must + * be populated on calling for this call to work + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetCacheManagerFieldInfo(dcgmHandle_t pDcgmHandle, + dcgmCacheManagerFieldInfo_v4_t *fieldInfo); + +/** + * This method returns the status of the gpu + * + * @param gpuId + * @param DcgmEntityStatus_t + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmGetGpuStatus(dcgmHandle_t pDcgmHandle, unsigned int gpuId, DcgmEntityStatus_t *status); + +/** + * Create fake entities for injection testing + * + * @param createFakeEntities Details about the number and type of entities to create + * + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmCreateFakeEntities(dcgmHandle_t pDcgmHandle, + dcgmCreateFakeEntities_t *createFakeEntities); + +/** + * This method injects a sample into the cache manager + * + * @param entityGroupId + * @param entityId + * @param dcgmInjectFieldValue + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmEntityInjectFieldValue(dcgmHandle_t pDcgmHandle, + dcgm_field_entity_group_t entityGroupId, + dcgm_field_eid_t entityId, + dcgmInjectFieldValue_t *dcgmInjectFieldValue); + +/** + * This method sets the link state of an entity's NvLink + * + * dcgmHandle_t dcgmHandle + * linkState contains details about the link state to set + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmSetEntityNvLinkLinkState(dcgmHandle_t dcgmHandle, + dcgmSetNvLinkLinkState_v1 *linkState); + +/** + * Creates a MIG entity with according to the specifications in the struct + * + * @param dcgmHandle IN: DCGM Handle + * @param cme IN: struct stating which kind of entity is being created, who the parent entity is, flags + * for processing it, and the profile to specify what size of that entity to create. + * @return + * - \ref DCGM_ST_OK if the call was successful. 
+ * - \ref DCGM_ST_BADPARAM if any parameter is invalid + * - \ref DCGM_ST_REQUIRES_ROOT if the hostengine is not running as root + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmCreateMigEntity(dcgmHandle_t dcgmHandle, dcgmCreateMigEntity_t *cme); + +/** + * Delete the specified MIG entity + * + * @param dcgmHandle IN: DCGM Handle + * @param dme IN: struct specifying which entity to delete with flags to suggest how to process it. + * @return + * - \ref DCGM_ST_OK if the call was successful. + * - \ref DCGM_ST_BADPARAM if any parameter is invalid + * - \ref DCGM_ST_REQUIRES_ROOT if the hostengine is not running as root + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmDeleteMigEntity(dcgmHandle_t dcgmHandle, dcgmDeleteMigEntity_t *dme); + +/** + * @brief Pauses all DCGM modules from updating field values + * + * This method sends a pause message to each loaded module. + * It's up to the module to decide whether to handle or ignore the message. + * + * @param[in] pDcgmHandle DCGM Handle of an active connection + * + * @return + * - \ref DCGM_ST_OK if successful + * - \ref DCGM_ST_* on error + * + * @note If this function fails, the modules may be in an inconsistent state. + * @note You may call \ref dcgmModuleGetStatuses to see which modules are paused. + */ +dcgmReturn_t DCGM_PUBLIC_API dcgmPauseTelemetryForDiag(dcgmHandle_t pDcgmHandle); + +/** + * @brief Resumes all DCGM modules to updating field values + * + * This method sends a resume message to each loaded module. + * It's up to the module to decide whether to handle or ignore the message. + * + * @param[in] pDcgmHandle DCGM Handle of an active connection + * + * @return + * - \ref DCGM_ST_OK if successful + * - \ref DCGM_ST_* on error + * + * @note If this function fails, the modules may be in an inconsistent state. + * @note You may call \ref dcgmModuleGetStatuses to see which modules are resumed. The satus of the resumed modules + * should be \ref dcgmModuleStatus_t::DcgmModuleStatusLoaded. 
+ */ +dcgmReturn_t DCGM_PUBLIC_API dcgmResumeTelemetryForDiag(dcgmHandle_t pDcgmHandle); + +#ifdef __cplusplus +} +#endif + +#endif /* DCGM_AGENT_INTERNAL_H */ diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_test_structs.h b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_test_structs.h new file mode 100644 index 0000000000..83e3418119 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/dcgm_test_structs.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * File: dcgm_test_structs.h + */ + +#ifndef DCGM_TEST_STRUCTS_H +#define DCGM_TEST_STRUCTS_H + +#include "dcgm_fields.h" +#include "dcgm_structs.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Structure to represent default and target vgpu configuration for a device + */ +typedef struct +{ + unsigned int version; //!< Version number (dcgmConfig_version) + unsigned int gpuId; //!< GPU ID + unsigned int eccMode; //!< ECC Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored) + unsigned int computeMode; //!< Compute Mode (One of DCGM_CONFIG_COMPUTEMODE_? 
OR DCGM_INT32_BLANK to Ignore) + dcgmConfigPerfStateSettings_t perfState; //!< Performance State Settings (clocks / boost mode) + dcgmConfigPowerLimit_t powerLimit; //!< Power Limits +} dcgmVgpuConfig_v1; + +/** + * Typedef for \ref dcgmVgpuConfig_v1 + */ +typedef dcgmVgpuConfig_v1 dcgmVgpuConfig_t; + +/** + * Version 1 for \ref dcgmVgpuConfig_v1 + */ +#define dcgmVgpuConfig_version1 MAKE_DCGM_VERSION(dcgmVgpuConfig_v1, 1) + +/** + * Latest version for \ref dcgmVgpuConfig_t + */ +#define dcgmVgpuConfig_version dcgmVgpuConfig_version1 + +/** + * Represents the vGPU attributes corresponding to a physical device + */ +typedef struct +{ + unsigned int version; //!< Version number (dcgmVgpuDeviceAttributes_version) + unsigned int activeVgpuInstanceCount; //!< Count of active vGPU instances on the device + unsigned int activeVgpuInstanceIds[DCGM_MAX_VGPU_INSTANCES_PER_PGPU]; //!< List of vGPU instances + unsigned int creatableVgpuTypeCount; //!< Creatable vGPU type count + unsigned int creatableVgpuTypeIds[DCGM_MAX_VGPU_TYPES_PER_PGPU]; //!< List of Creatable vGPU types + unsigned int supportedVgpuTypeCount; //!< Supported vGPU type count + dcgmDeviceVgpuTypeInfo_v1 + supportedVgpuTypeInfo[DCGM_MAX_VGPU_TYPES_PER_PGPU]; //!< Info related to vGPUs supported on the device + dcgmDeviceVgpuUtilInfo_v1 vgpuUtilInfo[DCGM_MAX_VGPU_TYPES_PER_PGPU]; //!< Utilization specific to vGPU instance + unsigned int gpuUtil; //!< GPU utilization + unsigned int memCopyUtil; //!< Memory utilization + unsigned int encUtil; //!< Encoder utilization + unsigned int decUtil; //!< Decoder utilization +} dcgmVgpuDeviceAttributes_v6; + +/** + * Version 6 for \ref dcgmVgpuDeviceAttributes_v6 + */ +#define dcgmVgpuDeviceAttributes_version6 MAKE_DCGM_VERSION(dcgmVgpuDeviceAttributes_v6, 1) + +typedef struct +{ + unsigned int version; //!< Version number (dcgmVgpuDeviceAttributes_version) + unsigned int activeVgpuInstanceCount; //!< Count of active vGPU instances on the device + unsigned int 
activeVgpuInstanceIds[DCGM_MAX_VGPU_INSTANCES_PER_PGPU]; //!< List of vGPU instances + unsigned int creatableVgpuTypeCount; //!< Creatable vGPU type count + unsigned int creatableVgpuTypeIds[DCGM_MAX_VGPU_TYPES_PER_PGPU]; //!< List of Creatable vGPU types + unsigned int supportedVgpuTypeCount; //!< Supported vGPU type count + dcgmDeviceVgpuTypeInfo_v2 + supportedVgpuTypeInfo[DCGM_MAX_VGPU_TYPES_PER_PGPU]; //!< Info related to vGPUs supported on the device + dcgmDeviceVgpuUtilInfo_v1 vgpuUtilInfo[DCGM_MAX_VGPU_TYPES_PER_PGPU]; //!< Utilization specific to vGPU instance + unsigned int gpuUtil; //!< GPU utilization + unsigned int memCopyUtil; //!< Memory utilization + unsigned int encUtil; //!< Encoder utilization + unsigned int decUtil; //!< Decoder utilization +} dcgmVgpuDeviceAttributes_v7; + +/** + * * Typedef for \ref dcgmVgpuDeviceAttributes_v7 + * */ +typedef dcgmVgpuDeviceAttributes_v7 dcgmVgpuDeviceAttributes_t; + +/** + * Version 7 for \ref dcgmVgpuDeviceAttributes_v7 + */ +#define dcgmVgpuDeviceAttributes_version7 MAKE_DCGM_VERSION(dcgmVgpuDeviceAttributes_v7, 7) + +/** + * Latest version for \ref dcgmVgpuDeviceAttributes_t + */ +#define dcgmVgpuDeviceAttributes_version dcgmVgpuDeviceAttributes_version7 + +/** + * Represents attributes specific to vGPU instance + */ +typedef struct +{ + unsigned int version; //!< Version number (dcgmVgpuInstanceAttributes_version) + char vmId[DCGM_DEVICE_UUID_BUFFER_SIZE]; //!< VM ID of the vGPU instance + char vmName[DCGM_DEVICE_UUID_BUFFER_SIZE]; //!< VM name of the vGPU instance + unsigned int vgpuTypeId; //!< Type ID of the vGPU instance + char vgpuUuid[DCGM_DEVICE_UUID_BUFFER_SIZE]; //!< UUID of the vGPU instance + char vgpuDriverVersion[DCGM_DEVICE_UUID_BUFFER_SIZE]; //!< Driver version of the vGPU instance + unsigned int fbUsage; //!< Fb usage of the vGPU instance + unsigned int licenseStatus; //!< License status of the vGPU instance + unsigned int frameRateLimit; //!< Frame rate limit of the vGPU instance +} 
dcgmVgpuInstanceAttributes_v1; + +/** + * Typedef for \ref dcgmVgpuInstanceAttributes_v1 + */ +typedef dcgmVgpuInstanceAttributes_v1 dcgmVgpuInstanceAttributes_t; + +/** + * Version 1 for \ref dcgmVgpuInstanceAttributes_v1 + */ +#define dcgmVgpuInstanceAttributes_version1 MAKE_DCGM_VERSION(dcgmVgpuInstanceAttributes_v1, 1) + +/** + * Latest version for \ref dcgmVgpuInstanceAttributes_t + */ +#define dcgmVgpuInstanceAttributes_version dcgmVgpuInstanceAttributes_version1 + +/* Flags to ask nv-hostengine to process MIG events differently. */ +/* + * This flag is only meant to be used when running many commands that will trigger the + * MIG configuration to get loaded again. The intended use is that if you are running + * many commands that will cause the MIG configuration to change, then ask the hostengine + * to only process the last one in order to prevent conflicts in how you are updating the + * MIG information. For example, if you delete one compute instance on a GPU and + * the hostengine processes the event from NVML before you delete the next one, the ID + * of the compute instance will have changed in between. 
Using the flag asks the + * hostengine to ignore those events temporarily while you are performing updates */ +#define DCGM_MIG_RECONFIG_DELAY_PROCESSING 0x1 + +typedef struct +{ + unsigned int version; //!< Version number of this struct + dcgm_field_entity_group_t entityGroupId; //!< entity group of the MIG entity being deleted + dcgm_field_eid_t entityId; //!< entity id of the MIG entity being deleted + unsigned int flags; //!< flags to influence nv-hostengine's processing of the request +} dcgmDeleteMigEntity_v1; + +#define dcgmDeleteMigEntity_version1 MAKE_DCGM_VERSION(dcgmDeleteMigEntity_v1, 1) + +#define dcgmDeleteMigEntity_version dcgmDeleteMigEntity_version1 + +typedef dcgmDeleteMigEntity_v1 dcgmDeleteMigEntity_t; + +/** + * Enum for the kinds of MIG creations + */ +typedef enum +{ + DcgmMigCreateGpuInstance = 0, /*!< Create a GPU instance */ + DcgmMigCreateComputeInstance = 1, /*!< Create a compute instance */ +} dcgmMigCreate_t; + +typedef struct +{ + unsigned int version; //!< Version number of this request + dcgm_field_eid_t parentId; //!< The entity id of the parent (entity group is inferred from createOption + dcgmMigProfile_t profile; //!< Specify the MIG profile to create + dcgmMigCreate_t createOption; //!< Specify if we're creating a GPU instance or compute instance + unsigned int flags; //!< flags to influence nv-hostengine's processing of the request +} dcgmCreateMigEntity_v1; + +#define dcgmCreateMigEntity_version1 MAKE_DCGM_VERSION(dcgmCreateMigEntity_v1, 1) + +#define dcgmCreateMigEntity_version dcgmCreateMigEntity_version1 + +typedef dcgmCreateMigEntity_v1 dcgmCreateMigEntity_t; + +#ifdef __cplusplus +} +#endif + +#endif /* DCGM_STRUCTS_H */ diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/device_info.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/device_info.go new file mode 100644 index 0000000000..5e229c2d12 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/device_info.go @@ -0,0 +1,260 @@ +package dcgm + +/* +#include 
"dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "fmt" + "math/rand" + "unsafe" + + "github.com/bits-and-blooms/bitset" +) + +type PCIInfo struct { + BusID string + BAR1 uint // MB + FBTotal uint // MB + Bandwidth int64 // MB/s +} + +type DeviceIdentifiers struct { + Brand string + Model string + Serial string + Vbios string + InforomImageVersion string + DriverVersion string +} + +type Device struct { + GPU uint + DCGMSupported string + UUID string + Power uint // W + PCI PCIInfo + Identifiers DeviceIdentifiers + Topology []P2PLink + CPUAffinity string +} + +// getAllDeviceCount counts all GPUs on the system +func getAllDeviceCount() (gpuCount uint, err error) { + var gpuIdList [C.DCGM_MAX_NUM_DEVICES]C.uint + var count C.int + + result := C.dcgmGetAllDevices(handle.handle, &gpuIdList[0], &count) + if err = errorString(result); err != nil { + return gpuCount, fmt.Errorf("Error getting devices count: %s", err) + } + gpuCount = uint(count) + return +} + +// getAllDeviceCount counts all GPUs on the system +func getEntityGroupEntities(entityGroup Field_Entity_Group) (entities []uint, err error) { + var pEntities [C.DCGM_MAX_NUM_DEVICES]C.uint + var count C.int = C.DCGM_MAX_NUM_DEVICES + + result := C.dcgmGetEntityGroupEntities(handle.handle, C.dcgm_field_entity_group_t(entityGroup), &pEntities[0], &count, 0) + if err = errorString(result); err != nil { + return nil, fmt.Errorf("Error getting entity count: %s", err) + } + + for i := 0; i < int(count); i++ { + entities = append(entities, uint(pEntities[i])) + } + return entities, nil +} + +// getSupportedDevices returns DCGM supported GPUs +func getSupportedDevices() (gpus []uint, err error) { + var gpuIdList [C.DCGM_MAX_NUM_DEVICES]C.uint + var count C.int + + result := C.dcgmGetAllSupportedDevices(handle.handle, &gpuIdList[0], &count) + if err = errorString(result); err != nil { + return gpus, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + numGpus := int(count) + gpus = 
make([]uint, numGpus) + for i := 0; i < numGpus; i++ { + gpus[i] = uint(gpuIdList[i]) + } + return +} + +func getPciBandwidth(gpuId uint) (int64, error) { + const ( + maxLinkGen int = iota + maxLinkWidth + fieldsCount + ) + + pciFields := make([]Short, fieldsCount) + pciFields[maxLinkGen] = C.DCGM_FI_DEV_PCIE_MAX_LINK_GEN + pciFields[maxLinkWidth] = C.DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH + + fieldsName := fmt.Sprintf("pciBandwidthFields%d", rand.Uint64()) + + fieldsId, err := FieldGroupCreate(fieldsName, pciFields) + if err != nil { + return 0, err + } + + groupName := fmt.Sprintf("pciBandwidth%d", rand.Uint64()) + groupId, err := WatchFields(gpuId, fieldsId, groupName) + if err != nil { + _ = FieldGroupDestroy(fieldsId) + return 0, err + } + + values, err := GetLatestValuesForFields(gpuId, pciFields) + if err != nil { + _ = FieldGroupDestroy(fieldsId) + _ = DestroyGroup(groupId) + return 0, fmt.Errorf("Error getting Pcie bandwidth: %s", err) + } + + gen := values[maxLinkGen].Int64() + width := values[maxLinkWidth].Int64() + + _ = FieldGroupDestroy(fieldsId) + _ = DestroyGroup(groupId) + + genMap := map[int64]int64{ + 1: 250, // MB/s + 2: 500, + 3: 985, + 4: 1969, + } + + bandwidth := genMap[gen] * width + return bandwidth, nil +} + +func getCPUAffinity(gpuId uint) (string, error) { + const ( + affinity0 int = iota + affinity1 + affinity2 + affinity3 + fieldsCount + ) + + affFields := make([]Short, fieldsCount) + affFields[affinity0] = C.DCGM_FI_DEV_CPU_AFFINITY_0 + affFields[affinity1] = C.DCGM_FI_DEV_CPU_AFFINITY_1 + affFields[affinity2] = C.DCGM_FI_DEV_CPU_AFFINITY_2 + affFields[affinity3] = C.DCGM_FI_DEV_CPU_AFFINITY_3 + + fieldsName := fmt.Sprintf("cpuAffFields%d", rand.Uint64()) + + fieldsId, err := FieldGroupCreate(fieldsName, affFields) + if err != nil { + return "N/A", err + } + defer FieldGroupDestroy(fieldsId) + + groupName := fmt.Sprintf("cpuAff%d", rand.Uint64()) + groupId, err := WatchFields(gpuId, fieldsId, groupName) + if err != nil { + return "N/A", err 
+ } + defer DestroyGroup(groupId) + + values, err := GetLatestValuesForFields(gpuId, affFields) + if err != nil { + return "N/A", fmt.Errorf("Error getting cpu affinity: %s", err) + } + + bits := make([]uint64, 4) + bits[0] = uint64(values[affinity0].Int64()) + bits[1] = uint64(values[affinity1].Int64()) + bits[2] = uint64(values[affinity2].Int64()) + bits[3] = uint64(values[affinity3].Int64()) + + b := bitset.From(bits) + + return b.String(), nil +} + +func getDeviceInfo(gpuid uint) (deviceInfo Device, err error) { + var device C.dcgmDeviceAttributes_t + device.version = makeVersion3(unsafe.Sizeof(device)) + + result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device) + if err = errorString(result); err != nil { + return deviceInfo, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + // check if the given GPU is DCGM supported + gpus, err := getSupportedDevices() + if err != nil { + return + } + + supported := "No" + + for _, gpu := range gpus { + if gpuid == gpu { + supported = "Yes" + break + } + } + + busid := *stringPtr(&device.identifiers.pciBusId[0]) + + cpuAffinity, err := getCPUAffinity(gpuid) + if err != nil { + return + } + + var topology []P2PLink + var bandwidth int64 + // get device topology and bandwidth only if its a DCGM supported device + if supported == "Yes" { + topology, err = getDeviceTopology(gpuid) + if err != nil { + return + } + bandwidth, err = getPciBandwidth(gpuid) + if err != nil { + return + } + } + + uuid := *stringPtr(&device.identifiers.uuid[0]) + power := *uintPtr(device.powerLimits.defaultPowerLimit) + + pci := PCIInfo{ + BusID: busid, + BAR1: *uintPtr(device.memoryUsage.bar1Total), + FBTotal: *uintPtr(device.memoryUsage.fbTotal), + Bandwidth: bandwidth, + } + + identifiers := DeviceIdentifiers{ + Brand: *stringPtr(&device.identifiers.brandName[0]), + Model: *stringPtr(&device.identifiers.deviceName[0]), + Serial: *stringPtr(&device.identifiers.serial[0]), + Vbios: 
*stringPtr(&device.identifiers.vbios[0]), + InforomImageVersion: *stringPtr(&device.identifiers.inforomImageVersion[0]), + DriverVersion: *stringPtr(&device.identifiers.driverVersion[0]), + } + + deviceInfo = Device{ + GPU: gpuid, + DCGMSupported: supported, + UUID: uuid, + Power: power, + PCI: pci, + Identifiers: identifiers, + Topology: topology, + CPUAffinity: cpuAffinity, + } + return +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/device_status.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/device_status.go new file mode 100644 index 0000000000..4aa7918c6d --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/device_status.go @@ -0,0 +1,179 @@ +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "fmt" + "math/rand" +) + +type PerfState uint + +const ( + PerfStateMax = 0 + PerfStateMin = 15 + PerfStateUnknown = 32 +) + +func (p PerfState) String() string { + if p >= PerfStateMax && p <= PerfStateMin { + return fmt.Sprintf("P%d", p) + } + return "Unknown" +} + +type UtilizationInfo struct { + GPU int64 // % + Memory int64 // % + Encoder int64 // % + Decoder int64 // % +} + +type ECCErrorsInfo struct { + SingleBit int64 + DoubleBit int64 +} + +type MemoryInfo struct { + GlobalUsed int64 + ECCErrors ECCErrorsInfo +} + +type ClockInfo struct { + Cores int64 // MHz + Memory int64 // MHz +} + +type PCIThroughputInfo struct { + Rx int64 // MB + Tx int64 // MB + Replays int64 +} + +type PCIStatusInfo struct { + BAR1Used int64 // MB + Throughput PCIThroughputInfo + FBUsed int64 +} + +type DeviceStatus struct { + Power float64 // W + Temperature int64 // °C + Utilization UtilizationInfo + Memory MemoryInfo + Clocks ClockInfo + PCI PCIStatusInfo + Performance PerfState + FanSpeed int64 // % +} + +func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) { + const ( + pwr int = iota + temp + sm + mem + enc + dec + smClock + memClock + bar1Used + pcieRxThroughput + pcieTxThroughput + pcieReplay + 
fbUsed + sbe + dbe + pstate + fanSpeed + fieldsCount + ) + + deviceFields := make([]Short, fieldsCount) + deviceFields[pwr] = C.DCGM_FI_DEV_POWER_USAGE + deviceFields[temp] = C.DCGM_FI_DEV_GPU_TEMP + deviceFields[sm] = C.DCGM_FI_DEV_GPU_UTIL + deviceFields[mem] = C.DCGM_FI_DEV_MEM_COPY_UTIL + deviceFields[enc] = C.DCGM_FI_DEV_ENC_UTIL + deviceFields[dec] = C.DCGM_FI_DEV_DEC_UTIL + deviceFields[smClock] = C.DCGM_FI_DEV_SM_CLOCK + deviceFields[memClock] = C.DCGM_FI_DEV_MEM_CLOCK + deviceFields[bar1Used] = C.DCGM_FI_DEV_BAR1_USED + deviceFields[pcieRxThroughput] = C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT + deviceFields[pcieTxThroughput] = C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT + deviceFields[pcieReplay] = C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER + deviceFields[fbUsed] = C.DCGM_FI_DEV_FB_USED + deviceFields[sbe] = C.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL + deviceFields[dbe] = C.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL + deviceFields[pstate] = C.DCGM_FI_DEV_PSTATE + deviceFields[fanSpeed] = C.DCGM_FI_DEV_FAN_SPEED + + fieldsName := fmt.Sprintf("devStatusFields%d", rand.Uint64()) + fieldsId, err := FieldGroupCreate(fieldsName, deviceFields) + if err != nil { + return + } + + groupName := fmt.Sprintf("devStatus%d", rand.Uint64()) + groupId, err := WatchFields(gpuId, fieldsId, groupName) + if err != nil { + _ = FieldGroupDestroy(fieldsId) + return + } + + values, err := GetLatestValuesForFields(gpuId, deviceFields) + if err != nil { + _ = FieldGroupDestroy(fieldsId) + _ = DestroyGroup(groupId) + return status, err + } + + power := values[pwr].Float64() + + gpuUtil := UtilizationInfo{ + GPU: values[sm].Int64(), + Memory: values[mem].Int64(), + Encoder: values[enc].Int64(), + Decoder: values[dec].Int64(), + } + + memory := MemoryInfo{ + ECCErrors: ECCErrorsInfo{ + SingleBit: values[sbe].Int64(), + DoubleBit: values[dbe].Int64(), + }, + } + + clocks := ClockInfo{ + Cores: values[smClock].Int64(), + Memory: values[memClock].Int64(), + } + + pci := PCIStatusInfo{ + BAR1Used: values[bar1Used].Int64(), + Throughput: 
PCIThroughputInfo{ + Rx: values[pcieRxThroughput].Int64(), + Tx: values[pcieTxThroughput].Int64(), + Replays: values[pcieReplay].Int64(), + }, + FBUsed: values[fbUsed].Int64(), + } + + status = DeviceStatus{ + Power: power, + Temperature: values[temp].Int64(), + Utilization: gpuUtil, + Memory: memory, + Clocks: clocks, + PCI: pci, + Performance: PerfState(values[pstate].Int64()), + FanSpeed: values[fanSpeed].Int64(), + } + + _ = FieldGroupDestroy(fieldsId) + _ = DestroyGroup(groupId) + return +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/diag.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/diag.go new file mode 100644 index 0000000000..8212842c93 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/diag.go @@ -0,0 +1,169 @@ +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "unsafe" +) + +const DIAG_RESULT_STRING_SIZE = 1024 + +type DiagType int + +const ( + DiagQuick DiagType = 1 + DiagMedium = 2 + DiagLong = 3 + DiagExtended = 4 +) + +type DiagResult struct { + Status string + TestName string + TestOutput string + ErrorCode uint + ErrorMessage string +} + +type GpuResult struct { + GPU uint + RC uint + DiagResults []DiagResult +} + +type DiagResults struct { + Software []DiagResult + PerGpu []GpuResult +} + +func diagResultString(r int) string { + switch r { + case C.DCGM_DIAG_RESULT_PASS: + return "pass" + case C.DCGM_DIAG_RESULT_SKIP: + return "skipped" + case C.DCGM_DIAG_RESULT_WARN: + return "warn" + case C.DCGM_DIAG_RESULT_FAIL: + return "fail" + case C.DCGM_DIAG_RESULT_NOT_RUN: + return "notrun" + } + return "" +} + +func swTestName(t int) string { + switch t { + case C.DCGM_SWTEST_DENYLIST: + return "presence of drivers on the denylist (e.g. 
nouveau)" + case C.DCGM_SWTEST_NVML_LIBRARY: + return "presence (and version) of NVML lib" + case C.DCGM_SWTEST_CUDA_MAIN_LIBRARY: + return "presence (and version) of CUDA lib" + case C.DCGM_SWTEST_CUDA_RUNTIME_LIBRARY: + return "presence (and version) of CUDA RT lib" + case C.DCGM_SWTEST_PERMISSIONS: + return "character device permissions" + case C.DCGM_SWTEST_PERSISTENCE_MODE: + return "persistence mode enabled" + case C.DCGM_SWTEST_ENVIRONMENT: + return "CUDA environment vars that may slow tests" + case C.DCGM_SWTEST_PAGE_RETIREMENT: + return "pending frame buffer page retirement" + case C.DCGM_SWTEST_GRAPHICS_PROCESSES: + return "graphics processes running" + case C.DCGM_SWTEST_INFOROM: + return "inforom corruption" + } + + return "" +} + +func gpuTestName(t int) string { + + switch t { + case C.DCGM_MEMORY_INDEX: + return "Memory" + case C.DCGM_DIAGNOSTIC_INDEX: + return "Diagnostic" + case C.DCGM_PCI_INDEX: + return "PCIe" + case C.DCGM_SM_STRESS_INDEX: + return "SM Stress" + case C.DCGM_TARGETED_STRESS_INDEX: + return "Targeted Stress" + case C.DCGM_TARGETED_POWER_INDEX: + return "Targeted Power" + case C.DCGM_MEMORY_BANDWIDTH_INDEX: + return "Memory bandwidth" + case C.DCGM_MEMTEST_INDEX: + return "Memtest" + case C.DCGM_PULSE_TEST_INDEX: + return "Pulse" + case C.DCGM_EUD_TEST_INDEX: + return "EUD" + case C.DCGM_SOFTWARE_INDEX: + return "Software" + case C.DCGM_CONTEXT_CREATE_INDEX: + return "Context create" + } + return "" +} + +func newDiagResult(testResult C.dcgmDiagTestResult_v3, testName string) DiagResult { + msg := C.GoString((*C.char)(unsafe.Pointer(&testResult.error[0].msg))) + info := C.GoString((*C.char)(unsafe.Pointer(&testResult.info))) + + return DiagResult{ + Status: diagResultString(int(testResult.status)), + TestName: testName, + TestOutput: info, + ErrorCode: uint(testResult.error[0].code), + ErrorMessage: msg, + } +} + +func diagLevel(diagType DiagType) C.dcgmDiagnosticLevel_t { + switch diagType { + case DiagQuick: + return 
C.DCGM_DIAG_LVL_SHORT + case DiagMedium: + return C.DCGM_DIAG_LVL_MED + case DiagLong: + return C.DCGM_DIAG_LVL_LONG + case DiagExtended: + return C.DCGM_DIAG_LVL_XLONG + } + return C.DCGM_DIAG_LVL_INVALID +} + +func RunDiag(diagType DiagType, groupId GroupHandle) (DiagResults, error) { + var diagResults C.dcgmDiagResponse_v9 + diagResults.version = makeVersion9(unsafe.Sizeof(diagResults)) + + result := C.dcgmRunDiagnostic(handle.handle, groupId.handle, diagLevel(diagType), (*C.dcgmDiagResponse_v9)(unsafe.Pointer(&diagResults))) + if err := errorString(result); err != nil { + return DiagResults{}, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + var diagRun DiagResults + for i := 0; i < int(diagResults.levelOneTestCount); i++ { + dr := newDiagResult(diagResults.levelOneResults[i], swTestName(i)) + diagRun.Software = append(diagRun.Software, dr) + } + + for i := uint(0); i < uint(diagResults.gpuCount); i++ { + r := diagResults.perGpuResponses[i] + gr := GpuResult{GPU: uint(r.gpuId), RC: uint(r.hwDiagnosticReturn)} + for j := 0; j < int(C.DCGM_PER_GPU_TEST_COUNT_V8); j++ { + dr := newDiagResult(r.results[j], gpuTestName(j)) + gr.DiagResults = append(gr.DiagResults, dr) + } + diagRun.PerGpu = append(diagRun.PerGpu, gr) + } + + return diagRun, nil +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/field_values.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/field_values.go new file mode 100644 index 0000000000..a116fcc958 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/field_values.go @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" +#include "field_values_cb.h" +extern int go_dcgmFieldValueEntityEnumeration(dcgm_field_entity_group_t entityGroupId, + dcgm_field_eid_t entityId, + dcgmFieldValue_v1 *values, + int numValues, + void *userData); +*/ +import "C" +import ( + "fmt" + "sync" + "time" + "unsafe" +) + +type callback struct { + mu sync.Mutex + Values []FieldValue_v2 +} + +func (cb *callback) processValues(entityGroup Field_Entity_Group, entityID uint, cvalues []C.dcgmFieldValue_v1) { + values := dcgmFieldValue_v1ToFieldValue_v2(entityGroup, entityID, cvalues) + cb.mu.Lock() + defer cb.mu.Unlock() + cb.Values = append(cb.Values, values...) +} + +//export go_dcgmFieldValueEntityEnumeration +func go_dcgmFieldValueEntityEnumeration( + entityGroup C.dcgm_field_entity_group_t, + entityID C.dcgm_field_eid_t, + values *C.dcgmFieldValue_v1, + numValues C.int, + userData unsafe.Pointer) C.int { + ptrValues := unsafe.Pointer(values) + if ptrValues != nil { + valuesSlice := (*[1 << 30]C.dcgmFieldValue_v1)(ptrValues)[0:numValues] + if userData != nil { + processor := (*callback)(userData) + processor.processValues(Field_Entity_Group(entityGroup), uint(entityID), valuesSlice) + } + } + return 0 +} + +// GetValuesSince reads and returns field values for a specified group of entities, such as GPUs, +// that have been updated since a given timestamp. It allows for targeted data retrieval based on time criteria. +// +// GPUGroup is a GroupHandle that identifies the group of entities to operate on. 
It can be obtained from CreateGroup +// for a specific group of GPUs or use GroupAllGPUs() to target all GPUs. +// +// fieldGroup is a FieldHandle representing the group of fields for which data is requested. +// +// sinceTime is a time.Time value representing the timestamp from which to request updated values. +// A zero value (time.Time{}) requests all available data. +// +// Returns []FieldValue_v2 slice containing the requested field values, a time.Time indicating the time +// of the latest data retrieval, and an error if there is any issue during the operation. +func GetValuesSince(GPUGroup GroupHandle, fieldGroup FieldHandle, sinceTime time.Time) ([]FieldValue_v2, time.Time, error) { + var ( + nextSinceTimestamp C.longlong + ) + + cbResult := &callback{} + + result := C.dcgmGetValuesSince_v2(handle.handle, + GPUGroup.handle, + C.dcgmFieldGrp_t(fieldGroup.handle), + C.longlong(sinceTime.UnixMicro()), + &nextSinceTimestamp, + (C.dcgmFieldValueEnumeration_f)(unsafe.Pointer(C.fieldValueEntityCallback)), + unsafe.Pointer(cbResult)) + if result != C.DCGM_ST_OK { + return nil, time.Time{}, fmt.Errorf("dcgmGetValuesSince_v2 failed with error code %d", int(result)) + } + + return cbResult.Values, timestampUSECToTime(int64(nextSinceTimestamp)), nil +} + +func timestampUSECToTime(timestampUSEC int64) time.Time { + // Convert microseconds to seconds and nanoseconds + sec := timestampUSEC / 1000000 // Convert microseconds to seconds + nsec := (timestampUSEC % 1000000) * 1000 // Convert the remaining microseconds to nanoseconds + // Use time.Unix to get a time.Time object + return time.Unix(sec, nsec) +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/field_values_cb.c b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/field_values_cb.c new file mode 100644 index 0000000000..d807a2db2f --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/field_values_cb.c @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dcgm_agent.h" +#include "dcgm_structs.h" +#include "_cgo_export.h" + +int fieldValueEntityCallback(dcgm_field_entity_group_t entityGroupId, + dcgm_field_eid_t entityId, + dcgmFieldValue_v1 *values, + int numValues, + void *userData) { + return go_dcgmFieldValueEntityEnumeration(entityGroupId, entityId, values, numValues, userData); +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/field_values_cb.h b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/field_values_cb.h new file mode 100644 index 0000000000..d5ae815ba2 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/field_values_cb.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef FIELD_VALUES +#define FIELD_VALUES + +#include "dcgm_agent.h" +#include "dcgm_structs.h" + +int fieldValueEntityCallback(dcgm_field_entity_group_t entityGroupId, + dcgm_field_eid_t entityId, + dcgmFieldValue_v1 *values, + int numValues, + void *userData); + +#endif \ No newline at end of file diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/fields.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/fields.go new file mode 100644 index 0000000000..d967502dfd --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/fields.go @@ -0,0 +1,315 @@ +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "encoding/binary" + "fmt" + "unicode" + "unsafe" +) + +const ( + defaultUpdateFreq = 30000000 // usec + defaultMaxKeepAge = 0 // sec + defaultMaxKeepSamples = 1 // Keep one sample by default since we only ask for latest +) + +type FieldMeta struct { + FieldId Short + FieldType byte + Size byte + Tag string + Scope int + NvmlFieldId int + EntityLevel Field_Entity_Group +} + +type FieldHandle struct{ handle C.dcgmFieldGrp_t } + +func FieldGroupCreate(fieldsGroupName string, fields []Short) (fieldsId FieldHandle, err error) { + var fieldsGroup C.dcgmFieldGrp_t + cfields := *(*[]C.ushort)(unsafe.Pointer(&fields)) + + groupName := C.CString(fieldsGroupName) + defer freeCString(groupName) + + result := C.dcgmFieldGroupCreate(handle.handle, C.int(len(fields)), &cfields[0], groupName, &fieldsGroup) + if err = errorString(result); err != nil { + return fieldsId, fmt.Errorf("Error creating DCGM fields group: %s", err) + } + + fieldsId = FieldHandle{fieldsGroup} + return +} + +func FieldGroupDestroy(fieldsGroup FieldHandle) (err error) { + result := C.dcgmFieldGroupDestroy(handle.handle, fieldsGroup.handle) + if err = errorString(result); err != nil { + fmt.Errorf("Error destroying DCGM fields group: %s", err) + } + + return +} + +func WatchFields(gpuId uint, fieldsGroup FieldHandle, groupName string) (groupId 
GroupHandle, err error) { + group, err := CreateGroup(groupName) + if err != nil { + return + } + + err = AddToGroup(group, gpuId) + if err != nil { + return + } + + result := C.dcgmWatchFields(handle.handle, group.handle, fieldsGroup.handle, C.longlong(defaultUpdateFreq), C.double(defaultMaxKeepAge), C.int(defaultMaxKeepSamples)) + if err = errorString(result); err != nil { + return groupId, fmt.Errorf("Error watching fields: %s", err) + } + + _ = UpdateAllFields() + return group, nil +} + +func WatchFieldsWithGroupEx(fieldsGroup FieldHandle, group GroupHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32) error { + result := C.dcgmWatchFields(handle.handle, group.handle, fieldsGroup.handle, + C.longlong(updateFreq), C.double(maxKeepAge), C.int(maxKeepSamples)) + + if err := errorString(result); err != nil { + return fmt.Errorf("Error watching fields: %s", err) + } + + if err := UpdateAllFields(); err != nil { + return err + } + + return nil +} + +func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error { + return WatchFieldsWithGroupEx(fieldsGroup, group, defaultUpdateFreq, defaultMaxKeepAge, defaultMaxKeepSamples) +} + +func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error) { + values := make([]C.dcgmFieldValue_v1, len(fields)) + cfields := *(*[]C.ushort)(unsafe.Pointer(&fields)) + + result := C.dcgmGetLatestValuesForFields(handle.handle, C.int(gpu), &cfields[0], C.uint(len(fields)), &values[0]) + if err := errorString(result); err != nil { + return nil, fmt.Errorf("Error watching fields: %s", err) + } + + return toFieldValue(values), nil +} + +func LinkGetLatestValues(index uint, parentId uint, fields []Short) ([]FieldValue_v1, error) { + slice := []byte{uint8(FE_SWITCH), uint8(index), uint8(parentId), 0} + + entityId := binary.LittleEndian.Uint32(slice) + + return EntityGetLatestValues(FE_LINK, uint(entityId), fields) +} + +func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, 
fields []Short) ([]FieldValue_v1, error) { + values := make([]C.dcgmFieldValue_v1, len(fields)) + cfields := (*C.ushort)(unsafe.Pointer(&fields[0])) + + result := C.dcgmEntityGetLatestValues(handle.handle, C.dcgm_field_entity_group_t(entityGroup), C.int(entityId), cfields, C.uint(len(fields)), &values[0]) + if result != C.DCGM_ST_OK { + return nil, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + return toFieldValue(values), nil +} + +func EntitiesGetLatestValues(entities []GroupEntityPair, fields []Short, flags uint) ([]FieldValue_v2, error) { + values := make([]C.dcgmFieldValue_v2, len(fields)*len(entities)) + cfields := (*C.ushort)(unsafe.Pointer(&fields[0])) + cEntities := make([]C.dcgmGroupEntityPair_t, len(entities)) + cPtrEntities := *(*[]C.dcgmGroupEntityPair_t)(unsafe.Pointer(&cEntities)) + for i, entity := range entities { + cEntities[i] = C.dcgmGroupEntityPair_t{C.dcgm_field_entity_group_t(entity.EntityGroupId), C.dcgm_field_eid_t(entity.EntityId)} + } + + result := C.dcgmEntitiesGetLatestValues(handle.handle, &cPtrEntities[0], C.uint(len(entities)), cfields, C.uint(len(fields)), C.uint(flags), &values[0]) + if err := errorString(result); err != nil { + return nil, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + return toFieldValue_v2(values), nil +} + +func UpdateAllFields() error { + waitForUpdate := C.int(1) + result := C.dcgmUpdateAllFields(handle.handle, waitForUpdate) + + return errorString(result) +} + +func toFieldValue(cfields []C.dcgmFieldValue_v1) []FieldValue_v1 { + fields := make([]FieldValue_v1, len(cfields)) + for i, f := range cfields { + fields[i] = FieldValue_v1{ + Version: uint(f.version), + FieldId: uint(f.fieldId), + FieldType: uint(f.fieldType), + Status: int(f.status), + Ts: int64(f.ts), + Value: f.value, + } + } + + return fields +} + +func (fv FieldValue_v1) Int64() int64 { + return *(*int64)(unsafe.Pointer(&fv.Value[0])) +} + +func (fv FieldValue_v1) Float64() float64 { + return 
*(*float64)(unsafe.Pointer(&fv.Value[0])) +} + +func (fv FieldValue_v1) String() string { + return C.GoString((*C.char)(unsafe.Pointer(&fv.Value[0]))) +} + +func (fv FieldValue_v1) Blob() [4096]byte { + return fv.Value +} + +func toFieldValue_v2(cfields []C.dcgmFieldValue_v2) []FieldValue_v2 { + fields := make([]FieldValue_v2, len(cfields)) + for i, f := range cfields { + if uint(f.fieldType) == DCGM_FT_STRING { + fields[i] = FieldValue_v2{ + Version: uint(f.version), + EntityGroupId: Field_Entity_Group(f.entityGroupId), + EntityId: uint(f.entityId), + FieldId: uint(f.fieldId), + FieldType: uint(f.fieldType), + Status: int(f.status), + Ts: int64(f.ts), + Value: f.value, + StringValue: stringPtr((*C.char)(unsafe.Pointer(&f.value[0]))), + } + } else { + fields[i] = FieldValue_v2{ + Version: uint(f.version), + EntityGroupId: Field_Entity_Group(f.entityGroupId), + EntityId: uint(f.entityId), + FieldId: uint(f.fieldId), + FieldType: uint(f.fieldType), + Status: int(f.status), + Ts: int64(f.ts), + Value: f.value, + StringValue: nil, + } + } + } + + return fields +} + +func dcgmFieldValue_v1ToFieldValue_v2(fieldEntityGroup Field_Entity_Group, entityId uint, cfields []C.dcgmFieldValue_v1) []FieldValue_v2 { + fields := make([]FieldValue_v2, len(cfields)) + for i, f := range cfields { + fields[i] = FieldValue_v2{ + Version: C.dcgmFieldValue_version2, + EntityGroupId: fieldEntityGroup, + EntityId: entityId, + FieldId: uint(f.fieldId), + FieldType: uint(f.fieldType), + Status: int(f.status), + Ts: int64(f.ts), + Value: f.value, + StringValue: nil, + } + + if uint(f.fieldType) == DCGM_FT_STRING { + fields[i].StringValue = stringPtr((*C.char)(unsafe.Pointer(&f.value[0]))) + } + } + + return fields +} + +func (fv FieldValue_v2) Int64() int64 { + return *(*int64)(unsafe.Pointer(&fv.Value[0])) +} + +func (fv FieldValue_v2) Float64() float64 { + return *(*float64)(unsafe.Pointer(&fv.Value[0])) +} + +func (fv FieldValue_v2) String() string { + return 
C.GoString((*C.char)(unsafe.Pointer(&fv.Value[0]))) +} + +func (fv FieldValue_v2) Blob() [4096]byte { + return fv.Value +} + +// Deprecated: Fv2_Int64 exists for backward compatibility +// and should not be used. To access the int64 returned by a FieldValue_v2, +// use the FieldValue_v2.Int64 method. +func Fv2_Int64(fv FieldValue_v2) int64 { + return *(*int64)(unsafe.Pointer(&fv.Value[0])) +} + +// Deprecated: Fv2_Float64 exists for backward compatibility +// and should not be used. To access the int64 returned by a FieldValue_v2, +// use the FieldValue_v2.Float64 method. +func Fv2_Float64(fv FieldValue_v2) float64 { + return *(*float64)(unsafe.Pointer(&fv.Value[0])) +} + +func FindFirstNonAsciiIndex(value [4096]byte) int { + for i := 0; i < 4096; i++ { + if value[i] > unicode.MaxASCII || value[i] < 33 { + return i + } + } + + return 4096 +} + +func Fv2_String(fv FieldValue_v2) string { + if fv.FieldType == DCGM_FT_STRING { + return *fv.StringValue + } else { + return string(fv.Value[:]) + } +} + +func Fv2_Blob(fv FieldValue_v2) [4096]byte { + return fv.Value +} + +func ToFieldMeta(fieldInfo C.dcgm_field_meta_p) FieldMeta { + return FieldMeta{ + FieldId: Short(fieldInfo.fieldId), + FieldType: byte(fieldInfo.fieldType), + Size: byte(fieldInfo.size), + Tag: *stringPtr((*C.char)(unsafe.Pointer(&fieldInfo.tag[0]))), + Scope: int(fieldInfo.scope), + NvmlFieldId: int(fieldInfo.nvmlFieldId), + EntityLevel: Field_Entity_Group(fieldInfo.entityLevel), + } +} + +func FieldGetById(fieldId Short) FieldMeta { + return ToFieldMeta(C.DcgmFieldGetById(C.ushort(fieldId))) +} + +func FieldsInit() int { + return int(C.DcgmFieldsInit()) +} + +func FieldsTerm() int { + return int(C.DcgmFieldsTerm()) +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/gpu_group.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/gpu_group.go new file mode 100644 index 0000000000..2e83db2a70 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/gpu_group.go @@ -0,0 +1,85 @@ +package dcgm + +/* 
+#include "dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "encoding/binary" + "fmt" +) + +const ( + DCGM_GROUP_MAX_ENTITIES int = C.DCGM_GROUP_MAX_ENTITIES +) + +type GroupHandle struct{ handle C.dcgmGpuGrp_t } + +func GroupAllGPUs() GroupHandle { + return GroupHandle{C.DCGM_GROUP_ALL_GPUS} +} + +func CreateGroup(groupName string) (goGroupId GroupHandle, err error) { + var cGroupId C.dcgmGpuGrp_t + cname := C.CString(groupName) + defer freeCString(cname) + + result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_EMPTY, cname, &cGroupId) + if err = errorString(result); err != nil { + return goGroupId, fmt.Errorf("Error creating group: %s", err) + } + + goGroupId = GroupHandle{cGroupId} + return +} + +func NewDefaultGroup(groupName string) (GroupHandle, error) { + var cGroupId C.dcgmGpuGrp_t + + cname := C.CString(groupName) + defer freeCString(cname) + + result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_DEFAULT, cname, &cGroupId) + if err := errorString(result); err != nil { + return GroupHandle{}, fmt.Errorf("Error creating group: %s", err) + } + + return GroupHandle{cGroupId}, nil +} + +func AddToGroup(groupId GroupHandle, gpuId uint) (err error) { + result := C.dcgmGroupAddDevice(handle.handle, groupId.handle, C.uint(gpuId)) + if err = errorString(result); err != nil { + return fmt.Errorf("Error adding GPU %v to group: %s", gpuId, err) + } + + return +} + +func AddLinkEntityToGroup(groupId GroupHandle, index uint, parentId uint) (err error) { + /* Only supported on little-endian systems currently */ + slice := []byte{uint8(FE_SWITCH), uint8(index), uint8(parentId), 0} + + entityId := binary.LittleEndian.Uint32(slice) + + return AddEntityToGroup(groupId, FE_LINK, uint(entityId)) +} + +func AddEntityToGroup(groupId GroupHandle, entityGroupId Field_Entity_Group, entityId uint) (err error) { + result := C.dcgmGroupAddEntity(handle.handle, groupId.handle, C.dcgm_field_entity_group_t(entityGroupId), C.uint(entityId)) + if err = 
errorString(result); err != nil { + return fmt.Errorf("Error adding entity group type %v, entity %v to group: %s", entityGroupId, entityId, err) + } + + return +} + +func DestroyGroup(groupId GroupHandle) (err error) { + result := C.dcgmGroupDestroy(handle.handle, groupId.handle) + if err = errorString(result); err != nil { + return fmt.Errorf("Error destroying group: %s", err) + } + + return +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/health.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/health.go new file mode 100644 index 0000000000..3a5f6adc1d --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/health.go @@ -0,0 +1,121 @@ +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "fmt" + "math/rand" + "unsafe" +) + +type SystemWatch struct { + Type string + Status string + Error string +} + +type DeviceHealth struct { + GPU uint + Status string + Watches []SystemWatch +} + +func setHealthWatches(groupId GroupHandle) (err error) { + result := C.dcgmHealthSet(handle.handle, groupId.handle, C.DCGM_HEALTH_WATCH_ALL) + if err = errorString(result); err != nil { + return fmt.Errorf("Error setting health watches: %s", err) + } + return +} + +func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) { + name := fmt.Sprintf("health%d", rand.Uint64()) + groupId, err := CreateGroup(name) + if err != nil { + return + } + + err = AddToGroup(groupId, gpuId) + if err != nil { + return + } + + err = setHealthWatches(groupId) + if err != nil { + return + } + + var healthResults C.dcgmHealthResponse_v4 + healthResults.version = makeVersion4(unsafe.Sizeof(healthResults)) + + result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults))) + + if err = errorString(result); err != nil { + return deviceHealth, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + status := healthStatus(int8(healthResults.overallHealth)) + watches 
:= []SystemWatch{} + + // number of watches that encountred error/warning + incidents := uint(healthResults.incidentCount) + + for j := uint(0); j < incidents; j++ { + watch := SystemWatch{ + Type: systemWatch(int(healthResults.incidents[j].system)), + Status: healthStatus(int8(healthResults.incidents[j].health)), + + Error: *stringPtr(&healthResults.incidents[j].error.msg[0]), + } + watches = append(watches, watch) + } + + deviceHealth = DeviceHealth{ + GPU: gpuId, + Status: status, + Watches: watches, + } + _ = DestroyGroup(groupId) + return +} + +func healthStatus(status int8) string { + switch status { + case 0: + return "Healthy" + case 10: + return "Warning" + case 20: + return "Failure" + } + return "N/A" +} + +func systemWatch(watch int) string { + switch watch { + case 1: + return "PCIe watches" + case 2: + return "NVLINK watches" + case 4: + return "Power Managemnt unit watches" + case 8: + return "Microcontroller unit watches" + case 16: + return "Memory watches" + case 32: + return "Streaming Multiprocessor watches" + case 64: + return "Inforom watches" + case 128: + return "Temperature watches" + case 256: + return "Power watches" + case 512: + return "Driver-related watches" + } + return "N/A" +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/hostengine_status.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/hostengine_status.go new file mode 100644 index 0000000000..695df9bf49 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/hostengine_status.go @@ -0,0 +1,41 @@ +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "unsafe" +) + +type DcgmStatus struct { + Memory int64 + CPU float64 +} + +func introspect() (engine DcgmStatus, err error) { + var memory C.dcgmIntrospectMemory_t + memory.version = makeVersion1(unsafe.Sizeof(memory)) + waitIfNoData := 1 + result := C.dcgmIntrospectGetHostengineMemoryUsage(handle.handle, &memory, C.int(waitIfNoData)) + + if err = errorString(result); err != 
nil { + return engine, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + var cpu C.dcgmIntrospectCpuUtil_t + + cpu.version = makeVersion1(unsafe.Sizeof(cpu)) + result = C.dcgmIntrospectGetHostengineCpuUtilization(handle.handle, &cpu, C.int(waitIfNoData)) + + if err = errorString(result); err != nil { + return engine, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + engine = DcgmStatus{ + Memory: toInt64(memory.bytesUsed) / 1024, + CPU: *dblToFloat(cpu.total) * 100, + } + return +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/internal.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/internal.go new file mode 100644 index 0000000000..8a1f2ff48d --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/internal.go @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package dcgm + +/* +#cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files +#cgo darwin LDFLAGS: -ldl -Wl,--export-dynamic -Wl,-undefined,dynamic_lookup + +#include "dcgm_test_apis.h" +#include "dcgm_test_structs.h" +#include "dcgm_structs_internal.h" +*/ +import "C" +import ( + "unsafe" +) + +type MigHierarchyInfo struct { + Entity GroupEntityPair + Parent GroupEntityPair + SliceProfile MigProfile +} + +func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error) { + ccfe := C.dcgmCreateFakeEntities_v2{ + version: C.dcgmCreateFakeEntities_version2, + numToCreate: C.uint(len(entities)), + entityList: [C.DCGM_MAX_HIERARCHY_INFO]C.dcgmMigHierarchyInfo_t{}, + } + for i := range entities { + if i >= C.DCGM_MAX_HIERARCHY_INFO { + break + } + entity := entities[i] + ccfe.entityList[i] = C.dcgmMigHierarchyInfo_t{ + entity: C.dcgmGroupEntityPair_t{ + entityGroupId: C.dcgm_field_entity_group_t(entity.Entity.EntityGroupId), + entityId: C.uint(entity.Entity.EntityId), + }, + parent: C.dcgmGroupEntityPair_t{ + entityGroupId: C.dcgm_field_entity_group_t(entity.Parent.EntityGroupId), + entityId: C.uint(entity.Parent.EntityId), + }, + sliceProfile: C.dcgmMigProfile_t(entity.SliceProfile), + } + } + result := C.dcgmCreateFakeEntities(handle.handle, &ccfe) + + if err := errorString(result); err != nil { + return nil, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + gpuIDs := make([]uint, ccfe.numToCreate) + for i := 0; i < int(ccfe.numToCreate); i++ { + gpuIDs[i] = uint(ccfe.entityList[i].entity.entityId) + } + + return gpuIDs, nil +} + +func InjectFieldValue(gpu uint, fieldID uint, fieldType uint, status int, ts int64, value interface{}) error { + field := C.dcgmInjectFieldValue_t{ + version: C.dcgmInjectFieldValue_version1, + fieldId: C.ushort(fieldID), + fieldType: C.ushort(fieldType), + status: C.int(status), + ts: C.long(ts), + } + + switch fieldType { + case DCGM_FT_INT64: + i64Val := value.(int64) + 
ptr := (*C.int64_t)(unsafe.Pointer(&field.value[0])) + *ptr = C.int64_t(i64Val) + case DCGM_FT_DOUBLE: + dbVal := value.(float64) + ptr := (*C.double)(unsafe.Pointer(&field.value[0])) + *ptr = C.double(dbVal) + } + + result := C.dcgmInjectFieldValue(handle.handle, C.uint(gpu), &field) + + if err := errorString(result); err != nil { + return &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + return nil +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/mig.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/mig.go new file mode 100644 index 0000000000..00beeebbc5 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/mig.go @@ -0,0 +1,114 @@ +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "fmt" + "unsafe" +) + +type Field_Entity_Group uint + +const ( + FE_NONE Field_Entity_Group = iota + FE_GPU + FE_VGPU + FE_SWITCH + FE_GPU_I + FE_GPU_CI + FE_LINK + FE_CPU + FE_CPU_CORE + FE_COUNT +) + +func (e Field_Entity_Group) String() string { + switch e { + case FE_GPU: + return "GPU" + case FE_VGPU: + return "vGPU" + case FE_SWITCH: + return "NvSwitch" + case FE_GPU_I: + return "GPU Instance" + case FE_GPU_CI: + return "GPU Compute Instance" + case FE_LINK: + return "NvLink" + case FE_CPU: + return "CPU" + case FE_CPU_CORE: + return "CPU Core" + } + return "unknown" +} + +type GroupEntityPair struct { + EntityGroupId Field_Entity_Group + EntityId uint +} + +type MigEntityInfo struct { + GpuUuid string + NvmlGpuIndex uint + NvmlInstanceId uint + NvmlComputeInstanceId uint + NvmlMigProfileId uint + NvmlProfileSlices uint +} + +type MigHierarchyInfo_v2 struct { + Entity GroupEntityPair + Parent GroupEntityPair + Info MigEntityInfo +} + +const ( + MAX_NUM_DEVICES uint = C.DCGM_MAX_NUM_DEVICES + MAX_HIERARCHY_INFO uint = C.DCGM_MAX_HIERARCHY_INFO +) + +type MigHierarchy_v2 struct { + Version uint + Count uint + EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2 +} + +func 
GetGpuInstanceHierarchy() (hierarchy MigHierarchy_v2, err error) { + var c_hierarchy C.dcgmMigHierarchy_v2 + c_hierarchy.version = C.dcgmMigHierarchy_version2 + ptr_hierarchy := (*C.dcgmMigHierarchy_v2)(unsafe.Pointer(&c_hierarchy)) + result := C.dcgmGetGpuInstanceHierarchy(handle.handle, ptr_hierarchy) + + if err = errorString(result); err != nil { + return toMigHierarchy(c_hierarchy), fmt.Errorf("Error retrieving DCGM MIG hierarchy: %s", err) + } + + return toMigHierarchy(c_hierarchy), nil +} + +func toMigHierarchy(c_hierarchy C.dcgmMigHierarchy_v2) MigHierarchy_v2 { + var hierarchy MigHierarchy_v2 + hierarchy.Version = uint(c_hierarchy.version) + hierarchy.Count = uint(c_hierarchy.count) + for i := uint(0); i < hierarchy.Count; i++ { + hierarchy.EntityList[i] = MigHierarchyInfo_v2{ + Entity: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].entity.entityGroupId), uint(c_hierarchy.entityList[i].entity.entityId)}, + Parent: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].parent.entityGroupId), uint(c_hierarchy.entityList[i].parent.entityId)}, + Info: MigEntityInfo{ + GpuUuid: *stringPtr(&c_hierarchy.entityList[i].info.gpuUuid[0]), + NvmlGpuIndex: uint(c_hierarchy.entityList[i].info.nvmlGpuIndex), + NvmlInstanceId: uint(c_hierarchy.entityList[i].info.nvmlInstanceId), + NvmlComputeInstanceId: uint(c_hierarchy.entityList[i].info.nvmlComputeInstanceId), + NvmlMigProfileId: uint(c_hierarchy.entityList[i].info.nvmlMigProfileId), + NvmlProfileSlices: uint(c_hierarchy.entityList[i].info.nvmlProfileSlices), + }, + } + } + + return hierarchy +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/nvml.h b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/nvml.h new file mode 100644 index 0000000000..6f4e8a898f --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/nvml.h @@ -0,0 +1,9459 @@ +/* + * Copyright 1993-2022 NVIDIA Corporation. All rights reserved. 
+ * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +/* +NVML API Reference + +The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and +managing various states within NVIDIA Tesla &tm; GPUs. 
It is intended to be a platform for building +3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi +tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads. + +API Documentation + +Supported platforms: +- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit +- Linux: 32-bit and 64-bit +- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5 + +Supported products: +- Full Support + - All Tesla products, starting with the Fermi architecture + - All Quadro products, starting with the Fermi architecture + - All vGPU Software products, starting with the Kepler architecture + - Selected GeForce Titan products +- Limited Support + - All Geforce products, starting with the Fermi architecture + +The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is +not be added to the system path by default. To dynamically link to NVML, add this path to the PATH +environmental variable. To dynamically load NVML, call LoadLibrary with this path. + +On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit +and 64 bit NVML libraries will be installed. 
+ +Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html +*/ + +#ifndef __nvml_nvml_h__ +#define __nvml_nvml_h__ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * On Windows, set up methods for DLL export + * define NVML_STATIC_IMPORT when using nvml_loader library + */ +#if defined _WINDOWS + #if !defined NVML_STATIC_IMPORT + #if defined NVML_LIB_EXPORT + #define DECLDIR __declspec(dllexport) + #else + #define DECLDIR __declspec(dllimport) + #endif + #else + #define DECLDIR + #endif +#else + #define DECLDIR +#endif + +/** + * NVML API versioning support + */ +#define NVML_API_VERSION 11 +#define NVML_API_VERSION_STR "11" +/** + * Defining NVML_NO_UNVERSIONED_FUNC_DEFS will disable "auto upgrading" of APIs. + * e.g. the user will have to call nvmlInit_v2 instead of nvmlInit. Enable this + * guard if you need to support older versions of the API + */ +#ifndef NVML_NO_UNVERSIONED_FUNC_DEFS + #define nvmlInit nvmlInit_v2 + #define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 + #define nvmlDeviceGetCount nvmlDeviceGetCount_v2 + #define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 + #define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 + #define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 + #define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2 + #define nvmlDeviceGetGridLicensableFeatures nvmlDeviceGetGridLicensableFeatures_v4 + #define nvmlEventSetWait nvmlEventSetWait_v2 + #define nvmlDeviceGetAttributes nvmlDeviceGetAttributes_v2 + #define nvmlComputeInstanceGetInfo nvmlComputeInstanceGetInfo_v2 + #define nvmlDeviceGetComputeRunningProcesses nvmlDeviceGetComputeRunningProcesses_v3 + #define nvmlDeviceGetGraphicsRunningProcesses nvmlDeviceGetGraphicsRunningProcesses_v3 + #define nvmlDeviceGetMPSComputeRunningProcesses nvmlDeviceGetMPSComputeRunningProcesses_v3 + #define nvmlBlacklistDeviceInfo_t nvmlExcludedDeviceInfo_t + #define nvmlGetBlacklistDeviceCount 
nvmlGetExcludedDeviceCount + #define nvmlGetBlacklistDeviceInfoByIndex nvmlGetExcludedDeviceInfoByIndex + #define nvmlDeviceGetGpuInstancePossiblePlacements nvmlDeviceGetGpuInstancePossiblePlacements_v2 + #define nvmlVgpuInstanceGetLicenseInfo nvmlVgpuInstanceGetLicenseInfo_v2 +#endif // #ifndef NVML_NO_UNVERSIONED_FUNC_DEFS + +#define NVML_STRUCT_VERSION(data, ver) (unsigned int)(sizeof(nvml ## data ## _v ## ver ## _t) | \ + (ver << 24U)) + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceStructs Device Structs + * @{ + */ +/***************************************************************************************************/ + +/** + * Special constant that some fields take when they are not available. + * Used when only part of the struct is not available. + * + * Each structure explicitly states when to check for this value. + */ +#define NVML_VALUE_NOT_AVAILABLE (-1) + +typedef struct nvmlDevice_st* nvmlDevice_t; + +/** + * Buffer size guaranteed to be large enough for pci bus id + */ +#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32 + +/** + * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy + */ +#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16 + +/** + * PCI information about a GPU device. 
+ */ +typedef struct nvmlPciInfo_st +{ + char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (& NULL terminator) + unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff + unsigned int bus; //!< The bus on which the device resides, 0 to 0xff + unsigned int device; //!< The device's id on the bus, 0 to 31 + unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id + + // Added in NVML 2.285 API + unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID + + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) +} nvmlPciInfo_t; + +/** + * PCI format string for ::busIdLegacy + */ +#define NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT "%04X:%02X:%02X.0" + +/** + * PCI format string for ::busId + */ +#define NVML_DEVICE_PCI_BUS_ID_FMT "%08X:%02X:%02X.0" + +/** + * Utility macro for filling the pci bus id format from a nvmlPciInfo_t + */ +#define NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(pciInfo) (pciInfo)->domain, \ + (pciInfo)->bus, \ + (pciInfo)->device + +/** + * Detailed ECC error counts for a device. + * + * @deprecated Different GPU families can have different memory error counters + * See \ref nvmlDeviceGetMemoryErrorCounter + */ +typedef struct nvmlEccErrorCounts_st +{ + unsigned long long l1Cache; //!< L1 cache errors + unsigned long long l2Cache; //!< L2 cache errors + unsigned long long deviceMemory; //!< Device memory errors + unsigned long long registerFile; //!< Register file errors +} nvmlEccErrorCounts_t; + +/** + * Utilization information for a device. + * Each sample period may be between 1 second and 1/6 second, depending on the product being queried. 
+ */ +typedef struct nvmlUtilization_st +{ + unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU + unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written +} nvmlUtilization_t; + +/** + * Memory allocation information for a device (v1). + * The total amount is equal to the sum of the amounts of free and used memory. + */ +typedef struct nvmlMemory_st +{ + unsigned long long total; //!< Total physical device memory (in bytes) + unsigned long long free; //!< Unallocated device memory (in bytes) + unsigned long long used; //!< Sum of Reserved and Allocated device memory (in bytes). + //!< Note that the driver/GPU always sets aside a small amount of memory for bookkeeping +} nvmlMemory_t; + +/** + * Memory allocation information for a device (v2). + * + * Version 2 adds versioning for the struct and the amount of system-reserved memory as an output. + * @note The \ref nvmlMemory_v2_t.used amount also includes the \ref nvmlMemory_v2_t.reserved amount. + */ +typedef struct nvmlMemory_v2_st +{ + unsigned int version; //!< Structure format version (must be 2) + unsigned long long total; //!< Total physical device memory (in bytes) + unsigned long long reserved; //!< Device memory (in bytes) reserved for system use (driver or firmware) + unsigned long long free; //!< Unallocated device memory (in bytes) + unsigned long long used; //!< Allocated device memory (in bytes). 
Note that the driver/GPU always sets aside a small amount of memory for bookkeeping +} nvmlMemory_v2_t; + +#define nvmlMemory_v2 NVML_STRUCT_VERSION(Memory, 2) + +/** + * BAR1 Memory allocation Information for a device + */ +typedef struct nvmlBAR1Memory_st +{ + unsigned long long bar1Total; //!< Total BAR1 Memory (in bytes) + unsigned long long bar1Free; //!< Unallocated BAR1 Memory (in bytes) + unsigned long long bar1Used; //!< Allocated Used Memory (in bytes) +}nvmlBAR1Memory_t; + +/** + * Information about running compute processes on the GPU, legacy version + * for older versions of the API. + */ +typedef struct nvmlProcessInfo_v1_st +{ + unsigned int pid; //!< Process ID + unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. + //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported + //! because Windows KMD manages all the memory and not the NVIDIA driver +} nvmlProcessInfo_v1_t; + +/** + * Information about running compute processes on the GPU + */ +typedef struct nvmlProcessInfo_v2_st +{ + unsigned int pid; //!< Process ID + unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. + //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported + //! because Windows KMD manages all the memory and not the NVIDIA driver + unsigned int gpuInstanceId; //!< If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is set to + // 0xFFFFFFFF otherwise. + unsigned int computeInstanceId; //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to + // 0xFFFFFFFF otherwise. +} nvmlProcessInfo_v2_t; + +/** + * Information about running compute processes on the GPU + * Version 2 adds versioning for the struct and the conf compute protected memory in output. + */ +typedef struct nvmlProcessInfo_st +{ + unsigned int pid; //!< Process ID + unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. + //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported + //! 
because Windows KMD manages all the memory and not the NVIDIA driver + unsigned int gpuInstanceId; //!< If MIG is enabled, stores a valid GPU instance ID. gpuInstanceId is set to + // 0xFFFFFFFF otherwise. + unsigned int computeInstanceId; //!< If MIG is enabled, stores a valid compute instance ID. computeInstanceId is set to + // 0xFFFFFFFF otherwise. + unsigned long long usedGpuCcProtectedMemory; //!< Amount of used GPU conf compute protected memory in bytes. +} nvmlProcessInfo_t; + +typedef struct nvmlDeviceAttributes_st +{ + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count + unsigned int sharedDecoderCount; //!< Shared Decoder Engine count + unsigned int sharedEncoderCount; //!< Shared Encoder Engine count + unsigned int sharedJpegCount; //!< Shared JPEG Engine count + unsigned int sharedOfaCount; //!< Shared OFA Engine count + unsigned int gpuInstanceSliceCount; //!< GPU instance slice count + unsigned int computeInstanceSliceCount; //!< Compute instance slice count + unsigned long long memorySizeMB; //!< Device memory size (in MiB) +} nvmlDeviceAttributes_t; + +/** + * Possible values that classify the remap availability for each bank. The max + * field will contain the number of banks that have maximum remap availability + * (all reserved rows are available). None means that there are no reserved + * rows available. 
+ */ +typedef struct nvmlRowRemapperHistogramValues_st +{ + unsigned int max; + unsigned int high; + unsigned int partial; + unsigned int low; + unsigned int none; +} nvmlRowRemapperHistogramValues_t; + +/** + * Enum to represent type of bridge chip + */ +typedef enum nvmlBridgeChipType_enum +{ + NVML_BRIDGE_CHIP_PLX = 0, + NVML_BRIDGE_CHIP_BRO4 = 1 +}nvmlBridgeChipType_t; + +/** + * Maximum number of NvLink links supported + */ +#define NVML_NVLINK_MAX_LINKS 18 + +/** + * Enum to represent the NvLink utilization counter packet units + */ +typedef enum nvmlNvLinkUtilizationCountUnits_enum +{ + NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles + NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets + NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes + NVML_NVLINK_COUNTER_UNIT_RESERVED = 3, // count reserved for internal use + // this must be last + NVML_NVLINK_COUNTER_UNIT_COUNT +} nvmlNvLinkUtilizationCountUnits_t; + +/** + * Enum to represent the NvLink utilization counter packet types to count + * ** this is ONLY applicable with the units as packets or bytes + * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t + * ** all packet filter descriptions are target GPU centric + * ** these can be "OR'd" together + */ +typedef enum nvmlNvLinkUtilizationCountPktTypes_enum +{ + NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1, // no operation packets + NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2, // read packets + NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4, // write packets + NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8, // reduction atomic requests + NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10, // non-reduction atomic requests + NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20, // flush requests + NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40, // responses with data + NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80, // responses without data + NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets +} nvmlNvLinkUtilizationCountPktTypes_t; + +/** + * Struct to define the 
NVLINK counter controls + */ +typedef struct nvmlNvLinkUtilizationControl_st +{ + nvmlNvLinkUtilizationCountUnits_t units; + nvmlNvLinkUtilizationCountPktTypes_t pktfilter; +} nvmlNvLinkUtilizationControl_t; + +/** + * Enum to represent NvLink queryable capabilities + */ +typedef enum nvmlNvLinkCapability_enum +{ + NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported + NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported + NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported + NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported + NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link + NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device + // should be last + NVML_NVLINK_CAP_COUNT +} nvmlNvLinkCapability_t; + +/** + * Enum to represent NvLink queryable error counters + */ +typedef enum nvmlNvLinkErrorCounter_enum +{ + NVML_NVLINK_ERROR_DL_REPLAY = 0, // Data link transmit replay error counter + NVML_NVLINK_ERROR_DL_RECOVERY = 1, // Data link transmit recovery error counter + NVML_NVLINK_ERROR_DL_CRC_FLIT = 2, // Data link receive flow control digit CRC error counter + NVML_NVLINK_ERROR_DL_CRC_DATA = 3, // Data link receive data CRC error counter + NVML_NVLINK_ERROR_DL_ECC_DATA = 4, // Data link receive data ECC error counter + + // this must be last + NVML_NVLINK_ERROR_COUNT +} nvmlNvLinkErrorCounter_t; + +/** + * Enum to represent NvLink's remote device type + */ +typedef enum nvmlIntNvLinkDeviceType_enum +{ + NVML_NVLINK_DEVICE_TYPE_GPU = 0x00, + NVML_NVLINK_DEVICE_TYPE_IBMNPU = 0x01, + NVML_NVLINK_DEVICE_TYPE_SWITCH = 0x02, + NVML_NVLINK_DEVICE_TYPE_UNKNOWN = 0xFF +} nvmlIntNvLinkDeviceType_t; + +/** + * Represents level relationships within a system between two GPUs + * The enums are spaced to allow for future relationships + */ +typedef enum nvmlGpuLevel_enum +{ + NVML_TOPOLOGY_INTERNAL = 0, // e.g. 
Tesla K80 + NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch + NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge + NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge + NVML_TOPOLOGY_NODE = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges + NVML_TOPOLOGY_SYSTEM = 50 // all devices in the system + + // there is purposefully no COUNT here because of the need for spacing above +} nvmlGpuTopologyLevel_t; + +/* Compatibility for CPU->NODE renaming */ +#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE + +/* P2P Capability Index Status*/ +typedef enum nvmlGpuP2PStatus_enum +{ + NVML_P2P_STATUS_OK = 0, + NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, + NVML_P2P_STATUS_GPU_NOT_SUPPORTED, + NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, + NVML_P2P_STATUS_DISABLED_BY_REGKEY, + NVML_P2P_STATUS_NOT_SUPPORTED, + NVML_P2P_STATUS_UNKNOWN + +} nvmlGpuP2PStatus_t; + +/* P2P Capability Index*/ +typedef enum nvmlGpuP2PCapsIndex_enum +{ + NVML_P2P_CAPS_INDEX_READ = 0, + NVML_P2P_CAPS_INDEX_WRITE, + NVML_P2P_CAPS_INDEX_NVLINK, + NVML_P2P_CAPS_INDEX_ATOMICS, + NVML_P2P_CAPS_INDEX_PROP, + NVML_P2P_CAPS_INDEX_UNKNOWN +}nvmlGpuP2PCapsIndex_t; + +/** + * Maximum limit on Physical Bridges per Board + */ +#define NVML_MAX_PHYSICAL_BRIDGE (128) + +/** + * Information about the Bridge Chip Firmware + */ +typedef struct nvmlBridgeChipInfo_st +{ + nvmlBridgeChipType_t type; //!< Type of Bridge Chip + unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable +}nvmlBridgeChipInfo_t; + +/** + * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate + * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. 
+ */ +typedef struct nvmlBridgeChipHierarchy_st +{ + unsigned char bridgeCount; //!< Number of Bridge Chips on the Board + nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board +}nvmlBridgeChipHierarchy_t; + +/** + * Represents Type of Sampling Event + */ +typedef enum nvmlSamplingType_enum +{ + NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU + NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU + NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written + NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy + NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy + NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples + NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples + + // Keep this last + NVML_SAMPLINGTYPE_COUNT +}nvmlSamplingType_t; + +/** + * Represents the queryable PCIe utilization counters + */ +typedef enum nvmlPcieUtilCounter_enum +{ + NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity + NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity + + // Keep this last + NVML_PCIE_UTIL_COUNT +} nvmlPcieUtilCounter_t; + +/** + * Represents the type for sample value returned + */ +typedef enum nvmlValueType_enum +{ + NVML_VALUE_TYPE_DOUBLE = 0, + NVML_VALUE_TYPE_UNSIGNED_INT = 1, + NVML_VALUE_TYPE_UNSIGNED_LONG = 2, + NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, + NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, + + // Keep this last + NVML_VALUE_TYPE_COUNT +}nvmlValueType_t; + + +/** + * Union to represent different types of Value + */ +typedef union nvmlValue_st +{ + double dVal; //!< If the value is double + unsigned int uiVal; //!< If the value is unsigned int + unsigned long ulVal; //!< If the value is unsigned long + unsigned long 
long ullVal; //!< If the value is unsigned long long + signed long long sllVal; //!< If the value is signed long long +}nvmlValue_t; + +/** + * Information for Sample + */ +typedef struct nvmlSample_st +{ + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + nvmlValue_t sampleValue; //!< Sample Value +}nvmlSample_t; + +/** + * Represents type of perf policy for which violation times can be queried + */ +typedef enum nvmlPerfPolicyType_enum +{ + NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks + NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks + NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks + NVML_PERF_POLICY_BOARD_LIMIT = 3, //!< How long did the board limit cause the GPU to be below application clocks + NVML_PERF_POLICY_LOW_UTILIZATION = 4, //!< How long did low utilization cause the GPU to be below application clocks + NVML_PERF_POLICY_RELIABILITY = 5, //!< How long did the board reliability limit cause the GPU to be below application clocks + + NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above) + NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks + + // Keep this last + NVML_PERF_POLICY_COUNT +}nvmlPerfPolicyType_t; + +/** + * Struct to hold perf policy violation status data + */ +typedef struct nvmlViolationTime_st +{ + unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds + unsigned long long violationTime; //!< violationTime in Nanoseconds +}nvmlViolationTime_t; + +#define NVML_MAX_THERMAL_SENSORS_PER_GPU 3 + +typedef enum +{ + NVML_THERMAL_TARGET_NONE = 0, + NVML_THERMAL_TARGET_GPU = 1, //!< GPU core temperature requires NvPhysicalGpuHandle + NVML_THERMAL_TARGET_MEMORY = 2, //!< GPU memory temperature requires 
NvPhysicalGpuHandle + NVML_THERMAL_TARGET_POWER_SUPPLY = 4, //!< GPU power supply temperature requires NvPhysicalGpuHandle + NVML_THERMAL_TARGET_BOARD = 8, //!< GPU board ambient temperature requires NvPhysicalGpuHandle + NVML_THERMAL_TARGET_VCD_BOARD = 9, //!< Visual Computing Device Board temperature requires NvVisualComputingDeviceHandle + NVML_THERMAL_TARGET_VCD_INLET = 10, //!< Visual Computing Device Inlet temperature requires NvVisualComputingDeviceHandle + NVML_THERMAL_TARGET_VCD_OUTLET = 11, //!< Visual Computing Device Outlet temperature requires NvVisualComputingDeviceHandle + + NVML_THERMAL_TARGET_ALL = 15, + NVML_THERMAL_TARGET_UNKNOWN = -1, +} nvmlThermalTarget_t; + +typedef enum +{ + NVML_THERMAL_CONTROLLER_NONE = 0, + NVML_THERMAL_CONTROLLER_GPU_INTERNAL, + NVML_THERMAL_CONTROLLER_ADM1032, + NVML_THERMAL_CONTROLLER_ADT7461, + NVML_THERMAL_CONTROLLER_MAX6649, + NVML_THERMAL_CONTROLLER_MAX1617, + NVML_THERMAL_CONTROLLER_LM99, + NVML_THERMAL_CONTROLLER_LM89, + NVML_THERMAL_CONTROLLER_LM64, + NVML_THERMAL_CONTROLLER_G781, + NVML_THERMAL_CONTROLLER_ADT7473, + NVML_THERMAL_CONTROLLER_SBMAX6649, + NVML_THERMAL_CONTROLLER_VBIOSEVT, + NVML_THERMAL_CONTROLLER_OS, + NVML_THERMAL_CONTROLLER_NVSYSCON_CANOAS, + NVML_THERMAL_CONTROLLER_NVSYSCON_E551, + NVML_THERMAL_CONTROLLER_MAX6649R, + NVML_THERMAL_CONTROLLER_ADT7473S, + NVML_THERMAL_CONTROLLER_UNKNOWN = -1, +} nvmlThermalController_t; + +typedef struct +{ + unsigned int count; + struct + { + nvmlThermalController_t controller; + unsigned int defaultMinTemp; + unsigned int defaultMaxTemp; + unsigned int currentTemp; + nvmlThermalTarget_t target; + } sensor[NVML_MAX_THERMAL_SENSORS_PER_GPU]; + +} nvmlGpuThermalSettings_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceEnumvs Device Enums + * @{ + */ +/***************************************************************************************************/ + +/** + * Generic 
enable/disable enum. + */ +typedef enum nvmlEnableState_enum +{ + NVML_FEATURE_DISABLED = 0, //!< Feature disabled + NVML_FEATURE_ENABLED = 1 //!< Feature enabled +} nvmlEnableState_t; + +//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details. +#define nvmlFlagDefault 0x00 +//! Generic flag used to force some behavior. See description of particular functions for details. +#define nvmlFlagForce 0x01 + +/** + * * The Brand of the GPU + * */ +typedef enum nvmlBrandType_enum +{ + NVML_BRAND_UNKNOWN = 0, + NVML_BRAND_QUADRO = 1, + NVML_BRAND_TESLA = 2, + NVML_BRAND_NVS = 3, + NVML_BRAND_GRID = 4, // Deprecated from API reporting. Keeping definition for backward compatibility. + NVML_BRAND_GEFORCE = 5, + NVML_BRAND_TITAN = 6, + NVML_BRAND_NVIDIA_VAPPS = 7, // NVIDIA Virtual Applications + NVML_BRAND_NVIDIA_VPC = 8, // NVIDIA Virtual PC + NVML_BRAND_NVIDIA_VCS = 9, // NVIDIA Virtual Compute Server + NVML_BRAND_NVIDIA_VWS = 10, // NVIDIA RTX Virtual Workstation + NVML_BRAND_NVIDIA_CLOUD_GAMING = 11, // NVIDIA Cloud Gaming + NVML_BRAND_NVIDIA_VGAMING = NVML_BRAND_NVIDIA_CLOUD_GAMING, // Deprecated from API reporting. Keeping definition for backward compatibility. + NVML_BRAND_QUADRO_RTX = 12, + NVML_BRAND_NVIDIA_RTX = 13, + NVML_BRAND_NVIDIA = 14, + NVML_BRAND_GEFORCE_RTX = 15, // Unused + NVML_BRAND_TITAN_RTX = 16, // Unused + + // Keep this last + NVML_BRAND_COUNT +} nvmlBrandType_t; + +/** + * Temperature thresholds. 
+ */ +typedef enum nvmlTemperatureThresholds_enum +{ + NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will + // shut down for HW protection + NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will + // begin HW slowdown + NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, // Memory Temperature at which the GPU will + // begin SW slowdown + NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, // GPU Temperature at which the GPU + // can be throttled below base clock + NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MIN = 4, // Minimum GPU Temperature that can be + // set as acoustic threshold + NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_CURR = 5, // Current temperature that is set as + // acoustic threshold. + NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MAX = 6, // Maximum GPU temperature that can be + // set as acoustic threshold. + // Keep this last + NVML_TEMPERATURE_THRESHOLD_COUNT +} nvmlTemperatureThresholds_t; + +/** + * Temperature sensors. + */ +typedef enum nvmlTemperatureSensors_enum +{ + NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die + + // Keep this last + NVML_TEMPERATURE_COUNT +} nvmlTemperatureSensors_t; + +/** + * Compute mode. + * + * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. + * Earlier CUDA versions supported a single exclusive mode, + * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. 
+ */ +typedef enum nvmlComputeMode_enum +{ + NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device + NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed + NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device + NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time + + // Keep this last + NVML_COMPUTEMODE_COUNT +} nvmlComputeMode_t; + +/** + * Max Clock Monitors available + */ +#define MAX_CLK_DOMAINS 32 + +/** + * Clock Monitor error types + */ +typedef struct nvmlClkMonFaultInfo_struct { + /** + * The Domain which faulted + */ + unsigned int clkApiDomain; + + /** + * Faults Information + */ + unsigned int clkDomainFaultMask; +} nvmlClkMonFaultInfo_t; + +/** + * Clock Monitor Status + */ +typedef struct nvmlClkMonStatus_status { + /** + * Fault status Indicator + */ + unsigned int bGlobalStatus; + + /** + * Total faulted domain numbers + */ + unsigned int clkMonListSize; + + /** + * The fault Information structure + */ + nvmlClkMonFaultInfo_t clkMonList[MAX_CLK_DOMAINS]; +} nvmlClkMonStatus_t; + +/** + * ECC bit types. 
+ * + * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type + */ +#define nvmlEccBitType_t nvmlMemoryErrorType_t + +/** + * Single bit ECC errors + * + * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED + */ +#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED + +/** + * Double bit ECC errors + * + * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED + */ +#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED + +/** + * Memory error types + */ +typedef enum nvmlMemoryErrorType_enum +{ + /** + * A memory error that was corrected + * + * For ECC errors, these are single bit errors + * For Texture memory, these are errors fixed by resend + */ + NVML_MEMORY_ERROR_TYPE_CORRECTED = 0, + /** + * A memory error that was not corrected + * + * For ECC errors, these are double bit errors + * For Texture memory, these are errors where the resend fails + */ + NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1, + + + // Keep this last + NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types + +} nvmlMemoryErrorType_t; + +/** + * ECC counter types. + * + * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent. + * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver + * client active (e.g. X11), then Linux also sees per-boot behavior. If not, volatile counts are reset each time a compute app + * is run. + */ +typedef enum nvmlEccCounterType_enum +{ + NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads. + NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device) + + // Keep this last + NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types +} nvmlEccCounterType_t; + +/** + * Clock types. + * + * All speeds are in Mhz. 
+ */ +typedef enum nvmlClockType_enum +{ + NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain + NVML_CLOCK_SM = 1, //!< SM clock domain + NVML_CLOCK_MEM = 2, //!< Memory clock domain + NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain + + // Keep this last + NVML_CLOCK_COUNT //!< Count of clock types +} nvmlClockType_t; + +/** + * Clock Ids. These are used in combination with nvmlClockType_t + * to specify a single clock value. + */ +typedef enum nvmlClockId_enum +{ + NVML_CLOCK_ID_CURRENT = 0, //!< Current actual clock value + NVML_CLOCK_ID_APP_CLOCK_TARGET = 1, //!< Target application clock + NVML_CLOCK_ID_APP_CLOCK_DEFAULT = 2, //!< Default application clock target + NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3, //!< OEM-defined maximum clock rate + + //Keep this last + NVML_CLOCK_ID_COUNT //!< Count of Clock Ids. +} nvmlClockId_t; + +/** + * Driver models. + * + * Windows only. + */ + +typedef enum nvmlDriverModel_enum +{ + NVML_DRIVER_WDDM = 0, //!< WDDM driver model -- GPU treated as a display device + NVML_DRIVER_WDM = 1 //!< WDM (TCC) model (recommended) -- GPU treated as a generic device +} nvmlDriverModel_t; + +#define NVML_MAX_GPU_PERF_PSTATES 16 + +/** + * Allowed PStates. 
+ */ +typedef enum nvmlPStates_enum +{ + NVML_PSTATE_0 = 0, //!< Performance state 0 -- Maximum Performance + NVML_PSTATE_1 = 1, //!< Performance state 1 + NVML_PSTATE_2 = 2, //!< Performance state 2 + NVML_PSTATE_3 = 3, //!< Performance state 3 + NVML_PSTATE_4 = 4, //!< Performance state 4 + NVML_PSTATE_5 = 5, //!< Performance state 5 + NVML_PSTATE_6 = 6, //!< Performance state 6 + NVML_PSTATE_7 = 7, //!< Performance state 7 + NVML_PSTATE_8 = 8, //!< Performance state 8 + NVML_PSTATE_9 = 9, //!< Performance state 9 + NVML_PSTATE_10 = 10, //!< Performance state 10 + NVML_PSTATE_11 = 11, //!< Performance state 11 + NVML_PSTATE_12 = 12, //!< Performance state 12 + NVML_PSTATE_13 = 13, //!< Performance state 13 + NVML_PSTATE_14 = 14, //!< Performance state 14 + NVML_PSTATE_15 = 15, //!< Performance state 15 -- Minimum Performance + NVML_PSTATE_UNKNOWN = 32 //!< Unknown performance state +} nvmlPstates_t; + +/** + * GPU Operation Mode + * + * GOM allows to reduce power usage and optimize GPU throughput by disabling GPU features. + * + * Each GOM is designed to meet specific user needs. + */ +typedef enum nvmlGom_enum +{ + NVML_GOM_ALL_ON = 0, //!< Everything is enabled and running at full speed + + NVML_GOM_COMPUTE = 1, //!< Designed for running only compute tasks. Graphics operations + //!< are not allowed + + NVML_GOM_LOW_DP = 2 //!< Designed for running graphics applications that don't require + //!< high bandwidth double precision +} nvmlGpuOperationMode_t; + +/** + * Available infoROM objects. + */ +typedef enum nvmlInforomObject_enum +{ + NVML_INFOROM_OEM = 0, //!< An object defined by OEM + NVML_INFOROM_ECC = 1, //!< The ECC object determining the level of ECC support + NVML_INFOROM_POWER = 2, //!< The power management object + + // Keep this last + NVML_INFOROM_COUNT //!< This counts the number of infoROM objects the driver knows about +} nvmlInforomObject_t; + +/** + * Return values for NVML API calls. 
+ */ +typedef enum nvmlReturn_enum +{ + // cppcheck-suppress * + NVML_SUCCESS = 0, //!< The operation was successful + NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() + NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid + NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device + NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation + NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting + NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful + NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough + NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached + NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded + NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed + NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU + NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded + NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function + NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted + NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible + NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again + NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups + NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch + NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use + NVML_ERROR_MEMORY = 20, //!< Insufficient memory + NVML_ERROR_NO_DATA = 21, //!< No data + NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22, //!< The requested vgpu operation is not available on target 
device, becasue ECC is enabled + NVML_ERROR_INSUFFICIENT_RESOURCES = 23, //!< Ran out of critical resources, other than memory + NVML_ERROR_FREQ_NOT_SUPPORTED = 24, //!< Ran out of critical resources, other than memory + NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25, //!< The provided version is invalid/unsupported + NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred +} nvmlReturn_t; + +/** + * See \ref nvmlDeviceGetMemoryErrorCounter + */ +typedef enum nvmlMemoryLocation_enum +{ + NVML_MEMORY_LOCATION_L1_CACHE = 0, //!< GPU L1 Cache + NVML_MEMORY_LOCATION_L2_CACHE = 1, //!< GPU L2 Cache + NVML_MEMORY_LOCATION_DRAM = 2, //!< Turing+ DRAM + NVML_MEMORY_LOCATION_DEVICE_MEMORY = 2, //!< GPU Device Memory + NVML_MEMORY_LOCATION_REGISTER_FILE = 3, //!< GPU Register File + NVML_MEMORY_LOCATION_TEXTURE_MEMORY = 4, //!< GPU Texture Memory + NVML_MEMORY_LOCATION_TEXTURE_SHM = 5, //!< Shared memory + NVML_MEMORY_LOCATION_CBU = 6, //!< CBU + NVML_MEMORY_LOCATION_SRAM = 7, //!< Turing+ SRAM + // Keep this last + NVML_MEMORY_LOCATION_COUNT //!< This counts the number of memory locations the driver knows about +} nvmlMemoryLocation_t; + +/** + * Causes for page retirement + */ +typedef enum nvmlPageRetirementCause_enum +{ + NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0, //!< Page was retired due to multiple single bit ECC error + NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1, //!< Page was retired due to double bit ECC error + + // Keep this last + NVML_PAGE_RETIREMENT_CAUSE_COUNT +} nvmlPageRetirementCause_t; + +/** + * API types that allow changes to default permission restrictions + */ +typedef enum nvmlRestrictedAPI_enum +{ + NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0, //!< APIs that change application clocks, see nvmlDeviceSetApplicationsClocks + //!< and see nvmlDeviceResetApplicationsClocks + NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1, //!< APIs that enable/disable Auto Boosted clocks + //!< see nvmlDeviceSetAutoBoostedClocksEnabled 
+ // Keep this last + NVML_RESTRICTED_API_COUNT +} nvmlRestrictedAPI_t; + +/** @} */ + +/***************************************************************************************************/ +/** @addtogroup virtualGPU + * @{ + */ +/***************************************************************************************************/ +/** @defgroup nvmlVirtualGpuEnums vGPU Enums + * @{ + */ +/***************************************************************************************************/ + +/*! + * GPU virtualization mode types. + */ +typedef enum nvmlGpuVirtualizationMode { + NVML_GPU_VIRTUALIZATION_MODE_NONE = 0, //!< Represents Bare Metal GPU + NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1, //!< Device is associated with GPU-Passthorugh + NVML_GPU_VIRTUALIZATION_MODE_VGPU = 2, //!< Device is associated with vGPU inside virtual machine. + NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3, //!< Device is associated with VGX hypervisor in vGPU mode + NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4 //!< Device is associated with VGX hypervisor in vSGA mode +} nvmlGpuVirtualizationMode_t; + +/** + * Host vGPU modes + */ +typedef enum nvmlHostVgpuMode_enum +{ + NVML_HOST_VGPU_MODE_NON_SRIOV = 0, //!< Non SR-IOV mode + NVML_HOST_VGPU_MODE_SRIOV = 1 //!< SR-IOV mode +} nvmlHostVgpuMode_t; + +/*! 
+ * Types of VM identifiers + */ +typedef enum nvmlVgpuVmIdType { + NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID + NVML_VGPU_VM_ID_UUID = 1 //!< VM ID represents UUID +} nvmlVgpuVmIdType_t; + +/** + * vGPU GUEST info state + */ +typedef enum nvmlVgpuGuestInfoState_enum +{ + NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //!< Guest-dependent fields uninitialized + NVML_VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED = 1 //!< Guest-dependent fields initialized +} nvmlVgpuGuestInfoState_t; + +/** + * vGPU software licensable features + */ +typedef enum { + NVML_GRID_LICENSE_FEATURE_CODE_UNKNOWN = 0, //!< Unknown + NVML_GRID_LICENSE_FEATURE_CODE_VGPU = 1, //!< Virtual GPU + NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX = 2, //!< Nvidia RTX + NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX, //!< Deprecated, do not use. + NVML_GRID_LICENSE_FEATURE_CODE_GAMING = 3, //!< Gaming + NVML_GRID_LICENSE_FEATURE_CODE_COMPUTE = 4 //!< Compute +} nvmlGridLicenseFeatureCode_t; + +/** + * Status codes for license expiry + */ +#define NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE 0 //!< Expiry information not available +#define NVML_GRID_LICENSE_EXPIRY_INVALID 1 //!< Invalid expiry or error fetching expiry +#define NVML_GRID_LICENSE_EXPIRY_VALID 2 //!< Valid expiry +#define NVML_GRID_LICENSE_EXPIRY_NOT_APPLICABLE 3 //!< Expiry not applicable +#define NVML_GRID_LICENSE_EXPIRY_PERMANENT 4 //!< Permanent expiry + +/** + * vGPU queryable capabilities + */ +typedef enum nvmlVgpuCapability_enum +{ + NVML_VGPU_CAP_NVLINK_P2P = 0, //!< P2P over NVLink is supported + NVML_VGPU_CAP_GPUDIRECT = 1, //!< GPUDirect capability is supported + // Keep this last + NVML_VGPU_CAP_COUNT +} nvmlVgpuCapability_t; + +/** @} */ + +/***************************************************************************************************/ + +/** @defgroup nvmlVgpuConstants vGPU Constants + * @{ + */ 
+/***************************************************************************************************/ + +/** + * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense + */ +#define NVML_GRID_LICENSE_BUFFER_SIZE 128 + +#define NVML_VGPU_NAME_BUFFER_SIZE 64 + +#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3 + +#define INVALID_GPU_INSTANCE_PROFILE_ID 0xFFFFFFFF + +#define INVALID_GPU_INSTANCE_ID 0xFFFFFFFF + +/*! + * Macros for vGPU instance's virtualization capabilities bitfield. + */ +#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 +#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 +#define NVML_VGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 + +/*! + * Macros for pGPU's virtualization capabilities bitfield. + */ +#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 +#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 +#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlVgpuStructs vGPU Structs + * @{ + */ +/***************************************************************************************************/ + +typedef unsigned int nvmlVgpuTypeId_t; + +typedef unsigned int nvmlVgpuInstance_t; + +/** + * Structure to store Utilization Value and vgpuInstance + */ +typedef struct nvmlVgpuInstanceUtilizationSample_st +{ + nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + nvmlValue_t smUtil; //!< SM (3D/Compute) Util Value + nvmlValue_t memUtil; //!< Frame Buffer Memory Util Value + nvmlValue_t encUtil; //!< Encoder Util Value + nvmlValue_t decUtil; //!< Decoder Util Value +} nvmlVgpuInstanceUtilizationSample_t; + +/** + * Structure to store Utilization Value, vgpuInstance and subprocess information + */ +typedef struct nvmlVgpuProcessUtilizationSample_st +{ + nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance + unsigned int pid; 
//!< PID of process running within the vGPU VM + char processName[NVML_VGPU_NAME_BUFFER_SIZE]; //!< Name of process running within the vGPU VM + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + unsigned int smUtil; //!< SM (3D/Compute) Util Value + unsigned int memUtil; //!< Frame Buffer Memory Util Value + unsigned int encUtil; //!< Encoder Util Value + unsigned int decUtil; //!< Decoder Util Value +} nvmlVgpuProcessUtilizationSample_t; + +/** + * Structure to store the vGPU license expiry details + */ +typedef struct nvmlVgpuLicenseExpiry_st +{ + unsigned int year; //!< Year of license expiry + unsigned short month; //!< Month of license expiry + unsigned short day; //!< Day of license expiry + unsigned short hour; //!< Hour of license expiry + unsigned short min; //!< Minutes of license expiry + unsigned short sec; //!< Seconds of license expiry + unsigned char status; //!< License expiry status +} nvmlVgpuLicenseExpiry_t; + +/** + * vGPU license state + */ +#define NVML_GRID_LICENSE_STATE_UNKNOWN 0 //!< Unknown state +#define NVML_GRID_LICENSE_STATE_UNINITIALIZED 1 //!< Uninitialized state +#define NVML_GRID_LICENSE_STATE_UNLICENSED_UNRESTRICTED 2 //!< Unlicensed unrestricted state +#define NVML_GRID_LICENSE_STATE_UNLICENSED_RESTRICTED 3 //!< Unlicensed restricted state +#define NVML_GRID_LICENSE_STATE_UNLICENSED 4 //!< Unlicensed state +#define NVML_GRID_LICENSE_STATE_LICENSED 5 //!< Licensed state + +typedef struct nvmlVgpuLicenseInfo_st +{ + unsigned char isLicensed; //!< License status + nvmlVgpuLicenseExpiry_t licenseExpiry; //!< License expiry information + unsigned int currentState; //!< Current license state +} nvmlVgpuLicenseInfo_t; + +/** + * Structure to store utilization value and process Id + */ +typedef struct nvmlProcessUtilizationSample_st +{ + unsigned int pid; //!< PID of process + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + unsigned int smUtil; //!< SM (3D/Compute) Util Value + unsigned int memUtil; 
//!< Frame Buffer Memory Util Value + unsigned int encUtil; //!< Encoder Util Value + unsigned int decUtil; //!< Decoder Util Value +} nvmlProcessUtilizationSample_t; + +/** + * Structure to store license expiry date and time values + */ +typedef struct nvmlGridLicenseExpiry_st +{ + unsigned int year; //!< Year value of license expiry + unsigned short month; //!< Month value of license expiry + unsigned short day; //!< Day value of license expiry + unsigned short hour; //!< Hour value of license expiry + unsigned short min; //!< Minutes value of license expiry + unsigned short sec; //!< Seconds value of license expiry + unsigned char status; //!< License expiry status +} nvmlGridLicenseExpiry_t; + +/** + * Structure containing vGPU software licensable feature information + */ +typedef struct nvmlGridLicensableFeature_st +{ + nvmlGridLicenseFeatureCode_t featureCode; //!< Licensed feature code + unsigned int featureState; //!< Non-zero if feature is currently licensed, otherwise zero + char licenseInfo[NVML_GRID_LICENSE_BUFFER_SIZE]; //!< Deprecated. + char productName[NVML_GRID_LICENSE_BUFFER_SIZE]; //!< Product name of feature + unsigned int featureEnabled; //!< Non-zero if feature is enabled, otherwise zero + nvmlGridLicenseExpiry_t licenseExpiry; //!< License expiry structure containing date and time +} nvmlGridLicensableFeature_t; + +/** + * Structure to store vGPU software licensable features + */ +typedef struct nvmlGridLicensableFeatures_st +{ + int isGridLicenseSupported; //!< Non-zero if vGPU Software Licensing is supported on the system, otherwise zero + unsigned int licensableFeaturesCount; //!< Entries returned in \a gridLicensableFeatures array + nvmlGridLicensableFeature_t gridLicensableFeatures[NVML_GRID_LICENSE_FEATURE_MAX_COUNT]; //!< Array of vGPU software licensable features. 
+} nvmlGridLicensableFeatures_t; + +/** + * GSP firmware + */ +#define NVML_GSP_FIRMWARE_VERSION_BUF_SIZE 0x40 + +/** + * Simplified chip architecture + */ +#define NVML_DEVICE_ARCH_KEPLER 2 // Devices based on the NVIDIA Kepler architecture +#define NVML_DEVICE_ARCH_MAXWELL 3 // Devices based on the NVIDIA Maxwell architecture +#define NVML_DEVICE_ARCH_PASCAL 4 // Devices based on the NVIDIA Pascal architecture +#define NVML_DEVICE_ARCH_VOLTA 5 // Devices based on the NVIDIA Volta architecture +#define NVML_DEVICE_ARCH_TURING 6 // Devices based on the NVIDIA Turing architecture + +#define NVML_DEVICE_ARCH_AMPERE 7 // Devices based on the NVIDIA Ampere architecture + +#define NVML_DEVICE_ARCH_ADA 8 // Devices based on the NVIDIA Ada architecture + +#define NVML_DEVICE_ARCH_HOPPER 9 // Devices based on the NVIDIA Hopper architecture + +#define NVML_DEVICE_ARCH_UNKNOWN 0xffffffff // Anything else, presumably something newer + +typedef unsigned int nvmlDeviceArchitecture_t; + +/** + * PCI bus types + */ +#define NVML_BUS_TYPE_UNKNOWN 0 +#define NVML_BUS_TYPE_PCI 1 +#define NVML_BUS_TYPE_PCIE 2 +#define NVML_BUS_TYPE_FPCI 3 +#define NVML_BUS_TYPE_AGP 4 + +typedef unsigned int nvmlBusType_t; + +/** + * Device Power Modes + */ +#define NVML_POWER_MODE_ID_BALANCED 0 +#define NVML_POWER_MODE_ID_MAX 1 + +/** + * Device Power Source + */ +#define NVML_POWER_SOURCE_AC 0x00000000 +#define NVML_POWER_SOURCE_BATTERY 0x00000001 + +typedef unsigned int nvmlPowerSource_t; + +/* + * Device PCIE link Max Speed + */ +#define NVML_PCIE_LINK_MAX_SPEED_INVALID 0x00000000 +#define NVML_PCIE_LINK_MAX_SPEED_2500MBPS 0x00000001 +#define NVML_PCIE_LINK_MAX_SPEED_5000MBPS 0x00000002 +#define NVML_PCIE_LINK_MAX_SPEED_8000MBPS 0x00000003 +#define NVML_PCIE_LINK_MAX_SPEED_16000MBPS 0x00000004 +#define NVML_PCIE_LINK_MAX_SPEED_32000MBPS 0x00000005 + +/* + * Adaptive clocking status + */ +#define NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED 0x00000000 +#define 
NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED 0x00000001 + +#define NVML_MAX_GPU_UTILIZATIONS 8 +typedef enum nvmlGpuUtilizationDomainId_t +{ + NVML_GPU_UTILIZATION_DOMAIN_GPU = 0, //!< Graphics engine domain + NVML_GPU_UTILIZATION_DOMAIN_FB = 1, //!< Frame buffer domain + NVML_GPU_UTILIZATION_DOMAIN_VID = 2, //!< Video engine domain + NVML_GPU_UTILIZATION_DOMAIN_BUS = 3, //!< Bus interface domain +} nvmlGpuUtilizationDomainId_t; + +typedef struct nvmlGpuDynamicPstatesInfo_st +{ + unsigned int flags; //!< Reserved for future use + struct + { + unsigned int bIsPresent; //!< Set if this utilization domain is present on this GPU + unsigned int percentage; //!< Percentage of time where the domain is considered busy in the last 1-second interval + unsigned int incThreshold; //!< Utilization threshold that can trigger a perf-increasing P-State change when crossed + unsigned int decThreshold; //!< Utilization threshold that can trigger a perf-decreasing P-State change when crossed + } utilization[NVML_MAX_GPU_UTILIZATIONS]; +} nvmlGpuDynamicPstatesInfo_t; + +/** @} */ +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlFieldValueEnums Field Value Enums + * @{ + */ +/***************************************************************************************************/ + +/** + * Field Identifiers. + * + * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. + */ +#define NVML_FI_DEV_ECC_CURRENT 1 //!< Current ECC mode. 1=Active. 0=Inactive +#define NVML_FI_DEV_ECC_PENDING 2 //!< Pending ECC mode. 1=Active. 
0=Inactive +/* ECC Count Totals */ +#define NVML_FI_DEV_ECC_SBE_VOL_TOTAL 3 //!< Total single bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_TOTAL 4 //!< Total double bit volatile ECC errors +#define NVML_FI_DEV_ECC_SBE_AGG_TOTAL 5 //!< Total single bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_TOTAL 6 //!< Total double bit aggregate (persistent) ECC errors +/* Individual ECC locations */ +#define NVML_FI_DEV_ECC_SBE_VOL_L1 7 //!< L1 cache single bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_L1 8 //!< L1 cache double bit volatile ECC errors +#define NVML_FI_DEV_ECC_SBE_VOL_L2 9 //!< L2 cache single bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_L2 10 //!< L2 cache double bit volatile ECC errors +#define NVML_FI_DEV_ECC_SBE_VOL_DEV 11 //!< Device memory single bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_DEV 12 //!< Device memory double bit volatile ECC errors +#define NVML_FI_DEV_ECC_SBE_VOL_REG 13 //!< Register file single bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_REG 14 //!< Register file double bit volatile ECC errors +#define NVML_FI_DEV_ECC_SBE_VOL_TEX 15 //!< Texture memory single bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_TEX 16 //!< Texture memory double bit volatile ECC errors +#define NVML_FI_DEV_ECC_DBE_VOL_CBU 17 //!< CBU double bit volatile ECC errors +#define NVML_FI_DEV_ECC_SBE_AGG_L1 18 //!< L1 cache single bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_L1 19 //!< L1 cache double bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_SBE_AGG_L2 20 //!< L2 cache single bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_L2 21 //!< L2 cache double bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_SBE_AGG_DEV 22 //!< Device memory single bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_DEV 23 //!< Device memory double bit aggregate (persistent) ECC errors +#define 
NVML_FI_DEV_ECC_SBE_AGG_REG 24 //!< Register File single bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_REG 25 //!< Register File double bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_SBE_AGG_TEX 26 //!< Texture memory single bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_TEX 27 //!< Texture memory double bit aggregate (persistent) ECC errors +#define NVML_FI_DEV_ECC_DBE_AGG_CBU 28 //!< CBU double bit aggregate ECC errors + +/* Page Retirement */ +#define NVML_FI_DEV_RETIRED_SBE 29 //!< Number of retired pages because of single bit errors +#define NVML_FI_DEV_RETIRED_DBE 30 //!< Number of retired pages because of double bit errors +#define NVML_FI_DEV_RETIRED_PENDING 31 //!< If any pages are pending retirement. 1=yes. 0=no. + +/* NvLink Flit Error Counters */ +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 32 //!< NVLink flow control CRC Error Counter for Lane 0 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 33 //!< NVLink flow control CRC Error Counter for Lane 1 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 34 //!< NVLink flow control CRC Error Counter for Lane 2 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 35 //!< NVLink flow control CRC Error Counter for Lane 3 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 36 //!< NVLink flow control CRC Error Counter for Lane 4 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 37 //!< NVLink flow control CRC Error Counter for Lane 5 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 38 //!< NVLink flow control CRC Error Counter total for all Lanes + +/* NvLink CRC Data Error Counters */ +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 39 //!< NVLink data CRC Error Counter for Lane 0 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 40 //!< NVLink data CRC Error Counter for Lane 1 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 41 //!< NVLink data CRC Error Counter for Lane 2 +#define 
NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 42 //!< NVLink data CRC Error Counter for Lane 3 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 43 //!< NVLink data CRC Error Counter for Lane 4 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 44 //!< NVLink data CRC Error Counter for Lane 5 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 45 //!< NvLink data CRC Error Counter total for all Lanes + +/* NvLink Replay Error Counters */ +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 46 //!< NVLink Replay Error Counter for Lane 0 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 47 //!< NVLink Replay Error Counter for Lane 1 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 48 //!< NVLink Replay Error Counter for Lane 2 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 49 //!< NVLink Replay Error Counter for Lane 3 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 50 //!< NVLink Replay Error Counter for Lane 4 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 51 //!< NVLink Replay Error Counter for Lane 5 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 52 //!< NVLink Replay Error Counter total for all Lanes + +/* NvLink Recovery Error Counters */ +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 53 //!< NVLink Recovery Error Counter for Lane 0 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 54 //!< NVLink Recovery Error Counter for Lane 1 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 55 //!< NVLink Recovery Error Counter for Lane 2 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 56 //!< NVLink Recovery Error Counter for Lane 3 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 57 //!< NVLink Recovery Error Counter for Lane 4 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 58 //!< NVLink Recovery Error Counter for Lane 5 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 59 //!< NVLink Recovery Error Counter total for all Lanes + +/* NvLink Bandwidth Counters */ +/* + * NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are now 
deprecated. + * Please use the following field values instead: + * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX + * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX + * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX + * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX + */ +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L0 60 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 0 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L1 61 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 1 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L2 62 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 2 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L3 63 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 3 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L4 64 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 4 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L5 65 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 5 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_TOTAL 66 //!< NVLink Bandwidth Counter Total for Counter Set 0, All Lanes + +/* NvLink Bandwidth Counters */ +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L0 67 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 0 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L1 68 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 1 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L2 69 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 2 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L3 70 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 3 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L4 71 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 4 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L5 72 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 5 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_TOTAL 73 //!< NVLink Bandwidth Counter Total for Counter Set 1, All Lanes + +/* NVML Perf Policy Counters */ +#define NVML_FI_DEV_PERF_POLICY_POWER 74 //!< Perf Policy Counter for Power Policy +#define NVML_FI_DEV_PERF_POLICY_THERMAL 75 //!< Perf Policy Counter for Thermal Policy +#define NVML_FI_DEV_PERF_POLICY_SYNC_BOOST 76 //!< Perf 
Policy Counter for Sync boost Policy +#define NVML_FI_DEV_PERF_POLICY_BOARD_LIMIT 77 //!< Perf Policy Counter for Board Limit +#define NVML_FI_DEV_PERF_POLICY_LOW_UTILIZATION 78 //!< Perf Policy Counter for Low GPU Utilization Policy +#define NVML_FI_DEV_PERF_POLICY_RELIABILITY 79 //!< Perf Policy Counter for Reliability Policy +#define NVML_FI_DEV_PERF_POLICY_TOTAL_APP_CLOCKS 80 //!< Perf Policy Counter for Total App Clock Policy +#define NVML_FI_DEV_PERF_POLICY_TOTAL_BASE_CLOCKS 81 //!< Perf Policy Counter for Total Base Clocks Policy + +/* Memory temperatures */ +#define NVML_FI_DEV_MEMORY_TEMP 82 //!< Memory temperature for the device + +/* Energy Counter */ +#define NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION 83 //!< Total energy consumption for the GPU in mJ since the driver was last reloaded + +/* NVLink Speed */ +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L0 84 //!< NVLink Speed in MBps for Link 0 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L1 85 //!< NVLink Speed in MBps for Link 1 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L2 86 //!< NVLink Speed in MBps for Link 2 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L3 87 //!< NVLink Speed in MBps for Link 3 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L4 88 //!< NVLink Speed in MBps for Link 4 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L5 89 //!< NVLink Speed in MBps for Link 5 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links + +#define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device + +#define NVML_FI_DEV_RETIRED_PENDING_SBE 92 //!< If any pages are pending retirement due to SBE. 1=yes. 0=no. +#define NVML_FI_DEV_RETIRED_PENDING_DBE 93 //!< If any pages are pending retirement due to DBE. 1=yes. 0=no. 
+ +#define NVML_FI_DEV_PCIE_REPLAY_COUNTER 94 //!< PCIe replay counter +#define NVML_FI_DEV_PCIE_REPLAY_ROLLOVER_COUNTER 95 //!< PCIe replay rollover counter + +/* NvLink Flit Error Counters */ +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 96 //!< NVLink flow control CRC Error Counter for Lane 6 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 97 //!< NVLink flow control CRC Error Counter for Lane 7 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 98 //!< NVLink flow control CRC Error Counter for Lane 8 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 99 //!< NVLink flow control CRC Error Counter for Lane 9 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 100 //!< NVLink flow control CRC Error Counter for Lane 10 +#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 101 //!< NVLink flow control CRC Error Counter for Lane 11 + +/* NvLink CRC Data Error Counters */ +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 102 //!< NVLink data CRC Error Counter for Lane 6 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 103 //!< NVLink data CRC Error Counter for Lane 7 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 104 //!< NVLink data CRC Error Counter for Lane 8 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 105 //!< NVLink data CRC Error Counter for Lane 9 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 106 //!< NVLink data CRC Error Counter for Lane 10 +#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 107 //!< NVLink data CRC Error Counter for Lane 11 + +/* NvLink Replay Error Counters */ +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 108 //!< NVLink Replay Error Counter for Lane 6 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 109 //!< NVLink Replay Error Counter for Lane 7 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 110 //!< NVLink Replay Error Counter for Lane 8 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 111 //!< NVLink Replay Error Counter for Lane 9 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 112 //!< 
NVLink Replay Error Counter for Lane 10 +#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 113 //!< NVLink Replay Error Counter for Lane 11 + +/* NvLink Recovery Error Counters */ +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 114 //!< NVLink Recovery Error Counter for Lane 6 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 115 //!< NVLink Recovery Error Counter for Lane 7 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 116 //!< NVLink Recovery Error Counter for Lane 8 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 117 //!< NVLink Recovery Error Counter for Lane 9 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 118 //!< NVLink Recovery Error Counter for Lane 10 +#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 119 //!< NVLink Recovery Error Counter for Lane 11 + +/* NvLink Bandwidth Counters */ +/* + * NVML_FI_DEV_NVLINK_BANDWIDTH_* field values are now deprecated. + * Please use the following field values instead: + * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX + * NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX + * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX + * NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX + */ +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L6 120 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 6 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L7 121 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 7 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L8 122 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 8 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L9 123 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 9 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L10 124 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 10 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L11 125 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 11 + +/* NvLink Bandwidth Counters */ +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L6 126 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 6 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L7 127 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 7 
+#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L8 128 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 8 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L9 129 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 9 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L10 130 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 10 +#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L11 131 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 11 + +/* NVLink Speed */ +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L6 132 //!< NVLink Speed in MBps for Link 6 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L7 133 //!< NVLink Speed in MBps for Link 7 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L8 134 //!< NVLink Speed in MBps for Link 8 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L9 135 //!< NVLink Speed in MBps for Link 9 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L10 136 //!< NVLink Speed in MBps for Link 10 +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L11 137 //!< NVLink Speed in MBps for Link 11 + +/** + * NVLink throughput counters field values + * + * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. + * A scopeId of UINT_MAX returns aggregate value summed up across all links + * for the specified counter type in fieldId. + */ +#define NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX 138 //!< NVLink TX Data throughput in KiB +#define NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX 139 //!< NVLink RX Data throughput in KiB +#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX 140 //!< NVLink TX Data + protocol overhead in KiB +#define NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX 141 //!< NVLink RX Data + protocol overhead in KiB + +/* Row Remapper */ +#define NVML_FI_DEV_REMAPPED_COR 142 //!< Number of remapped rows due to correctable errors +#define NVML_FI_DEV_REMAPPED_UNC 143 //!< Number of remapped rows due to uncorrectable errors +#define NVML_FI_DEV_REMAPPED_PENDING 144 //!< If any rows are pending remapping. 
1=yes 0=no +#define NVML_FI_DEV_REMAPPED_FAILURE 145 //!< If any rows failed to be remapped 1=yes 0=no + +/** + * Remote device NVLink ID + * + * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. + */ +#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID + +/** + * NVSwitch: connected NVLink count + */ +#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch + +/* NvLink ECC Data Error Counters + * + * Lane ID needs to be specified in the scopeId field in nvmlFieldValue_t. + * + */ +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L0 148 //!< NVLink data ECC Error Counter for Link 0 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L1 149 //!< NVLink data ECC Error Counter for Link 1 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L2 150 //!< NVLink data ECC Error Counter for Link 2 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L3 151 //!< NVLink data ECC Error Counter for Link 3 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L4 152 //!< NVLink data ECC Error Counter for Link 4 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L5 153 //!< NVLink data ECC Error Counter for Link 5 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L6 154 //!< NVLink data ECC Error Counter for Link 6 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L7 155 //!< NVLink data ECC Error Counter for Link 7 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L8 156 //!< NVLink data ECC Error Counter for Link 8 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L9 157 //!< NVLink data ECC Error Counter for Link 9 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L10 158 //!< NVLink data ECC Error Counter for Link 10 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L11 159 //!< NVLink data ECC Error Counter for Link 11 +#define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL 160 //!< NvLink data ECC Error Counter total for all Links + +#define NVML_FI_DEV_NVLINK_ERROR_DL_REPLAY 161 +#define 
NVML_FI_DEV_NVLINK_ERROR_DL_RECOVERY 162 +#define NVML_FI_DEV_NVLINK_ERROR_DL_CRC 163 +#define NVML_FI_DEV_NVLINK_GET_SPEED 164 +#define NVML_FI_DEV_NVLINK_GET_STATE 165 +#define NVML_FI_DEV_NVLINK_GET_VERSION 166 + +#define NVML_FI_DEV_C2C_LINK_COUNT 170 //!< Number of C2C Links present on the device +#define NVML_FI_DEV_C2C_LINK_GET_STATUS 171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE +#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW 172 //!< C2C Link Speed in MBps for active links + +/** + * Retrieves power usage for this GPU in milliwatts. + * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode and + * \ref nvmlDeviceGetPowerUsage. + * + * scopeId needs to be specified. It signifies: + * 0 - GPU Only Scope - Metrics for GPU are retrieved + * 1 - Module scope - Metrics for the module (e.g. CPU + GPU) are retrieved. + * Note: CPU here refers to NVIDIA CPU (e.g. Grace). x86 or non-NVIDIA ARM is not supported + */ +#define NVML_FI_DEV_POWER_AVERAGE 185 //!< GPU power averaged over 1 sec interval, supported on Ampere (except GA100) or newer architectures. +#define NVML_FI_DEV_POWER_INSTANT 186 //!< Current GPU power, supported on all architectures. +#define NVML_FI_DEV_POWER_MIN_LIMIT 187 //!< Minimum power limit in milliwatts. +#define NVML_FI_DEV_POWER_MAX_LIMIT 188 //!< Maximum power limit in milliwatts. +#define NVML_FI_DEV_POWER_DEFAULT_LIMIT 189 //!< Default power limit in milliwatts (limit which device boots with). +#define NVML_FI_DEV_POWER_CURRENT_LIMIT 190 //!< Limit currently enforced in milliwatts (This includes other limits set elsewhere. E.g. Out-of-band). +#define NVML_FI_DEV_ENERGY 191 //!< Total energy consumption (in mJ) since the driver was last reloaded. Same as \ref NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION for the GPU. +#define NVML_FI_DEV_POWER_REQUESTED_LIMIT 192 //!< Power limit requested by NVML or any other userspace client. 
+#define NVML_FI_MAX 193 //!< One greater than the largest field ID defined above + +/** + * Information for a Field Value Sample + */ +typedef struct nvmlFieldValue_st +{ + unsigned int fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above. + unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId. + long long timestamp; //!< CPU Timestamp of this value in microseconds since 1970 + long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call. + nvmlValueType_t valueType; //!< Type of the value stored in value + nvmlReturn_t nvmlReturn; //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS + nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS +} nvmlFieldValue_t; + + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlUnitStructs Unit Structs + * @{ + */ +/***************************************************************************************************/ + +typedef struct nvmlUnit_st* nvmlUnit_t; + +/** + * Description of HWBC entry + */ +typedef struct nvmlHwbcEntry_st +{ + unsigned int hwbcId; + char firmwareVersion[32]; +} nvmlHwbcEntry_t; + +/** + * Fan state enum. + */ +typedef enum nvmlFanState_enum +{ + NVML_FAN_NORMAL = 0, //!< Fan is working properly + NVML_FAN_FAILED = 1 //!< Fan has failed +} nvmlFanState_t; + +/** + * Led color enum. 
+ */ +typedef enum nvmlLedColor_enum +{ + NVML_LED_COLOR_GREEN = 0, //!< GREEN, indicates good health + NVML_LED_COLOR_AMBER = 1 //!< AMBER, indicates problem +} nvmlLedColor_t; + + +/** + * LED states for an S-class unit. + */ +typedef struct nvmlLedState_st +{ + char cause[256]; //!< If amber, a text description of the cause + nvmlLedColor_t color; //!< GREEN or AMBER +} nvmlLedState_t; + +/** + * Static S-class unit info. + */ +typedef struct nvmlUnitInfo_st +{ + char name[96]; //!< Product name + char id[96]; //!< Product identifier + char serial[96]; //!< Product serial number + char firmwareVersion[96]; //!< Firmware version +} nvmlUnitInfo_t; + +/** + * Power usage information for an S-class unit. + * The power supply state is a human readable string that equals "Normal" or contains + * a combination of "Abnormal" plus one or more of the following: + * + * - High voltage + * - Fan failure + * - Heatsink temperature + * - Current limit + * - Voltage below UV alarm threshold + * - Low-voltage + * - SI2C remote off command + * - MOD_DISABLE input + * - Short pin transition +*/ +typedef struct nvmlPSUInfo_st +{ + char state[256]; //!< The power supply state + unsigned int current; //!< PSU current (A) + unsigned int voltage; //!< PSU voltage (V) + unsigned int power; //!< PSU power draw (W) +} nvmlPSUInfo_t; + +/** + * Fan speed reading for a single fan in an S-class unit. + */ +typedef struct nvmlUnitFanInfo_st +{ + unsigned int speed; //!< Fan speed (RPM) + nvmlFanState_t state; //!< Flag that indicates whether fan is working properly +} nvmlUnitFanInfo_t; + +/** + * Fan speed readings for an entire S-class unit. 
+ */ +typedef struct nvmlUnitFanSpeeds_st +{ + nvmlUnitFanInfo_t fans[24]; //!< Fan speed data for each fan + unsigned int count; //!< Number of fans in unit +} nvmlUnitFanSpeeds_t; + +/** @} */ + +/***************************************************************************************************/ +/** @addtogroup nvmlEvents + * @{ + */ +/***************************************************************************************************/ + +/** + * Handle to an event set + */ +typedef struct nvmlEventSet_st* nvmlEventSet_t; + +/** @defgroup nvmlEventType Event Types + * @{ + * Event Types which user can be notified about. + * See description of particular functions for details. + * + * See \ref nvmlDeviceRegisterEvents and \ref nvmlDeviceGetSupportedEventTypes to check which devices + * support each event. + * + * Types can be combined with bitwise or operator '|' when passed to \ref nvmlDeviceRegisterEvents + */ +//! Event about single bit ECC errors +/** + * \note A corrected texture memory error is not an ECC error, so it does not generate a single bit event + */ +#define nvmlEventTypeSingleBitEccError 0x0000000000000001LL + +//! Event about double bit ECC errors +/** + * \note An uncorrected texture memory error is not an ECC error, so it does not generate a double bit event + */ +#define nvmlEventTypeDoubleBitEccError 0x0000000000000002LL + +//! Event about PState changes +/** + * \note On Fermi architecture PState changes are also an indicator that GPU is throttling down due to + * no work being executed on the GPU, power capping or thermal capping. In a typical situation, + * Fermi-based GPU should stay in P0 for the duration of the execution of the compute process. + */ +#define nvmlEventTypePState 0x0000000000000004LL + +//! Event that Xid critical error occurred +#define nvmlEventTypeXidCriticalError 0x0000000000000008LL + +//! Event about clock changes +/** + * Kepler only + */ +#define nvmlEventTypeClock 0x0000000000000010LL + +//! 
Event about AC/Battery power source changes +#define nvmlEventTypePowerSourceChange 0x0000000000000080LL + +//! Event about MIG configuration changes +#define nvmlEventMigConfigChange 0x0000000000000100LL + +//! Mask with no events +#define nvmlEventTypeNone 0x0000000000000000LL + +//! Mask of all events +#define nvmlEventTypeAll (nvmlEventTypeNone \ + | nvmlEventTypeSingleBitEccError \ + | nvmlEventTypeDoubleBitEccError \ + | nvmlEventTypePState \ + | nvmlEventTypeClock \ + | nvmlEventTypeXidCriticalError \ + | nvmlEventTypePowerSourceChange \ + | nvmlEventMigConfigChange \ + ) +/** @} */ + +/** + * Information about occurred event + */ +typedef struct nvmlEventData_st +{ + nvmlDevice_t device; //!< Specific device where the event occurred + unsigned long long eventType; //!< Information about what specific event occurred + unsigned long long eventData; //!< Stores XID error for the device in the event of nvmlEventTypeXidCriticalError, + // eventData is 0 for any other event. eventData is set as 999 for unknown xid error. + unsigned int gpuInstanceId; //!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a GPU + // instance, stores a valid GPU instance ID. gpuInstanceId is set to 0xFFFFFFFF + // otherwise. + unsigned int computeInstanceId; //!< If MIG is enabled and nvmlEventTypeXidCriticalError event is attributable to a + // compute instance, stores a valid compute instance ID. computeInstanceId is set to + // 0xFFFFFFFF otherwise. 
+} nvmlEventData_t; + +/** @} */ + +/***************************************************************************************************/ +/** @addtogroup nvmlClocksThrottleReasons + * @{ + */ +/***************************************************************************************************/ + +/** Nothing is running on the GPU and the clocks are dropping to Idle state + * \note This limiter may be removed in a later release + */ +#define nvmlClocksThrottleReasonGpuIdle 0x0000000000000001LL + +/** GPU clocks are limited by current setting of applications clocks + * + * @see nvmlDeviceSetApplicationsClocks + * @see nvmlDeviceGetApplicationsClock + */ +#define nvmlClocksThrottleReasonApplicationsClocksSetting 0x0000000000000002LL + +/** + * @deprecated Renamed to \ref nvmlClocksThrottleReasonApplicationsClocksSetting + * as the name describes the situation more accurately. + */ +#define nvmlClocksThrottleReasonUserDefinedClocks nvmlClocksThrottleReasonApplicationsClocksSetting + +/** SW Power Scaling algorithm is reducing the clocks below requested clocks + * + * @see nvmlDeviceGetPowerUsage + * @see nvmlDeviceSetPowerManagementLimit + * @see nvmlDeviceGetPowerManagementLimit + */ +#define nvmlClocksThrottleReasonSwPowerCap 0x0000000000000004LL + +/** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged + * + * This is an indicator of: + * - temperature being too high + * - External Power Brake Assertion is triggered (e.g. by the system power supply) + * - Power draw is too high and Fast Trigger protection is reducing the clocks + * - May be also reported during PState or clock change + * - This behavior may be removed in a later release. 
+ * + * @see nvmlDeviceGetTemperature + * @see nvmlDeviceGetTemperatureThreshold + * @see nvmlDeviceGetPowerUsage + */ +#define nvmlClocksThrottleReasonHwSlowdown 0x0000000000000008LL + +/** Sync Boost + * + * This GPU has been added to a Sync boost group with nvidia-smi or DCGM in + * order to maximize performance per watt. All GPUs in the sync boost group + * will boost to the minimum possible clocks across the entire group. Look at + * the throttle reasons for other GPUs in the system to see why those GPUs are + * holding this one at lower clocks. + * + */ +#define nvmlClocksThrottleReasonSyncBoost 0x0000000000000010LL + +/** SW Thermal Slowdown + * + * This is an indicator of one or more of the following: + * - Current GPU temperature above the GPU Max Operating Temperature + * - Current memory temperature above the Memory Max Operating Temperature + * + */ +#define nvmlClocksThrottleReasonSwThermalSlowdown 0x0000000000000020LL + +/** HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged + * + * This is an indicator of: + * - temperature being too high + * + * @see nvmlDeviceGetTemperature + * @see nvmlDeviceGetTemperatureThreshold + * @see nvmlDeviceGetPowerUsage + */ +#define nvmlClocksThrottleReasonHwThermalSlowdown 0x0000000000000040LL + +/** HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged + * + * This is an indicator of: + * - External Power Brake Assertion being triggered (e.g. by the system power supply) + * + * @see nvmlDeviceGetTemperature + * @see nvmlDeviceGetTemperatureThreshold + * @see nvmlDeviceGetPowerUsage + */ +#define nvmlClocksThrottleReasonHwPowerBrakeSlowdown 0x0000000000000080LL + +/** GPU clocks are limited by current setting of Display clocks + * + * @see bug 1997531 + */ +#define nvmlClocksThrottleReasonDisplayClockSetting 0x0000000000000100LL + +/** Bit mask representing no clocks throttling + * + * Clocks are as high as possible. 
+ * */ +#define nvmlClocksThrottleReasonNone 0x0000000000000000LL + +/** Bit mask representing all supported clocks throttling reasons + * New reasons might be added to this list in the future + */ +#define nvmlClocksThrottleReasonAll (nvmlClocksThrottleReasonNone \ + | nvmlClocksThrottleReasonGpuIdle \ + | nvmlClocksThrottleReasonApplicationsClocksSetting \ + | nvmlClocksThrottleReasonSwPowerCap \ + | nvmlClocksThrottleReasonHwSlowdown \ + | nvmlClocksThrottleReasonSyncBoost \ + | nvmlClocksThrottleReasonSwThermalSlowdown \ + | nvmlClocksThrottleReasonHwThermalSlowdown \ + | nvmlClocksThrottleReasonHwPowerBrakeSlowdown \ + | nvmlClocksThrottleReasonDisplayClockSetting \ +) +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlAccountingStats Accounting Statistics + * @{ + * + * Set of APIs designed to provide per process information about usage of GPU. + * + * @note All accounting statistics and accounting mode live in nvidia driver and reset + * to default (Disabled) when driver unloads. + * It is advised to run with persistence mode enabled. + * + * @note Enabling accounting mode has no negative impact on the GPU performance. + */ +/***************************************************************************************************/ + +/** + * Describes accounting statistics of a process. + */ +typedef struct nvmlAccountingStats_st { + unsigned int gpuUtilization; //!< Percent of time over the process's lifetime during which one or more kernels was executing on the GPU. + //! Utilization stats just like returned by \ref nvmlDeviceGetUtilizationRates but for the life time of a + //! process (not just the last sample period). + //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported + + unsigned int memoryUtilization; //!< Percent of time over the process's lifetime during which global (device) memory was being read or written. + //! 
Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported + + unsigned long long maxMemoryUsage; //!< Maximum total memory in bytes that was ever allocated by the process. + //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlProcessInfo_t->usedGpuMemory is not supported + + + unsigned long long time; //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if + //!< the process is not terminated + + unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process + + unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated) + + unsigned int reserved[5]; //!< Reserved for future use +} nvmlAccountingStats_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlEncoderStructs Encoder Structs + * @{ + */ +/***************************************************************************************************/ + +/** + * Represents type of encoder for capacity can be queried + */ +typedef enum nvmlEncoderQueryType_enum +{ + NVML_ENCODER_QUERY_H264 = 0, //!< H264 encoder + NVML_ENCODER_QUERY_HEVC = 1 //!< HEVC encoder +}nvmlEncoderType_t; + +/** + * Structure to hold encoder session data + */ +typedef struct nvmlEncoderSessionInfo_st +{ + unsigned int sessionId; //!< Unique session ID + unsigned int pid; //!< Owning process ID + nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) + nvmlEncoderType_t codecType; //!< Video encoder type + unsigned int hResolution; //!< Current encode horizontal resolution + unsigned int vResolution; //!< Current encode vertical resolution + unsigned int averageFps; //!< Moving average encode frames per second + unsigned int averageLatency; //!< Moving average encode latency in microseconds +}nvmlEncoderSessionInfo_t; + +/** @} */ + 
+/***************************************************************************************************/ +/** @defgroup nvmlFBCStructs Frame Buffer Capture Structures +* @{ +*/ +/***************************************************************************************************/ + +/** + * Represents frame buffer capture session type + */ +typedef enum nvmlFBCSessionType_enum +{ + NVML_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknwon + NVML_FBC_SESSION_TYPE_TOSYS, //!< ToSys + NVML_FBC_SESSION_TYPE_CUDA, //!< Cuda + NVML_FBC_SESSION_TYPE_VID, //!< Vid + NVML_FBC_SESSION_TYPE_HWENC //!< HEnc +} nvmlFBCSessionType_t; + +/** + * Structure to hold frame buffer capture sessions stats + */ +typedef struct nvmlFBCStats_st +{ + unsigned int sessionsCount; //!< Total no of sessions + unsigned int averageFPS; //!< Moving average new frames captured per second + unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds +} nvmlFBCStats_t; + +#define NVML_NVFBC_SESSION_FLAG_DIFFMAP_ENABLED 0x00000001 //!< Bit specifying differential map state. +#define NVML_NVFBC_SESSION_FLAG_CLASSIFICATIONMAP_ENABLED 0x00000002 //!< Bit specifying classification map state. +#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_NO_WAIT 0x00000004 //!< Bit specifying if capture was requested as non-blocking call. +#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_INFINITE 0x00000008 //!< Bit specifying if capture was requested as blocking call. +#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT 0x00000010 //!< Bit specifying if capture was requested as blocking call with timeout period. 
+ +/** + * Structure to hold FBC session data + */ +typedef struct nvmlFBCSessionInfo_st +{ + unsigned int sessionId; //!< Unique session ID + unsigned int pid; //!< Owning process ID + nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) + unsigned int displayOrdinal; //!< Display identifier + nvmlFBCSessionType_t sessionType; //!< Type of frame buffer capture session + unsigned int sessionFlags; //!< Session flags (one or more of NVML_NVFBC_SESSION_FLAG_XXX). + unsigned int hMaxResolution; //!< Max horizontal resolution supported by the capture session + unsigned int vMaxResolution; //!< Max vertical resolution supported by the capture session + unsigned int hResolution; //!< Horizontal resolution requested by caller in capture call + unsigned int vResolution; //!< Vertical resolution requested by caller in capture call + unsigned int averageFPS; //!< Moving average new frames captured per second + unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds +} nvmlFBCSessionInfo_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDrainDefs definitions related to the drain state + * @{ + */ +/***************************************************************************************************/ + +/** + * Is the GPU device to be removed from the kernel by nvmlDeviceRemoveGpu() + */ +typedef enum nvmlDetachGpuState_enum +{ + NVML_DETACH_GPU_KEEP = 0, + NVML_DETACH_GPU_REMOVE +} nvmlDetachGpuState_t; + +/** + * Parent bridge PCIe link state requested by nvmlDeviceRemoveGpu() + */ +typedef enum nvmlPcieLinkState_enum +{ + NVML_PCIE_LINK_KEEP = 0, + NVML_PCIE_LINK_SHUT_DOWN +} nvmlPcieLinkState_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlSystem/nvmlDevice definitions related to Confidential Computing + * @{ + 
*/ +/***************************************************************************************************/ +/** + * Confidential Compute CPU Capabilities values + */ +#define NVML_CC_SYSTEM_CPU_CAPS_NONE 0 +#define NVML_CC_SYSTEM_CPU_CAPS_AMD_SEV 1 + +/** + * Confidenial Compute GPU Capabilities values + */ +#define NVML_CC_SYSTEM_GPUS_CC_NOT_CAPABLE 0 +#define NVML_CC_SYSTEM_GPUS_CC_CAPABLE 1 + +typedef struct nvmlConfComputeSystemCaps_st { + unsigned int cpuCaps; + unsigned int gpusCaps; +} nvmlConfComputeSystemCaps_t; + +/** + * Confidential Compute DEV Mode values + */ +#define NVML_CC_SYSTEM_DEV_MODE_OFF 0 +#define NVML_CC_SYSTEM_DEV_MODE_ON 1 + +/** + * Confidential Compute Environment values + */ +#define NVML_CC_SYSTEM_ENVIRONMENT_UNAVAILABLE 0 +#define NVML_CC_SYSTEM_ENVIRONMENT_SIM 1 +#define NVML_CC_SYSTEM_ENVIRONMENT_PROD 2 + +/** + * Confidential Compute Feature Status values + */ +#define NVML_CC_SYSTEM_FEATURE_DISABLED 0 +#define NVML_CC_SYSTEM_FEATURE_ENABLED 1 + +typedef struct nvmlConfComputeSystemState_st { + unsigned int environment; + unsigned int ccFeature; + unsigned int devMode; +} nvmlConfComputeSystemState_t; + +/** + * Protected memory size + */ +typedef struct +nvmlConfComputeMemSizeInfo_st +{ + unsigned long long protectedMemSizeKib; + unsigned long long unprotectedMemSizeKib; +} nvmlConfComputeMemSizeInfo_t; + +/** + * Confidential Compute GPUs/System Ready State values + */ +#define NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE 0 +#define NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE 1 + +/** + * GPU Certificate Details + */ +#define NVML_GPU_CERT_CHAIN_SIZE 0x1000 +#define NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE 0x1000 + +typedef struct nvmlConfComputeGpuCertificate_st { + unsigned int certChainSize; + unsigned int attestationCertChainSize; + unsigned char certChain[NVML_GPU_CERT_CHAIN_SIZE]; + unsigned char attestationCertChain[NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE]; +} nvmlConfComputeGpuCertificate_t; + +/** + * GPU Attestation Report + */ +#define 
NVML_CC_GPU_CEC_NONCE_SIZE 0x20 +#define NVML_CC_GPU_ATTESTATION_REPORT_SIZE 0x2000 +#define NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE 0x1000 +#define NVML_CC_CEC_ATTESTATION_REPORT_NOT_PRESENT 0 +#define NVML_CC_CEC_ATTESTATION_REPORT_PRESENT 1 + +typedef struct nvmlConfComputeGpuAttestationReport_st { + unsigned int isCecAttestationReportPresent; + unsigned int attestationReportSize; + unsigned int cecAttestationReportSize; + unsigned char nonce[NVML_CC_GPU_CEC_NONCE_SIZE]; + unsigned char attestationReport[NVML_CC_GPU_ATTESTATION_REPORT_SIZE]; + unsigned char cecAttestationReport[NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE]; +} nvmlConfComputeGpuAttestationReport_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlInitializationAndCleanup Initialization and Cleanup + * This chapter describes the methods that handle NVML initialization and cleanup. + * It is the user's responsibility to call \ref nvmlInit_v2() before calling any other methods, and + * nvmlShutdown() once NVML is no longer being used. + * @{ + */ +/***************************************************************************************************/ + +#define NVML_INIT_FLAG_NO_GPUS 1 //!< Don't fail nvmlInit() when no GPUs are found +#define NVML_INIT_FLAG_NO_ATTACH 2 //!< Don't attach GPUs + +/** + * Initialize NVML, but don't initialize any GPUs yet. + * + * \note nvmlInit_v3 introduces a "flags" argument, that allows passing boolean values + * modifying the behaviour of nvmlInit(). + * \note In NVML 5.319 new nvmlInit_v2 has replaced nvmlInit"_v1" (default in NVML 4.304 and older) that + * did initialize all GPU devices in the system. + * + * This allows NVML to communicate with a GPU + * when other GPUs in the system are unstable or in a bad state. When using this API, GPUs are + * discovered and initialized in nvmlDeviceGetHandleBy* functions instead. 
+ * + * \note To contrast nvmlInit_v2 with nvmlInit"_v1", NVML 4.304 nvmlInit"_v1" will fail when any detected GPU is in + * a bad or unstable state. + * + * For all products. + * + * This method, should be called once before invoking any other methods in the library. + * A reference count of the number of initializations is maintained. Shutdown only occurs + * when the reference count reaches zero. + * + * @return + * - \ref NVML_SUCCESS if NVML has been properly initialized + * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running + * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlInit_v2(void); + +/** + * nvmlInitWithFlags is a variant of nvmlInit(), that allows passing a set of boolean values + * modifying the behaviour of nvmlInit(). + * Other than the "flags" parameter it is completely similar to \ref nvmlInit_v2. + * + * For all products. + * + * @param flags behaviour modifier flags + * + * @return + * - \ref NVML_SUCCESS if NVML has been properly initialized + * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running + * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlInitWithFlags(unsigned int flags); + +/** + * Shut down NVML by releasing all GPU resources previously allocated with \ref nvmlInit_v2(). + * + * For all products. + * + * This method should be called after NVML work is done, once for each call to \ref nvmlInit_v2() + * A reference count of the number of initializations is maintained. Shutdown only occurs + * when the reference count reaches zero. For backwards compatibility, no error is reported if + * nvmlShutdown() is called more times than nvmlInit(). 
+ * + * @return + * - \ref NVML_SUCCESS if NVML has been properly shut down + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlShutdown(void); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlErrorReporting Error reporting + * This chapter describes helper functions for error reporting routines. + * @{ + */ +/***************************************************************************************************/ + +/** + * Helper method for converting NVML error codes into readable strings. + * + * For all products. + * + * @param result NVML error code to convert + * + * @return String representation of the error. + * + */ +const DECLDIR char* nvmlErrorString(nvmlReturn_t result); +/** @} */ + + +/***************************************************************************************************/ +/** @defgroup nvmlConstants Constants + * @{ + */ +/***************************************************************************************************/ + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetInforomVersion and \ref nvmlDeviceGetInforomImageVersion + */ +#define NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE 16 + +/** + * Buffer size guaranteed to be large enough for storing GPU identifiers. 
+ */ +#define NVML_DEVICE_UUID_BUFFER_SIZE 80 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID + */ +#define NVML_DEVICE_UUID_V2_BUFFER_SIZE 96 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetBoardPartNumber + */ +#define NVML_DEVICE_PART_NUMBER_BUFFER_SIZE 80 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlSystemGetDriverVersion + */ +#define NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE 80 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlSystemGetNVMLVersion + */ +#define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE 80 + +/** + * Buffer size guaranteed to be large enough for storing GPU device names. + */ +#define NVML_DEVICE_NAME_BUFFER_SIZE 64 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName + */ +#define NVML_DEVICE_NAME_V2_BUFFER_SIZE 96 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial + */ +#define NVML_DEVICE_SERIAL_BUFFER_SIZE 30 + +/** + * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetVbiosVersion + */ +#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE 32 + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlSystemQueries System Queries + * This chapter describes the queries that NVML can perform against the local system. These queries + * are not device-specific. + * @{ + */ +/***************************************************************************************************/ + +/** + * Retrieves the version of the system's graphics driver. + * + * For all products. + * + * The version identifier is an alphanumeric string. It will not exceed 80 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. 
+ * + * @param version Reference in which to return the version identifier + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + */ +nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int length); + +/** + * Retrieves the version of the NVML library. + * + * For all products. + * + * The version identifier is an alphanumeric string. It will not exceed 80 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE. + * + * @param version Reference in which to return the version identifier + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + */ +nvmlReturn_t DECLDIR nvmlSystemGetNVMLVersion(char *version, unsigned int length); + +/** + * Retrieves the version of the CUDA driver. + * + * For all products. + * + * The CUDA driver version returned will be retreived from the currently installed version of CUDA. + * If the cuda library is not found, this function will return a known supported version number. + * + * @param cudaDriverVersion Reference in which to return the version identifier + * + * @return + * - \ref NVML_SUCCESS if \a cudaDriverVersion has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cudaDriverVersion is NULL + */ +nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion(int *cudaDriverVersion); + +/** + * Retrieves the version of the CUDA driver from the shared library. + * + * For all products. 
+ * + * The returned CUDA driver version by calling cuDriverGetVersion() + * + * @param cudaDriverVersion Reference in which to return the version identifier + * + * @return + * - \ref NVML_SUCCESS if \a cudaDriverVersion has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cudaDriverVersion is NULL + * - \ref NVML_ERROR_LIBRARY_NOT_FOUND if \a libcuda.so.1 or libcuda.dll is not found + * - \ref NVML_ERROR_FUNCTION_NOT_FOUND if \a cuDriverGetVersion() is not found in the shared library + */ +nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion_v2(int *cudaDriverVersion); + +/** + * Macros for converting the CUDA driver version number to Major and Minor version numbers. + */ +#define NVML_CUDA_DRIVER_VERSION_MAJOR(v) ((v)/1000) +#define NVML_CUDA_DRIVER_VERSION_MINOR(v) (((v)%1000)/10) + +/** + * Gets name of the process with provided process id + * + * For all products. + * + * Returned process name is cropped to provided length. + * name string is encoded in ANSI. + * + * @param pid The identifier of the process + * @param name Reference in which to return the process name + * @param length The maximum allowed length of the string returned in \a name + * + * @return + * - \ref NVML_SUCCESS if \a name has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a name is NULL or \a length is 0. + * - \ref NVML_ERROR_NOT_FOUND if process doesn't exists + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlUnitQueries Unit Queries + * This chapter describes that queries that NVML can perform against each unit. For S-class systems only. 
+ * In each case the device is identified with an nvmlUnit_t handle. This handle is obtained by + * calling \ref nvmlUnitGetHandleByIndex(). + * @{ + */ +/***************************************************************************************************/ + + /** + * Retrieves the number of units in the system. + * + * For S-class products. + * + * @param unitCount Reference in which to return the number of units + * + * @return + * - \ref NVML_SUCCESS if \a unitCount has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unitCount is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetCount(unsigned int *unitCount); + +/** + * Acquire the handle for a particular unit, based on its index. + * + * For S-class products. + * + * Valid indices are derived from the \a unitCount returned by \ref nvmlUnitGetCount(). + * For example, if \a unitCount is 2 the valid indices are 0 and 1, corresponding to UNIT 0 and UNIT 1. + * + * The order in which NVML enumerates units has no guarantees of consistency between reboots. + * + * @param index The index of the target unit, >= 0 and < \a unitCount + * @param unit Reference in which to return the unit handle + * + * @return + * - \ref NVML_SUCCESS if \a unit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit); + +/** + * Retrieves the static information associated with a unit. + * + * For S-class products. + * + * See \ref nvmlUnitInfo_t for details on available unit info. 
+ * + * @param unit The identifier of the target unit + * @param info Reference in which to return the unit information + * + * @return + * - \ref NVML_SUCCESS if \a info has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL + */ +nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info); + +/** + * Retrieves the LED state associated with this unit. + * + * For S-class products. + * + * See \ref nvmlLedState_t for details on allowed states. + * + * @param unit The identifier of the target unit + * @param state Reference in which to return the current LED state + * + * @return + * - \ref NVML_SUCCESS if \a state has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlUnitSetLedState() + */ +nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state); + +/** + * Retrieves the PSU stats for the unit. + * + * For S-class products. + * + * See \ref nvmlPSUInfo_t for details on available PSU info. + * + * @param unit The identifier of the target unit + * @param psu Reference in which to return the PSU information + * + * @return + * - \ref NVML_SUCCESS if \a psu has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu); + +/** + * Retrieves the temperature readings for the unit, in degrees C. 
+ * + * For S-class products. + * + * Depending on the product, readings may be available for intake (type=0), + * exhaust (type=1) and board (type=2). + * + * @param unit The identifier of the target unit + * @param type The type of reading to take + * @param temp Reference in which to return the intake temperature + * + * @return + * - \ref NVML_SUCCESS if \a temp has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp); + +/** + * Retrieves the fan speed readings for the unit. + * + * For S-class products. + * + * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info. + * + * @param unit The identifier of the target unit + * @param fanSpeeds Reference in which to return the fan speed information + * + * @return + * - \ref NVML_SUCCESS if \a fanSpeeds has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds); + +/** + * Retrieves the set of GPU devices that are attached to the specified unit. + * + * For S-class products. + * + * The \a deviceCount argument is expected to be set to the size of the input \a devices array. 
+ * + * @param unit The identifier of the target unit + * @param deviceCount Reference in which to provide the \a devices array size, and + * to return the number of attached GPU devices + * @param devices Reference in which to return the references to the attached GPU devices + * + * @return + * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid, either of \a deviceCount or \a devices is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices); + +/** + * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. + * + * For S-class products. + * + * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. + * The HIC must be connected to an S-class system for it to be reported by this function. 
+ * + * @param hwbcCount Size of hwbcEntries array + * @param hwbcEntries Array holding information about hwbc + * + * @return + * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small + */ +nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceQueries Device Queries + * This chapter describes that queries that NVML can perform against each device. + * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by + * calling one of \ref nvmlDeviceGetHandleByIndex_v2(), \ref nvmlDeviceGetHandleBySerial(), + * \ref nvmlDeviceGetHandleByPciBusId_v2(). or \ref nvmlDeviceGetHandleByUUID(). + * @{ + */ +/***************************************************************************************************/ + + /** + * Retrieves the number of compute devices in the system. A compute device is a single GPU. + * + * For all products. + * + * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system + * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. + * Update your code to handle this error, or use NVML 4.304 or older nvml header file. + * For backward binary compatibility reasons _v1 version of the API is still present in the shared + * library. + * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. 
+ * + * @param deviceCount Reference in which to return the number of accessible devices + * + * @return + * - \ref NVML_SUCCESS if \a deviceCount has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCount_v2(unsigned int *deviceCount); + +/** + * Get attributes (engine counts etc.) for the given NVML device handle. + * + * @note This API currently only supports MIG device handles. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device NVML device handle + * @param attributes Device attributes + * + * @return + * - \ref NVML_SUCCESS if \a device attributes were successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is invalid + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAttributes_v2(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes); + +/** + * Acquire the handle for a particular device, based on its index. + * + * For all products. + * + * Valid indices are derived from the \a accessibleDevices count returned by + * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it + * is recommended that devices be looked up by their PCI ids or UUID. See + * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId_v2(). + * + * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. 
+ * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs if: + * - The target GPU is an SLI slave + * + * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system + * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. + * Update your code to handle this error, or use NVML 4.304 or older nvml header file. + * For backward binary compatibility reasons _v1 version of the API is still present in the shared + * library. + * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. + * + * This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index. + * If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't + * need to worry about that. + * + * @param index The index of the target GPU, >= 0 and < \a accessibleDevices + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetIndex + * @see nvmlDeviceGetCount + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex_v2(unsigned int index, nvmlDevice_t *device); + +/** + * Acquire the handle for a particular device, based on its board 
serial number. + * + * For Fermi &tm; or newer fully supported devices. + * + * This number corresponds to the value printed directly on the board, and to the value returned by + * \ref nvmlDeviceGetSerial(). + * + * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor + * of \ref nvmlDeviceGetHandleByUUID. + * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs as it searches for the target GPU + * + * @param serial The board serial number of the target GPU + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one + * device has the same serial (dual GPU boards) + * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetSerial + * @see nvmlDeviceGetHandleByUUID + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); + +/** + * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device. + * + * For all products. 
+ * + * @param uuid The UUID of the target GPU or MIG instance + * @param device Reference in which to return the device handle or MIG device handle + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs as it searches for the target GPU + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is null + * - \ref NVML_ERROR_NOT_FOUND if \a uuid does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetUUID + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device); + +/** + * Acquire the handle for a particular device, based on its PCI bus id. + * + * For all products. + * + * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo_v3(). + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs if: + * - The target GPU is an SLI slave + * + * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND + * instead of NVML_ERROR_NO_PERMISSION. 
+ * + * @param pciBusId The PCI bus id of the target GPU + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a pciBusId does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId_v2(const char *pciBusId, nvmlDevice_t *device); + +/** + * Retrieves the name of this device. + * + * For all products. + * + * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not + * exceed 96 characters in length (including the NULL terminator). See \ref + * nvmlConstants::NVML_DEVICE_NAME_V2_BUFFER_SIZE. + * + * When used with MIG device handles the API returns MIG device names which can be used to identify devices + * based on their attributes. 
+ * + * @param device The identifier of the target device + * @param name Reference in which to return the product name + * @param length The maximum allowed length of the string returned in \a name + * + * @return + * - \ref NVML_SUCCESS if \a name has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length); + +/** + * Retrieves the brand of this device. + * + * For all products. + * + * The type is a member of \ref nvmlBrandType_t defined above. + * + * @param device The identifier of the target device + * @param type Reference in which to return the product brand type + * + * @return + * - \ref NVML_SUCCESS if \a name has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a type is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type); + +/** + * Retrieves the NVML index of this device. + * + * For all products. + * + * Valid indices are derived from the \a accessibleDevices count returned by + * \ref nvmlDeviceGetCount_v2(). For example, if \a accessibleDevices is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * The order in which NVML enumerates devices has no guarantees of consistency between reboots. 
For that reason it + * is recommended that devices be looked up by their PCI ids or GPU UUID. See + * \ref nvmlDeviceGetHandleByPciBusId_v2() and \ref nvmlDeviceGetHandleByUUID(). + * + * When used with MIG device handles this API returns indices that can be + * passed to \ref nvmlDeviceGetMigDeviceHandleByIndex to retrieve an identical handle. + * MIG device indices are unique within a device. + * + * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. + * + * @param device The identifier of the target device + * @param index Reference in which to return the NVML index of the device + * + * @return + * - \ref NVML_SUCCESS if \a index has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetHandleByIndex() + * @see nvmlDeviceGetCount() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); + +/** + * Retrieves the globally unique board serial number associated with this device's board. + * + * For all products with an inforom. + * + * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator). + * This number matches the serial number tag that is physically attached to the board. See \ref + * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE. 
+ * + * @param device The identifier of the target device + * @param serial Reference in which to return the board/module serial number + * @param length The maximum allowed length of the string returned in \a serial + * + * @return + * - \ref NVML_SUCCESS if \a serial has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); + + +/***************************************************************************************************/ + +/** @defgroup nvmlAffinity CPU and Memory Affinity + * This chapter describes NVML operations that are associated with CPU and memory + * affinity. + * @{ + */ +/***************************************************************************************************/ + +//! Scope of NUMA node for affinity queries +#define NVML_AFFINITY_SCOPE_NODE 0 +//! Scope of processor socket for affinity queries +#define NVML_AFFINITY_SCOPE_SOCKET 1 + +typedef unsigned int nvmlAffinityScope_t; + +/** + * Retrieves an array of unsigned ints (sized to nodeSetSize) of bitmasks with + * the ideal memory affinity within node or socket for the device. + * For example, if NUMA node 0, 1 are ideal within the socket for the device and nodeSetSize == 1, + * result[0] = 0x3 + * + * \note If requested scope is not applicable to the target topology, the API + * will fall back to reporting the memory affinity for the immediate non-I/O + * ancestor of the device. + * + * For Kepler &tm; or newer fully supported devices. 
+ * Supported on Linux only. + * + * @param device The identifier of the target device + * @param nodeSetSize The size of the nodeSet array that is safe to access + * @param nodeSet Array reference in which to return a bitmask of NODEs, 64 NODEs per + * unsigned long on 64-bit machines, 32 on 32-bit machines + * @param scope Scope that change the default behavior + * + * @return + * - \ref NVML_SUCCESS if \a NUMA node Affinity has been filled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, nodeSetSize == 0, nodeSet is NULL or scope is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryAffinity(nvmlDevice_t device, unsigned int nodeSetSize, unsigned long *nodeSet, nvmlAffinityScope_t scope); + +/** + * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the + * ideal CPU affinity within node or socket for the device. + * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, + * result[0] = 0x3, result[1] = 0x3 + * + * \note If requested scope is not applicable to the target topology, the API + * will fall back to reporting the CPU affinity for the immediate non-I/O + * ancestor of the device. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. 
+ * + * @param device The identifier of the target device + * @param cpuSetSize The size of the cpuSet array that is safe to access + * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per + * unsigned long on 64-bit machines, 32 on 32-bit machines + * @param scope Scope that change the default behavior + * + * @return + * - \ref NVML_SUCCESS if \a cpuAffinity has been filled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, cpuSet is NULL or sope is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ + +nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinityWithinScope(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet, nvmlAffinityScope_t scope); + +/** + * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device + * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, + * result[0] = 0x3, result[1] = 0x3 + * This is equivalent to calling \ref nvmlDeviceGetCpuAffinityWithinScope with \ref NVML_AFFINITY_SCOPE_NODE. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. 
+ * + * @param device The identifier of the target device + * @param cpuSetSize The size of the cpuSet array that is safe to access + * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per + * unsigned long on 64-bit machines, 32 on 32-bit machines + * + * @return + * - \ref NVML_SUCCESS if \a cpuAffinity has been filled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet); + +/** + * Sets the ideal affinity for the calling thread and device using the guidelines + * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0. + * Older versions set the affinity for a calling process and all children. + * Currently supports up to 1024 processors. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if the calling process has been successfully bound + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); + +/** + * Clear all affinity bindings for the calling thread. 
Note, this is a change as of version + * 8.0 as older versions cleared the affinity for a calling process and all children. + * + * For Kepler &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if the calling process has been successfully unbound + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); + +/** + * Retrieve the common ancestor for two devices + * For all products. + * Supported on Linux only. + * + * @param device1 The identifier of the first device + * @param device2 The identifier of the second device + * @param pathInfo A \ref nvmlGpuTopologyLevel_t that gives the path type + * + * @return + * - \ref NVML_SUCCESS if \a pathInfo has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1, or \a device2 is invalid, or \a pathInfo is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature + * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery + */ + +/** @} */ +nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); + +/** + * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level + * For all products. + * Supported on Linux only. + * + * @param device The identifier of the first device + * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs + * @param count When zero, is set to the number of matching GPUs such that \a deviceArray + * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count + * number of device handles. 
+ * @param deviceArray An array of device handles for GPUs found at \a level + * + * @return + * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature + * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray); + +/** + * Retrieve the set of GPUs that have a CPU affinity with the given CPU number + * For all products. + * Supported on Linux only. + * + * @param cpuNumber The CPU number + * @param count When zero, is set to the number of matching GPUs such that \a deviceArray + * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count + * number of device handles. 
+ * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber + * + * @return + * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature + * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery + */ +nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); + +/** + * Retrieve the status for a given p2p capability index between a given pair of GPU + * + * @param device1 The first device + * @param device2 The second device + * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2 + * @param p2pStatus Reference in which to return the status of the \a p2pIndex + * between \a device1 and \a device2 + * @return + * - \ref NVML_SUCCESS if \a p2pStatus has been populated + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus); + +/** + * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string, + * that augments the immutable, board serial identifier. + * + * For all products. + * + * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products. + * It does NOT correspond to any identifier printed on the board. It will not exceed 96 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_V2_BUFFER_SIZE. 
+ * + * When used with MIG device handles the API returns globally unique UUIDs which can be used to identify MIG + * devices across both GPU and MIG devices. UUIDs are immutable for the lifetime of a MIG device. + * + * @param device The identifier of the target device + * @param uuid Reference in which to return the GPU UUID + * @param length The maximum allowed length of the string returned in \a uuid + * + * @return + * - \ref NVML_SUCCESS if \a uuid has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length); + +/** + * Retrieve the MDEV UUID of a vGPU instance. + * + * The MDEV UUID is a globally unique identifier of the mdev device assigned to the VM, and is returned as a 5-part hexadecimal string, + * not exceeding 80 characters in length (including the NULL terminator). + * MDEV UUID is displayed only on KVM platform. + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * For Maxwell &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param mdevUuid Pointer to caller-supplied buffer to hold MDEV UUID + * @param size Size of buffer in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED on any hypervisor other than KVM + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mdevUuid is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMdevUUID(nvmlVgpuInstance_t vgpuInstance, char *mdevUuid, unsigned int size); + +/** + * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for + * each GPU will have the form /dev/nvidia[minor number]. + * + * For all products. + * Supported only for Linux + * + * @param device The identifier of the target device + * @param minorNumber Reference in which to return the minor number for the device + * @return + * - \ref NVML_SUCCESS if the minor number is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minorNumber is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber); + +/** + * Retrieves the the device board part number which is programmed into the board's InfoROM + * + * For all products. 
+ * + * @param device Identifier of the target device + * @param partNumber Reference to the buffer to return + * @param length Length of the buffer reference + * + * @return + * - \ref NVML_SUCCESS if \a partNumber has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a serial is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length); + +/** + * Retrieves the version information for the device's infoROM object. + * + * For all products with an inforom. + * + * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate + * ECC counts. The version of the data structures in this memory may change from time to time. It will not + * exceed 16 characters in length (including the NULL terminator). + * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. + * + * See \ref nvmlInforomObject_t for details on the available infoROM objects. 
+ * + * @param device The identifier of the target device + * @param object The target infoROM object + * @param version Reference in which to return the infoROM version + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetInforomImageVersion + */ +nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length); + +/** + * Retrieves the global infoROM image version + * + * For all products with an inforom. + * + * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board + * in contrast to infoROM object version which is only an indicator of supported features. + * Version string will not exceed 16 characters in length (including the NULL terminator). + * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. 
+ * + * @param device The identifier of the target device + * @param version Reference in which to return the infoROM image version + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetInforomVersion + */ +nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length); + +/** + * Retrieves the checksum of the configuration stored in the device's infoROM. + * + * For all products with an inforom. + * + * Can be used to make sure that two GPUs have the exact same configuration. + * Current checksum takes into account configuration stored in PWR and ECC infoROM objects. + * Checksum can change between driver releases or when user changes configuration (e.g. 
disable/enable ECC) + * + * @param device The identifier of the target device + * @param checksum Reference in which to return the infoROM configuration checksum + * + * @return + * - \ref NVML_SUCCESS if \a checksum has been set + * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum); + +/** + * Reads the infoROM from the flash and verifies the checksums. + * + * For all products with an inforom. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if infoROM is not corrupted + * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); + +/** + * Retrieves the display mode for the device. + * + * For all products. + * + * This method indicates whether a physical display (e.g. monitor) is currently connected to + * any of the device's connectors. + * + * See \ref nvmlEnableState_t for details on allowed modes. 
+ * + * @param device The identifier of the target device + * @param display Reference in which to return the display mode + * + * @return + * - \ref NVML_SUCCESS if \a display has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display); + +/** + * Retrieves the display active state for the device. + * + * For all products. + * + * This method indicates whether a display is initialized on the device. + * For example whether X Server is attached to this device and has allocated memory for the screen. + * + * Display can be active even when no monitor is physically attached. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param isActive Reference in which to return the display active state + * + * @return + * - \ref NVML_SUCCESS if \a isActive has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive); + +/** + * Retrieves the persistence mode associated with this device. + * + * For all products. + * For Linux only. 
+ * + * When driver persistence mode is enabled the driver software state is not torn down when the last + * client disconnects. By default this feature is disabled. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param mode Reference in which to return the current driver persistence mode + * + * @return + * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetPersistenceMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode); + +/** + * Retrieves the PCI attributes of this device. + * + * For all products. + * + * See \ref nvmlPciInfo_t for details on the available PCI info. + * + * @param device The identifier of the target device + * @param pci Reference in which to return the PCI info + * + * @return + * - \ref NVML_SUCCESS if \a pci has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t *pci); + +/** + * Retrieves the maximum PCIe link generation possible with this device and system + * + * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will + * report is generation 1. 
+ * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param maxLinkGen Reference in which to return the max PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a maxLinkGen has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); + +/** + * Retrieves the maximum PCIe link width possible with this device and system + * + * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report + * a max link width of 8. + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param maxLinkWidth Reference in which to return the max PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); + +/** + * Retrieves the current PCIe link generation + * + * For Fermi &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param currLinkGen Reference in which to return the current PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a currLinkGen has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen); + +/** + * Retrieves the current PCIe link width + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param currLinkWidth Reference in which to return the current PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a currLinkWidth has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth); + +/** + * Retrieve PCIe utilization information. + * This function is querying a byte counter over a 20ms interval and thus is the + * PCIe throughput over that interval. + * + * For Maxwell &tm; or newer fully supported devices. + * + * This method is not supported in virtual machines running virtual GPU (vGPU). 
+ * + * @param device The identifier of the target device + * @param counter The specific counter that should be queried \ref nvmlPcieUtilCounter_t + * @param value Reference in which to return throughput in KB/s + * + * @return + * - \ref NVML_SUCCESS if \a value has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value); + +/** + * Retrieve the PCIe replay counter. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param value Reference in which to return the counter's value + * + * @return + * - \ref NVML_SUCCESS if \a value has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); + +/** + * Retrieves the current clock speeds for the device. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlClockType_t for details on available clock information. 
+ * + * @param device The identifier of the target device + * @param type Identify which clock domain to query + * @param clock Reference in which to return the clock speed in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clock has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); + +/** + * Retrieves the maximum clock speeds for the device. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlClockType_t for details on available clock information. + * + * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks + * by few MHz. 
+ * + * @param device The identifier of the target device + * @param type Identify which clock domain to query + * @param clock Reference in which to return the clock speed in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clock has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); + +/** + * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs. + * Can be changed using \ref nvmlDeviceSetApplicationsClocks. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param clockType Identify which clock domain to query + * @param clockMHz Reference in which to return the clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clockMHz has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); + +/** + * Retrieves the default applications clock that GPU boots with or + * defaults to after \ref nvmlDeviceResetApplicationsClocks call. 
+ * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param clockType Identify which clock domain to query + * @param clockMHz Reference in which to return the default clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clockMHz has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * \see nvmlDeviceGetApplicationsClock + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); + +/** + * Resets the application clock to the default value + * + * This is the applications clock that will be used after system reboot or driver reload. + * Default value is constant, but the current value an be changed using \ref nvmlDeviceSetApplicationsClocks. + * + * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, + * this call will unlock clocks. This returns clocks their default behavior ofautomatically boosting above + * base clocks as thermal limits allow. + * + * @see nvmlDeviceGetApplicationsClock + * @see nvmlDeviceSetApplicationsClocks + * + * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. 
+ * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); + +/** + * Retrieves the clock speed for the clock specified by the clock type and clock ID. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param clockType Identify which clock domain to query + * @param clockId Identify which clock in the domain to query + * @param clockMHz Reference in which to return the clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clockMHz has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz); + +/** + * Retrieves the customer defined maximum boost clock speed specified by the given clock type. + * + * For Pascal &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param clockType Identify which clock domain to query + * @param clockMHz Reference in which to return the clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a clockMHz has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device or the \a clockType on this device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); + +/** + * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param count Reference in which to provide the \a clocksMHz array size, and + * to return the number of elements + * @param clocksMHz Reference in which to return the clock in MHz + * + * @return + * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of + * required elements) + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetApplicationsClocks + * @see nvmlDeviceGetSupportedGraphicsClocks + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz); + +/** + * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param memoryClockMHz Memory clock for which to return possible graphics clocks + * @param count Reference in which to provide the \a clocksMHz array size, and + * to return the number of elements + * @param clocksMHz Reference in which to return the clocks in MHz + * + * @return + * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetApplicationsClocks + * @see nvmlDeviceGetSupportedMemoryClocks + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz); + +/** + * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled + * + * For Kepler &tm; or newer fully supported devices. + * + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. + * + * On Pascal and newer hardware, Auto Aoosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior. 
+ * + * @param device The identifier of the target device + * @param isEnabled Where to store the current state of Auto Boosted clocks of the target device + * @param defaultIsEnabled Where to store the default Auto Boosted clocks behavior of the target device that the device will + * revert to when no applications are using the GPU + * + * @return + * - \ref NVML_SUCCESS If \a isEnabled has been set with the Auto Boosted clocks state of \a device + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isEnabled is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); + +/** + * Try to set the current state of Auto Boosted clocks on a device. + * + * For Kepler &tm; or newer fully supported devices. + * + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock + * rates are desired. + * + * Non-root users may use this API by default but can be restricted by root from using this API by calling + * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. + * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. + * + * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior. 
+ * + * @param device The identifier of the target device + * @param enabled What state to try to set Auto Boosted clocks of the target device to + * + * @return + * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); + +/** + * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will + * return to when no compute running processes (e.g. CUDA application which have an active context) are running + * + * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. + * Requires root/admin permissions. + * + * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates + * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock + * rates are desired. + * + * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. + * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost + * behavior. + * + * @param device The identifier of the target device + * @param enabled What state to try to set default Auto Boosted clocks of the target device to + * @param flags Flags that change the default behavior. Currently Unused. 
+ * + * @return + * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); + + +/** + * Retrieves the intended operating speed of the device's fan. + * + * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the + * output will not match the actual fan speed. + * + * For all discrete products with dedicated fans. + * + * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + * This value may exceed 100% in certain cases. 
+ * + * @param device The identifier of the target device + * @param speed Reference in which to return the fan speed percentage + * + * @return + * - \ref NVML_SUCCESS if \a speed has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed); + + +/** + * Retrieves the intended operating speed of the device's specified fan. + * + * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the + * output will not match the actual fan speed. + * + * For all discrete products with dedicated fans. + * + * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + * This value may exceed 100% in certain cases. + * + * @param device The identifier of the target device + * @param fan The index of the target fan, zero indexed. 
+ * @param speed Reference in which to return the fan speed percentage + * + * @return + * - \ref NVML_SUCCESS if \a speed has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a fan is not an acceptable index, or \a speed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan or is newer than Maxwell + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int * speed); + +/** + * Retrieves the intended target speed of the device's specified fan. + * + * Normally, the driver dynamically adjusts the fan based on + * the needs of the GPU. But when user set fan speed using nvmlDeviceSetFanSpeed_v2, + * the driver will attempt to make the fan achieve the setting in + * nvmlDeviceSetFanSpeed_v2. The actual current speed of the fan + * is reported in nvmlDeviceGetFanSpeed_v2. + * + * For all discrete products with dedicated fans. + * + * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + * This value may exceed 100% in certain cases. + * + * @param device The identifier of the target device + * @param fan The index of the target fan, zero indexed. 
+ * @param targetSpeed Reference in which to return the fan speed percentage + * + * @return + * - \ref NVML_SUCCESS if \a speed has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a fan is not an acceptable index, or \a speed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan or is newer than Maxwell + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned int fan, unsigned int *targetSpeed); + +/** + * Sets the speed of the fan control policy to default. + * + * For all cuda-capable discrete products with fans + * + * @param device The identifier of the target device + * @param fan The index of the fan, starting at zero + * + * return + * NVML_SUCCESS if speed has been adjusted + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if device is invalid + * NVML_ERROR_NOT_SUPPORTED if the device does not support this + * (doesn't have fans) + * NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int fan); + +/** + * Retrieves the min and max fan speed that user can set for the GPU fan. 
+ * + * For all cuda-capable discrete products with fans + * + * @param device The identifier of the target device + * @param minSpeed The minimum speed allowed to set + * @param maxSpeed The maximum speed allowed to set + * + * return + * NVML_SUCCESS if speed has been adjusted + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if device is invalid + * NVML_ERROR_NOT_SUPPORTED if the device does not support this + * (doesn't have fans) + * NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned int * minSpeed, + unsigned int * maxSpeed); + +/** + * Retrieves the number of fans on the device. + * + * For all discrete products with dedicated fans. + * + * @param device The identifier of the target device + * @param numFans The number of fans + * + * @return + * - \ref NVML_SUCCESS if \a fan number query was successful + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a numFans is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNumFans(nvmlDevice_t device, unsigned int *numFans); + +/** + * Retrieves the current temperature readings for the device, in degrees C. + * + * For all products. + * + * See \ref nvmlTemperatureSensors_t for details on available temperature sensors. 
+ * + * @param device The identifier of the target device + * @param sensorType Flag that indicates which sensor reading to retrieve + * @param temp Reference in which to return the temperature reading + * + * @return + * - \ref NVML_SUCCESS if \a temp has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); + +/** + * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C. + * + * For Kepler &tm; or newer fully supported devices. + * + * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. 
+ * + * @param device The identifier of the target device + * @param thresholdType The type of threshold value queried + * @param temp Reference in which to return the temperature reading + * @return + * - \ref NVML_SUCCESS if \a temp has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); + +/** + * Sets the temperature threshold for the GPU with the specified threshold type in degrees C. + * + * For Maxwell &tm; or newer fully supported devices. + * + * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. 
+ * + * @param device The identifier of the target device + * @param thresholdType The type of threshold value to be set + * @param temp Reference which hold the value to be set + * @return + * - \ref NVML_SUCCESS if \a temp has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, int *temp); + +/** + * Used to execute a list of thermal system instructions. + * + * @param device The identifier of the target device + * @param sensorIndex The index of the thermal sensor + * @param pThermalSettings Reference in which to return the thermal sensor information + * + * @return + * - \ref NVML_SUCCESS if \a pThermalSettings has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pThermalSettings is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetThermalSettings(nvmlDevice_t device, unsigned int sensorIndex, nvmlGpuThermalSettings_t *pThermalSettings); + +/** + * Retrieves the current performance state for the device. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlPstates_t for details on allowed performance states. 
+ * + * @param device The identifier of the target device + * @param pState Reference in which to return the performance state reading + * + * @return + * - \ref NVML_SUCCESS if \a pState has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState); + +/** + * Retrieves current clocks throttling reasons. + * + * For all fully supported products. + * + * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once. + * + * @param device The identifier of the target device + * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle + * reasons + * + * @return + * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlClocksThrottleReasons + * @see nvmlDeviceGetSupportedClocksThrottleReasons + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); + +/** + * Retrieves bitmask of supported clocks throttle reasons that can be returned by + * \ref nvmlDeviceGetCurrentClocksThrottleReasons + * + * For all fully supported 
products. + * + * This method is not supported in virtual machines running virtual GPU (vGPU). + * + * @param device The identifier of the target device + * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported + * clocks throttle reasons + * + * @return + * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlClocksThrottleReasons + * @see nvmlDeviceGetCurrentClocksThrottleReasons + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); + +/** + * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization. + * + * Retrieve the current performance state for the device. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlPstates_t for details on allowed performance states. 
+ * + * @param device The identifier of the target device + * @param pState Reference in which to return the performance state reading + * + * @return + * - \ref NVML_SUCCESS if \a pState has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); + +/** + * This API has been deprecated. + * + * Retrieves the power management mode associated with this device. + * + * For products from the Fermi family. + * - Requires \a NVML_INFOROM_POWER version 3.0 or higher. + * + * For products from the Kepler or newer families. + * - Does not require \a NVML_INFOROM_POWER object. + * + * This flag indicates whether any power management algorithm is currently active on the device. An + * enabled state does not necessarily mean the device is being actively throttled -- only that + * the driver will do so if the appropriate conditions are met. + * + * See \ref nvmlEnableState_t for details on allowed modes. 
+ * + * @param device The identifier of the target device + * @param mode Reference in which to return the current power management mode + * + * @return + * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); + +/** + * Retrieves the power management limit associated with this device. + * + * For Fermi &tm; or newer fully supported devices. + * + * The power limit defines the upper boundary for the card's power draw. If + * the card's total power draw reaches this limit the power management algorithm kicks in. + * + * This reading is only available if power management mode is supported. + * See \ref nvmlDeviceGetPowerManagementMode. + * + * @param device The identifier of the target device + * @param limit Reference in which to return the power management limit in milliwatts + * + * @return + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit); + +/** + * Retrieves information about possible values of power management limits on this device. 
+ * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param minLimit Reference in which to return the minimum power management limit in milliwatts + * @param maxLimit Reference in which to return the maximum power management limit in milliwatts + * + * @return + * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetPowerManagementLimit + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit); + +/** + * Retrieves default power management limit on this device, in milliwatts. + * Default power management limit is a power management limit that the device boots with. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param defaultLimit Reference in which to return the default power management limit in milliwatts + * + * @return + * - \ref NVML_SUCCESS if \a defaultLimit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit); + +/** + * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) + * + * For Fermi &tm; or newer fully supported devices. + * + * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. + * + * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. + * + * @param device The identifier of the target device + * @param power Reference in which to return the power usage information + * + * @return + * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); + +/** + * Retrieves current power mode on this device. 
+ * + * %ADA_OR_NEWER% + * + * @param device The identifier of the target device + * @param powerModeId Reference in which to return the power mode + * + * @return + * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerMode(nvmlDevice_t device, unsigned int *powerModeId); + +/** + * Retrieves bitmask of supported power modes on this device. + * + * %ADA_OR_NEWER% + * + * @param device The identifier of the target device + * @param supportedPowerModes Reference in which to return the bitmask of supported power mode + * + * @return + * - \ref NVML_SUCCESS if \a bitmask of supported power mode has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPowerModes(nvmlDevice_t device, unsigned int *supportedPowerModes); + +/** + * Sets new power mode. + * + * %ADA_OR_NEWER% + * + * @param device The identifier of the target device + * @param powerModeId Power mode to set. 
+ * + * @return + * - \ref NVML_SUCCESS if \a power has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetPowerMode(nvmlDevice_t device, unsigned int powerModeId); + +/** + * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded + * + * For Volta &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param energy Reference in which to return the energy consumption information + * + * @return + * - \ref NVML_SUCCESS if \a energy has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a energy is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support energy readings + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy); + +/** + * Get the effective power limit that the driver enforces after taking into account all limiters + * + * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere + * This includes the out of band power limit interface + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param device The device to communicate with + * @param limit Reference in which to return the power management limit in milliwatts + * + * @return + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit); + +/** + * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot). + * + * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. + * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. + * Not supported on Quadro ® and Tesla &tm; C-class products. 
+ * + * @param device The identifier of the target device + * @param current Reference in which to return the current GOM + * @param pending Reference in which to return the pending GOM + * + * @return + * - \ref NVML_SUCCESS if \a mode has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlGpuOperationMode_t + * @see nvmlDeviceSetGpuOperationMode + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending); + +/** + * Retrieves the amount of used, free, reserved and total memory available on the device, in bytes. + * The reserved amount is supported on version 2 only. + * + * For all products. + * + * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. + * Under WDDM most device memory is allocated and managed on startup by Windows. + * + * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated + * by all active channels on the device. + * + * See \ref nvmlMemory_v2_t for details on available memory info. + * + * @note In MIG mode, if device handle is provided, the API returns aggregate + * information, only if the caller has appropriate privileges. Per-instance + * information can be queried by using specific MIG device handles. + * + * @note nvmlDeviceGetMemoryInfo_v2 adds additional memory information. 
+ * + * @param device The identifier of the target device + * @param memory Reference in which to return the memory information + * + * @return + * - \ref NVML_SUCCESS if \a memory has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory); +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo_v2(nvmlDevice_t device, nvmlMemory_v2_t *memory); + +/** + * Retrieves the current compute mode for the device. + * + * For all products. + * + * See \ref nvmlComputeMode_t for details on allowed compute modes. + * + * @param device The identifier of the target device + * @param mode Reference in which to return the current compute mode + * + * @return + * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetComputeMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode); + +/** + * Retrieves the CUDA compute capability of the device. + * + * For all products. + * + * Returns the major and minor compute capability version numbers of the + * device. 
The major and minor versions are equivalent to the + * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and + * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR attributes that would be + * returned by CUDA's cuDeviceGetAttribute(). + * + * @param device The identifier of the target device + * @param major Reference in which to return the major CUDA compute capability + * @param minor Reference in which to return the minor CUDA compute capability + * + * @return + * - \ref NVML_SUCCESS if \a major and \a minor have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor); + +/** + * Retrieves the current and pending ECC modes for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * + * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following + * the next reboot. + * + * See \ref nvmlEnableState_t for details on allowed modes. 
+ * + * @param device The identifier of the target device + * @param current Reference in which to return the current ECC mode + * @param pending Reference in which to return the pending ECC mode + * + * @return + * - \ref NVML_SUCCESS if \a current and \a pending have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending); + +/** + * Retrieves the default ECC modes for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param defaultMode Reference in which to return the default ECC mode + * + * @return + * - \ref NVML_SUCCESS if \a defaultMode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t *defaultMode); + +/** + * Retrieves the device boardId from 0-N.
+ * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with + * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well. + * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across + * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and + * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will + * always return those values but they will always be different from each other). + * + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param boardId Reference in which to return the device's board ID + * + * @return + * - \ref NVML_SUCCESS if \a boardId has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId); + +/** + * Retrieves whether the device is on a Multi-GPU Board + * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value. + * + * For Fermi &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param multiGpuBool Reference in which to return a zero or non-zero value + * to indicate whether the device is on a multi GPU board + * + * @return + * - \ref NVML_SUCCESS if \a multiGpuBool has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool); + +/** + * Retrieves the total ECC error counts for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * Requires ECC Mode to be enabled. + * + * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of + * errors across the entire device. + * + * See \ref nvmlMemoryErrorType_t for a description of available error types.\n + * See \ref nvmlEccCounterType_t for a description of available counter types. + * + * @param device The identifier of the target device + * @param errorType Flag that specifies the type of the errors. + * @param counterType Flag that specifies the counter-type of the errors. 
+ * @param eccCounts Reference in which to return the specified ECC errors + * + * @return + * - \ref NVML_SUCCESS if \a eccCounts has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceClearEccErrorCounts() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts); + +/** + * Retrieves the detailed ECC error counts for the device. + * + * @deprecated This API supports only a fixed set of ECC error locations + * On different GPU architectures different locations are supported + * See \ref nvmlDeviceGetMemoryErrorCounter + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts. + * Requires ECC Mode to be enabled. + * + * Detailed errors provide separate ECC counts for specific parts of the memory system. + * + * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported. + * + * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n + * See \ref nvmlEccCounterType_t for a description of available counter types.\n + * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts. + * + * @param device The identifier of the target device + * @param errorType Flag that specifies the type of the errors. 
+ * @param counterType Flag that specifies the counter-type of the errors. + * @param eccCounts Reference in which to return the specified ECC errors + * + * @return + * - \ref NVML_SUCCESS if \a eccCounts has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceClearEccErrorCounts() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts); + +/** + * Retrieves the requested memory error counter for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts. + * + * Only applicable to devices with ECC. + * + * Requires ECC Mode to be enabled. + * + * @note On MIG-enabled GPUs, per instance information can be queried using specific + * MIG device handles. Per instance information is currently only supported for + * non-DRAM uncorrectable volatile errors. Querying volatile errors using device + * handles is currently not supported. + * + * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n + * See \ref nvmlEccCounterType_t for a description of available counter types.\n + * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n + * + * @param device The identifier of the target device + * @param errorType Flag that specifies the type of error. 
+ * @param counterType Flag that specifies the counter-type of the errors. + * @param locationType Specifies the location of the counter. + * @param count Reference in which to return the ECC counter + * + * @return + * - \ref NVML_SUCCESS if \a count has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a bitType, \a counterType or \a locationType is + * invalid, or \a count is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support ECC error reporting in the specified memory + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, + nvmlEccCounterType_t counterType, + nvmlMemoryLocation_t locationType, unsigned long long *count); + +/** + * Retrieves the current utilization rates for the device's major subsystems. + * + * For Fermi &tm; or newer fully supported devices. + * + * See \ref nvmlUtilization_t for details on available utilization rates. + * + * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. + * This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization. + * + * @note On MIG-enabled GPUs, querying device utilization rates is not currently supported.
+ * + * @param device The identifier of the target device + * @param utilization Reference in which to return the utilization information + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a utilization is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization); + +/** + * Retrieves the current utilization and sampling size in microseconds for the Encoder + * + * For Kepler &tm; or newer fully supported devices. + * + * @note On MIG-enabled GPUs, querying encoder utilization is not currently supported. + * + * @param device The identifier of the target device + * @param utilization Reference to an unsigned int for encoder utilization info + * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); + +/** + * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid 
values in the range 0-100. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param encoderQueryType Type of encoder to query + * @param encoderCapacity Reference to an unsigned int for the encoder capacity + * + * @return + * - \ref NVML_SUCCESS if \a encoderCapacity is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a encoderCapacity is NULL, or \a device or \a encoderQueryType + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if device does not support the encoder specified in \a encodeQueryType + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity); + +/** + * Retrieves the current encoder statistics for a given device. + * + * For Maxwell &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param sessionCount Reference to an unsigned int for count of active encoder sessions + * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions + * @param averageLatency Reference to an unsigned int for encode latency in microseconds + * + * @return + * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a device or \a averageFps, + * or \a averageLatency is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount, + unsigned int *averageFps, unsigned int *averageLatency); + +/** + * Retrieves information about active encoder sessions on a target device. + * + * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The + * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions + * written to the buffer. + * + * If the supplied buffer is not large enough to accommodate the active session array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. + * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return + * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param sessionCount Reference to caller supplied array size, and returns the number of sessions.
+ * @param sessionInfos Reference in which to return the session information + * + * @return + * - \ref NVML_SUCCESS if \a sessionInfos is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos); + +/** + * Retrieves the current utilization and sampling size in microseconds for the Decoder + * + * For Kepler &tm; or newer fully supported devices. + * + * @note On MIG-enabled GPUs, querying decoder utilization is not currently supported. 
+ * + * @param device The identifier of the target device + * @param utilization Reference to an unsigned int for decoder utilization info + * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US + * + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); + +/** +* Retrieves the active frame buffer capture sessions statistics for a given device. +* +* For Maxwell &tm; or newer fully supported devices. +* +* @param device The identifier of the target device +* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats +* +* @return +* - \ref NVML_SUCCESS if \a fbcStats is fetched +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a fbcStats is NULL +* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t *fbcStats); + +/** +* Retrieves information about active frame buffer capture sessions on a target device. +* +* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The +* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions +* written to the buffer.
+* +* If the supplied buffer is not large enough to accommodate the active session array, the function returns +* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. +* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return +* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. +* +* For Maxwell &tm; or newer fully supported devices. +* +* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may +* be zero if there are no new frames captured since the session started. +* +* @param device The identifier of the target device +* @param sessionCount Reference to caller supplied array size, and returns the number of sessions. +* @param sessionInfo Reference in which to return the session information +* +* @return +* - \ref NVML_SUCCESS if \a sessionInfo is fetched +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. +* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo); + +/** + * Retrieves the current and pending driver model for the device. + * + * For Fermi &tm; or newer fully supported devices. + * For windows only. + * + * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached + * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached. + * + * See \ref nvmlDriverModel_t for details on available driver models.
+ * + * @param device The identifier of the target device + * @param current Reference in which to return the current driver model + * @param pending Reference in which to return the pending driver model + * + * @return + * - \ref NVML_SUCCESS if either \a current and/or \a pending have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or both \a current and \a pending are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetDriverModel() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); + +/** + * Get VBIOS version of the device. + * + * For all products. + * + * The VBIOS version may change from time to time. It will not exceed 32 characters in length + * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE. 
+ * + * @param device The identifier of the target device + * @param version Reference to which to return the VBIOS version + * @param length The maximum allowed length of the string returned in \a version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length); + +/** + * Get Bridge Chip Information for all the bridge chips on the board. + * + * For all fully supported products. + * Only applicable to multi-GPU products. + * + * @param device The identifier of the target device + * @param bridgeHierarchy Reference to the returned bridge chip Hierarchy + * + * @return + * - \ref NVML_SUCCESS if bridge chip exists + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bridgeInfo is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if bridge chip not supported on the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy); + +/** + * Get information about processes with a compute context on a device + * + * For Fermi &tm; or newer fully supported devices. + * + * This function returns information only about compute running processes (e.g. CUDA application which have + * active context). Any graphics applications (e.g. 
using OpenGL, DirectX) won't be listed by this function. + * + * To query the current number of running compute processes, call this function with *infoCount = 0. The + * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call + * \a infos is allowed to be NULL. + * + * The usedGpuMemory field returned is all of the memory used by the application. + * + * Keep in mind that information returned by this call is dynamic and the number of elements might change in + * time. Allocate more space for \a infos table in case new compute processes are spawned. + * + * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + * the caller has appropriate privileges. Per-instance information can be queried by using + * specific MIG device handles. + * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. + * + * @param device The device handle or MIG device handle + * @param infoCount Reference in which to provide the \a infos array size, and + * to return the number of returned elements + * @param infos Reference in which to return the process information + * + * @return + * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small + * \a infoCount will contain minimal amount of space necessary for + * the call to complete + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device + * - \ref 
NVML_ERROR_UNKNOWN on any unexpected error + * + * @see \ref nvmlSystemGetProcessName + */ +nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); + +/** + * Get information about processes with a graphics context on a device + * + * For Kepler &tm; or newer fully supported devices. + * + * This function returns information only about graphics based processes + * (eg. applications using OpenGL, DirectX) + * + * To query the current number of running graphics processes, call this function with *infoCount = 0. The + * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call + * \a infos is allowed to be NULL. + * + * The usedGpuMemory field returned is all of the memory used by the application. + * + * Keep in mind that information returned by this call is dynamic and the number of elements might change in + * time. Allocate more space for \a infos table in case new graphics processes are spawned. + * + * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + * the caller has appropriate privileges. Per-instance information can be queried by using + * specific MIG device handles. + * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. 
+ * + * @param device The device handle or MIG device handle + * @param infoCount Reference in which to provide the \a infos array size, and + * to return the number of returned elements + * @param infos Reference in which to return the process information + * + * @return + * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small + * \a infoCount will contain minimal amount of space necessary for + * the call to complete + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see \ref nvmlSystemGetProcessName + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); + +/** + * Get information about processes with a MPS compute context on a device + * + * For Volta &tm; or newer fully supported devices. + * + * This function returns information only about compute running processes (e.g. CUDA application which have + * active context) utilizing MPS. Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by + * this function. + * + * To query the current number of running compute processes, call this function with *infoCount = 0. The + * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call + * \a infos is allowed to be NULL. + * + * The usedGpuMemory field returned is all of the memory used by the application. 
+ * + * Keep in mind that information returned by this call is dynamic and the number of elements might change in + * time. Allocate more space for \a infos table in case new compute processes are spawned. + * + * @note In MIG mode, if device handle is provided, the API returns aggregate information, only if + * the caller has appropriate privileges. Per-instance information can be queried by using + * specific MIG device handles. + * Querying per-instance information using MIG device handles is not supported if the device is in vGPU Host virtualization mode. + * + * @param device The device handle or MIG device handle + * @param infoCount Reference in which to provide the \a infos array size, and + * to return the number of returned elements + * @param infos Reference in which to return the process information + * + * @return + * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small + * \a infoCount will contain minimal amount of space necessary for + * the call to complete + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see \ref nvmlSystemGetProcessName + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v3(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); + +/** + * Check if the GPU devices are on the same physical board. + * + * For all fully supported products. + * + * @param device1 The first GPU device + * @param device2 The second GPU device + * @param onSameBoard Reference in which to return the status. 
+ * Non-zero indicates that the GPUs are on the same board. + * + * @return + * - \ref NVML_SUCCESS if \a onSameBoard has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a dev1 or \a dev2 are invalid or \a onSameBoard is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the either GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); + +/** + * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs. + * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions. + * + * For all fully supported products. + * + * @param device The identifier of the target device + * @param apiType Target API type for this operation + * @param isRestricted Reference in which to return the current restriction + * NVML_FEATURE_ENABLED indicates that the API is root-only + * NVML_FEATURE_DISABLED indicates that the API is accessible to all users + * + * @return + * - \ref NVML_SUCCESS if \a isRestricted has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device or the device does not support + * the feature that is being queried (E.G. 
Enabling/disabling Auto Boosted clocks is + * not supported by the device) + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlRestrictedAPI_t + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted); + +/** + * Gets recent samples for the GPU. + * + * For Kepler &tm; or newer fully supported devices. + * + * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by + * the driver. + * + * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t. + * + * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. + * The returned samplesCount will provide the number of samples that can be queried. The user needs to + * allocate the buffer with size as samplesCount * sizeof(nvmlSample_t). + * + * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the + * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the date of the previous query + * to get more recent samples. + * + * This method fetches the number of entries which can be accommodated in the provided samples array, and the + * reference samplesCount is updated to indicate how many samples were actually retrieved. The advantage of using this + * method for samples in contrast to polling via existing methods is to get higher frequency data at lower polling cost. + * + * @note On MIG-enabled GPUs, querying the following sample types, NVML_GPU_UTILIZATION_SAMPLES, NVML_MEMORY_UTILIZATION_SAMPLES + * NVML_ENC_UTILIZATION_SAMPLES and NVML_DEC_UTILIZATION_SAMPLES, is not currently supported.
+ * + * @param device The identifier for the target device + * @param type Type of sampling event + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. + * @param sampleValType Output parameter to represent the type of sample value as described in nvmlSampleVal_t + * @param sampleCount Reference to provide the number of elements which can be queried in samples array + * @param samples Reference in which samples are returned + + * @return + * - \ref NVML_SUCCESS if samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a samplesCount is NULL or + * reference to \a sampleCount is 0 for non null \a samples + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, + nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples); + +/** + * Gets Total, Available and Used size of BAR1 memory. + * + * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party + * devices (peer-to-peer on the PCIE bus). + * + * @note In MIG mode, if device handle is provided, the API returns aggregate + * information, only if the caller has appropriate privileges. Per-instance + * information can be queried by using specific MIG device handles. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param bar1Memory Reference in which BAR1 memory + * information is returned. 
+ * + * @return + * - \ref NVML_SUCCESS if BAR1 memory is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a bar1Memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory); + +/** + * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power + * or thermal constraints. + * + * The method is important to users who are trying to understand if their GPUs throttle at any point during their applications. The + * difference in violation times at two different reference times gives the indication of GPU throttling event. + * + * Violation for thermal capping is not supported at this time. + * + * For Kepler &tm; or newer fully supported devices.
+ * + * @param device The identifier of the target device + * @param perfPolicyType Represents Performance policy which can trigger GPU throttling + * @param violTime Reference to which violation time related information is returned + * + * + * @return + * - \ref NVML_SUCCESS if violation time is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime); + +/** + * Gets the device's interrupt number + * + * @param device The identifier of the target device + * @param irqNum The interrupt number associated with the specified device + * + * @return + * - \ref NVML_SUCCESS if irq number is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a irqNum is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetIrqNum(nvmlDevice_t device, unsigned int *irqNum); + +/** + * Gets the device's core count + * + * @param device The identifier of the target device + * @param numCores The number of cores for the specified device + * + * @return + * - \ref NVML_SUCCESS if Gpu core count is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a numCores is NULL 
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNumGpuCores(nvmlDevice_t device, unsigned int *numCores); + +/** + * Gets the devices power source + * + * @param device The identifier of the target device + * @param powerSource The power source of the device + * + * @return + * - \ref NVML_SUCCESS if the current power source was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a powerSource is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPowerSource(nvmlDevice_t device, nvmlPowerSource_t *powerSource); + +/** + * Gets the device's memory bus width + * + * @param device The identifier of the target device + * @param maxSpeed The devices's memory bus width + * + * @return + * - \ref NVML_SUCCESS if the memory bus width is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a busWidth is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemoryBusWidth(nvmlDevice_t device, unsigned int *busWidth); + +/** + * Gets the device's PCIE Max Link speed in MBPS + * + * @param device The identifier of the target device + * @param maxSpeed The devices's PCIE Max Link speed in MBPS + * + * @return + * - \ref NVML_SUCCESS if Pcie Max Link Speed is successfully retrieved + * - \ref 
NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a maxSpeed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPcieLinkMaxSpeed(nvmlDevice_t device, unsigned int *maxSpeed); + +/** + * Gets the device's PCIe Link speed in Mbps + * + * @param device The identifier of the target device + * @param pcieSpeed The devices's PCIe Max Link speed in Mbps + * + * @return + * - \ref NVML_SUCCESS if \a pcieSpeed has been retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pcieSpeed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support PCIe speed getting + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int *pcieSpeed); + +/** + * Gets the device's Adaptive Clock status + * + * @param device The identifier of the target device + * @param adaptiveClockStatus The current adaptive clocking status + * + * @return + * - \ref NVML_SUCCESS if the current adaptive clocking status is successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a adaptiveClockStatus is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAdaptiveClockInfoStatus(nvmlDevice_t device, unsigned int *adaptiveClockStatus); + +/** + * @} + */ + +/** @addtogroup nvmlAccountingStats + * @{ + */ + +/** + * Queries the state 
of per process accounting mode. + * + * For Kepler &tm; or newer fully supported devices. + * + * See \ref nvmlDeviceGetAccountingStats for more details. + * See \ref nvmlDeviceSetAccountingMode + * + * @param device The identifier of the target device + * @param mode Reference in which to return the current accounting mode + * + * @return + * - \ref NVML_SUCCESS if the mode has been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); + +/** + * Queries process's accounting stats. + * + * For Kepler &tm; or newer fully supported devices. + * + * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. + * Accounting stats can be queried during life time of the process and after its termination. + * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and + * updated to actual running time after its termination. + * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old + * processes. + * + * See \ref nvmlAccountingStats_t for description of each returned metric. + * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. + * + * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. + * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be + * queried since they don't contribute to GPU utilization. 
+ * @note In case of pid collision stats of only the latest process (that terminated last) will be reported + * + * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. + * + * @param device The identifier of the target device + * @param pid Process Id of the target process to query stats for + * @param stats Reference in which to return the process's accounting stats + * + * @return + * - \ref NVML_SUCCESS if stats have been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL + * - \ref NVML_ERROR_NOT_FOUND if process stats were not found + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled + * or on vGPU host. + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetAccountingBufferSize + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); + +/** + * Queries list of processes that can be queried for accounting stats. The list of processes returned + * can be in running or terminated state. + * + * For Kepler &tm; or newer fully supported devices. + * + * To just query the number of processes ready to be queried, call this function with *count = 0 and + * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. + * + * For more details see \ref nvmlDeviceGetAccountingStats. + * + * @note In case of PID collision some processes might not be accessible before the circular buffer is full. 
+ * + * @param device The identifier of the target device + * @param count Reference in which to provide the \a pids array size, and + * to return the number of elements ready to be queried + * @param pids Reference in which to return list of process ids + * + * @return + * - \ref NVML_SUCCESS if pids were successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature or accounting mode is disabled + * or on vGPU host. + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to + * expected value) + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetAccountingBufferSize + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); + +/** + * Returns the number of processes that the circular buffer with accounting pids can hold. + * + * For Kepler &tm; or newer fully supported devices. + * + * This is the maximum number of processes that accounting information will be stored for before information + * about oldest processes will get overwritten by information about new processes. + * + * @param device The identifier of the target device + * @param bufferSize Reference in which to provide the size (in number of elements) + * of the circular buffer for accounting stats. 
+ * + * @return + * - \ref NVML_SUCCESS if buffer size was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetAccountingStats + * @see nvmlDeviceGetAccountingPids + */ +nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); + +/** @} */ + +/** @addtogroup nvmlDeviceQueries + * @{ + */ + +/** + * Returns the list of retired pages by source, including pages that are pending retirement + * The address information provided from this API is the hardware address of the page that was retired. Note + * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param cause Filter page addresses by cause of retirement + * @param pageCount Reference in which to provide the \a addresses buffer size, and + * to return the number of retired pages that match \a cause + * Set to 0 to query the size without allocating an \a addresses buffer + * @param addresses Buffer to write the page addresses into + * + * @return + * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the + * matching page addresses. \a pageCount is set to the needed size. 
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or + * \a addresses is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, + unsigned int *pageCount, unsigned long long *addresses); + +/** + * Returns the list of retired pages by source, including pages that are pending retirement + * The address information provided from this API is the hardware address of the page that was retired. Note + * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 + * + * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps paramter to return the time of each page's + * retirement. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param cause Filter page addresses by cause of retirement + * @param pageCount Reference in which to provide the \a addresses buffer size, and + * to return the number of retired pages that match \a cause + * Set to 0 to query the size without allocating an \a addresses buffer + * @param addresses Buffer to write the page addresses into + * @param timestamps Buffer to write the timestamps of page retirement, additional for _v2 + * + * @return + * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the + * matching page addresses. \a pageCount is set to the needed size. 
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or + * \a addresses is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause, + unsigned int *pageCount, unsigned long long *addresses, unsigned long long *timestamps); + +/** + * Check if any pages are pending retirement and need a reboot to fully retire. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param isPending Reference in which to return the pending status + * + * @return + * - \ref NVML_SUCCESS if \a isPending was populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending); + +/** + * Get number of remapped rows. The number of rows reported will be based on + * the cause of the remapping. isPending indicates whether or not there are + * pending remappings. A reset will be required to actually remap the row. + * failureOccurred will be set if a row remapping ever failed in the past. A + * pending remapping won't affect future work on the GPU since + * error-containment and dynamic page blacklisting will take care of that. 
+ * + * @note On MIG-enabled GPUs with active instances, querying the number of + * remapped rows is not supported + * + * For Ampere &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param corrRows Reference for number of rows remapped due to correctable errors + * @param uncRows Reference for number of rows remapped due to uncorrectable errors + * @param isPending Reference for whether or not remappings are pending + * @param failureOccurred Reference that is set when a remapping has failed in the past + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a corrRows, \a uncRows, \a isPending or \a failureOccurred is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If MIG is enabled or if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN Unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRemappedRows(nvmlDevice_t device, unsigned int *corrRows, unsigned int *uncRows, + unsigned int *isPending, unsigned int *failureOccurred); + +/** + * Get the row remapper histogram. Returns the remap availability for each bank + * on the GPU. + * + * @param device Device handle + * @param values Histogram values + * + * @return + * - \ref NVML_SUCCESS On success + * - \ref NVML_ERROR_UNKNOWN On any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetRowRemapperHistogram(nvmlDevice_t device, nvmlRowRemapperHistogramValues_t *values); + +/** + * Get architecture for device + * + * @param device The identifier of the target device + * @param arch Reference where architecture is returned, if call successful. 
+ * Set to NVML_DEVICE_ARCH_* upon success + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device or \a arch (output reference) are invalid + */ +nvmlReturn_t DECLDIR nvmlDeviceGetArchitecture(nvmlDevice_t device, nvmlDeviceArchitecture_t *arch); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlUnitCommands Unit Commands + * This chapter describes NVML operations that change the state of the unit. For S-class products. + * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION + * error code when invoking any of these methods. + * @{ + */ +/***************************************************************************************************/ + +/** + * Set the LED state for the unit. The LED can be either green (0) or amber (1). + * + * For S-class products. + * Requires root/admin permissions. + * + * This operation takes effect immediately. + * + * + * Current S-Class products don't provide unique LEDs for each unit. As such, both front + * and back LEDs will be toggled in unison regardless of which unit is specified with this command. + * + * See \ref nvmlLedColor_t for available colors.
+ * + * @param unit The identifier of the target unit + * @param color The target LED color + * + * @return + * - \ref NVML_SUCCESS if the LED color has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlUnitGetLedState() + */ +nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceCommands Device Commands + * This chapter describes NVML operations that change the state of the device. + * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION + * error code when invoking any of these methods. + * @{ + */ +/***************************************************************************************************/ + +/** + * Set the persistence mode for the device. + * + * For all products. + * For Linux only. + * Requires root/admin permissions. + * + * The persistence mode determines whether the GPU driver software is torn down after the last client + * exits. + * + * This operation takes effect immediately. It is not persistent across reboots. After each reboot the + * persistence mode is reset to "Disabled". + * + * See \ref nvmlEnableState_t for available modes. + * + * After calling this API with mode set to NVML_FEATURE_DISABLED on a device that has its own NUMA + * memory, the given device handle will no longer be valid, and to continue to interact with this + * device, a new handle should be obtained from one of the nvmlDeviceGetHandleBy*() APIs. 
This + * limitation is currently only applicable to devices that have a coherent NVLink connection to + * system memory. + * + * @param device The identifier of the target device + * @param mode The target persistence mode + * + * @return + * - \ref NVML_SUCCESS if the persistence mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetPersistenceMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode); + +/** + * Set the compute mode for the device. + * + * For all products. + * Requires root/admin permissions. + * + * The compute mode determines whether a GPU can be used for compute operations and whether it can + * be shared across contexts. + * + * This operation takes effect immediately. Under Linux it is not persistent across reboots and + * always resets to "Default". Under windows it is persistent. + * + * Under windows compute mode may only be set to DEFAULT when running in WDDM + * + * @note On MIG-enabled GPUs, compute mode would be set to DEFAULT and changing it is not supported. + * + * See \ref nvmlComputeMode_t for details on available compute modes. 
+ * + * @param device The identifier of the target device + * @param mode The target compute mode + * + * @return + * - \ref NVML_SUCCESS if the compute mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetComputeMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode); + +/** + * Set the ECC mode for the device. + * + * For Kepler &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * Requires root/admin permissions. + * + * The ECC mode determines whether the GPU enables its ECC support. + * + * This operation takes effect after the next reboot. + * + * See \ref nvmlEnableState_t for details on available modes. 
+ * + * @param device The identifier of the target device + * @param ecc The target ECC mode + * + * @return + * - \ref NVML_SUCCESS if the ECC mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetEccMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); + +/** + * Clear the ECC error and other memory error counts for the device. + * + * For Kepler &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. + * Requires root/admin permissions. + * Requires ECC Mode to be enabled. + * + * Sets all of the specified ECC counters to 0, including both detailed and total counts. + * + * This operation takes effect immediately. + * + * See \ref nvmlMemoryErrorType_t for details on available counter types. + * + * @param device The identifier of the target device + * @param counterType Flag that indicates which type of errors should be cleared. 
+ * + * @return + * - \ref NVML_SUCCESS if the error counts were cleared + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see + * - nvmlDeviceGetDetailedEccErrors() + * - nvmlDeviceGetTotalEccErrors() + */ +nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType); + +/** + * Set the driver model for the device. + * + * For Fermi &tm; or newer fully supported devices. + * For windows only. + * Requires root/admin permissions. + * + * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached + * to the device it must run in WDDM mode. + * + * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). + * This should only be done if the host is subsequently powered down and the display is detached from the device + * before the next reboot. + * + * This operation takes effect after the next reboot. + * + * Windows driver model may only be set to WDDM when running in DEFAULT compute mode. + * + * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or + * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode. + * + * See \ref nvmlDriverModel_t for details on available driver models. 
+ * See \ref nvmlFlagDefault and \ref nvmlFlagForce + * + * @param device The identifier of the target device + * @param driverModel The target driver model + * @param flags Flags that change the default behavior + * + * @return + * - \ref NVML_SUCCESS if the driver model has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetDriverModel() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags); + +typedef enum nvmlClockLimitId_enum { + NVML_CLOCK_LIMIT_ID_RANGE_START = 0xffffff00, + NVML_CLOCK_LIMIT_ID_TDP, + NVML_CLOCK_LIMIT_ID_UNLIMITED +} nvmlClockLimitId_t; + +/** + * Set clocks that device will lock to. + * + * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz. + * Setting this will supercede application clock values and take effect regardless if a cuda app is running. + * See /ref nvmlDeviceSetApplicationsClocks + * + * Can be used as a setting to request constant performance. + * + * This can be called with a pair of integer clock frequencies in MHz, or a pair of /ref nvmlClockLimitId_t values. + * See the table below for valid combinations of these values. 
+ * + * minGpuClock | maxGpuClock | Effect + * ------------+-------------+-------------------------------------------------- + * tdp | tdp | Lock clock to TDP + * unlimited | tdp | Upper bound is TDP but clock may drift below this + * tdp | unlimited | Lower bound is TDP but clock may boost above this + * unlimited | unlimited | Unlocked (== nvmlDeviceResetGpuLockedClocks) + * + * If one arg takes one of these values, the other must be one of these values as + * well. Mixed numeric and symbolic calls return NVML_ERROR_INVALID_ARGUMENT. + * + * Requires root/admin permissions. + * + * After system reboot or driver reload applications clocks go back to their default value. + * See \ref nvmlDeviceResetGpuLockedClocks. + * + * For Volta &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param minGpuClockMHz Requested minimum gpu clock in MHz + * @param maxGpuClockMHz Requested maximum gpu clock in MHz + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz + * is not a valid clock combination + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz); + +/** + * Resets the gpu clock to the default value + * + * This is the gpu clock that will be used after system reboot or driver reload. + * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. 
+ * + * @see nvmlDeviceSetGpuLockedClocks + * + * For Volta &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device); + +/** + * Set memory clocks that device will lock to. + * + * Sets the device's memory clocks to the value in the range of minMemClockMHz to maxMemClockMHz. + * Setting this will supersede application clock values and take effect regardless of whether a cuda app is running. + * See /ref nvmlDeviceSetApplicationsClocks + * + * Can be used as a setting to request constant performance. + * + * Requires root/admin permissions. + * + * After system reboot or driver reload applications clocks go back to their default value. + * See \ref nvmlDeviceResetMemoryLockedClocks. + * + * For Ampere &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param minMemClockMHz Requested minimum memory clock in MHz + * @param maxMemClockMHz Requested maximum memory clock in MHz + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz + * is not a valid clock combination + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetMemoryLockedClocks(nvmlDevice_t device, unsigned int minMemClockMHz, unsigned int maxMemClockMHz); + +/** + * Resets the memory clock to the default value + * + * This is the memory clock that will be used after system reboot or driver reload. + * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. + * + * @see nvmlDeviceSetMemoryLockedClocks + * + * For Ampere &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetMemoryLockedClocks(nvmlDevice_t device); + +/** + * Set clocks that applications will lock to. 
+ * + * Sets the clocks that compute and graphics applications will be running at. + * e.g. CUDA driver requests these clocks during context creation which means this property + * defines clocks at which CUDA applications will be running unless some overspec event + * occurs (e.g. over power, over thermal or external HW brake). + * + * Can be used as a setting to request constant performance. + * + * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. + * + * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call + * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting + * above the clock value being set. + * + * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. + * Requires root/admin permissions. + * + * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks + * for details on how to list available clocks combinations. + * + * After system reboot or driver reload applications clocks go back to their default value. + * See \ref nvmlDeviceResetApplicationsClocks. 
+ * + * @param device The identifier of the target device + * @param memClockMHz Requested memory clock in MHz + * @param graphicsClockMHz Requested graphics clock in MHz + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz + * is not a valid clock combination + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); + +/** + * Retrieves the frequency monitor fault status for the device. + * + * For Ampere &tm; or newer fully supported devices. + * Requires root user. + * + * See \ref nvmlClkMonStatus_t for details on decoding the status output. + * + * @param device The identifier of the target device + * @param status Reference in which to return the clkmon fault status + * + * @return + * - \ref NVML_SUCCESS if \a status has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a status is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetClkMonStatus() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetClkMonStatus(nvmlDevice_t device, nvmlClkMonStatus_t *status); + +/** + * Set new power limit of this device. 
+ * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. + * + * \note Limit is not persistent across reboots or driver unloads. + * Enable persistent mode to prevent driver from unloading when no application is using the device. + * + * @param device The identifier of the target device + * @param limit Power management limit in milliwatts to set + * + * @return + * - \ref NVML_SUCCESS if \a limit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is out of range + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetPowerManagementLimitConstraints + * @see nvmlDeviceGetPowerManagementDefaultLimit + */ +nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit); + +/** + * Sets new GOM. See \a nvmlGpuOperationMode_t for details. + * + * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. + * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. + * Not supported on Quadro ® and Tesla &tm; C-class products. + * Requires root/admin permissions. + * + * Changing GOMs requires a reboot. + * The reboot requirement might be removed in the future. + * + * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when + * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel. 
+ * + * @param device The identifier of the target device + * @param mode Target GOM + * + * @return + * - \ref NVML_SUCCESS if \a mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode incorrect + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlGpuOperationMode_t + * @see nvmlDeviceGetGpuOperationMode + */ +nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode); + +/** + * Changes the root/admin restructions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs. + * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. + * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction + * to query the current restriction settings. + * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * @param device The identifier of the target device + * @param apiType Target API type for this operation + * @param isRestricted The target restriction + * + * @return + * - \ref NVML_SUCCESS if \a isRestricted has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType incorrect + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support + * the feature that api restrictions are being set for (E.G. 
Enabling/disabling auto + * boosted clocks is not supported by the device) + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlRestrictedAPI_t + */ +nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted); + +/** + * @} + */ + +/** @addtogroup nvmlAccountingStats + * @{ + */ + +/** + * Enables or disables per process accounting. + * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * @note This setting is not persistent and will default to disabled after driver unloads. + * Enable persistence mode to be sure the setting doesn't switch off to disabled. + * + * @note Enabling accounting mode has no negative impact on the GPU performance. + * + * @note Disabling accounting clears all accounting pids information. + * + * @note On MIG-enabled GPUs, accounting mode would be set to DISABLED and changing it is not supported. 
+ * + * See \ref nvmlDeviceGetAccountingMode + * See \ref nvmlDeviceGetAccountingStats + * See \ref nvmlDeviceClearAccountingPids + * + * @param device The identifier of the target device + * @param mode The target accounting mode + * + * @return + * - \ref NVML_SUCCESS if the new mode has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode); + +/** + * Clears accounting information about all processes that have already terminated. + * + * For Kepler &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * See \ref nvmlDeviceGetAccountingMode + * See \ref nvmlDeviceGetAccountingStats + * See \ref nvmlDeviceSetAccountingMode + * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if accounting information has been cleared + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup NvLink NvLink Methods + * This chapter describes methods that NVML can perform on NVLINK enabled devices. 
+ * @{ + */ +/***************************************************************************************************/ + +/** + * Retrieves the state of the device's NvLink for the link specified + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that + * the link is active and NVML_FEATURE_DISABLED indicates it + * is inactive + * + * @return + * - \ref NVML_SUCCESS if \a isActive has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); + +/** + * Retrieves the version of the device's NvLink for the link specified + * + * For Pascal &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param version Requested NvLink version + * + * @return + * - \ref NVML_SUCCESS if \a version has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version); + +/** + * Retrieves the requested capability from the device's NvLink for the link specified + * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried + * The return value should be treated as a boolean. + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried + * @param capResult A boolean for the queried capability indicating that feature is available + * + * @return + * - \ref NVML_SUCCESS if \a capResult has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, + nvmlNvLinkCapability_t capability, unsigned int *capResult); + +/** + * Retrieves the PCI information for the remote node on a NvLink link + * Note: pciSubSystemId is not filled in this function and is indeterminate + * + * For Pascal &tm; or 
newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param pci \a nvmlPciInfo_t of the remote node for the specified link + * + * @return + * - \ref NVML_SUCCESS if \a pci has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); + +/** + * Retrieves the specified error counter value + * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param counter Specifies the NvLink counter to be queried + * @param counterValue Returned counter value + * + * @return + * - \ref NVML_SUCCESS if \a counter has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, + nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue); + +/** + * Resets all error counters to zero + * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset + * + * For Pascal &tm; or newer fully supported devices. 
+ * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * + * @return + * - \ref NVML_SUCCESS if the reset is successful + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); + +/** + * Deprecated: Setting utilization counter control is no longer supported. + * + * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. + * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset + * of the counters if the reset parameter is non-zero. + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param counter Specifies the counter that should be set (0 or 1). + * @param link Specifies the NvLink link to be queried + * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set + * @param reset Resets the counters on set if non-zero + * + * @return + * - \ref NVML_SUCCESS if the control has been set successfully + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, + nvmlNvLinkUtilizationControl_t *control, unsigned int reset); + +/** + * Deprecated: Getting utilization counter control is no longer supported. 
+ * + * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. + * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param counter Specifies the counter that should be set (0 or 1). + * @param link Specifies the NvLink link to be queried + * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information + * + * @return + * - \ref NVML_SUCCESS if the control has been set successfully + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, + nvmlNvLinkUtilizationControl_t *control); + + +/** + * Deprecated: Use \ref nvmlDeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead. + * + * Retrieve the NVLINK utilization counter based on the current control for a specified counter. + * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl + * before reading the utilization counters as they have no default state + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param counter Specifies the counter that should be read (0 or 1). 
+ * @param rxcounter Receive counter return value + * @param txcounter Transmit counter return value + * + * @return + * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, + unsigned long long *rxcounter, unsigned long long *txcounter); + +/** + * Deprecated: Freezing NVLINK utilization counters is no longer supported. + * + * Freeze the NVLINK utilization counters + * Both the receive and transmit counters are operated on by this function + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be queried + * @param counter Specifies the counter that should be frozen (0 or 1). 
+ * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters + * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters + * + * @return + * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, + unsigned int counter, nvmlEnableState_t freeze); + +/** + * Deprecated: Resetting NVLINK utilization counters is no longer supported. + * + * Reset the NVLINK utilization counters + * Both the receive and transmit counters are operated on by this function + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param link Specifies the NvLink link to be reset + * @param counter Specifies the counter that should be reset (0 or 1) + * + * @return + * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); + +/** +* Get the NVLink device type of the remote device connected over the given link. 
+* +* @param device The device handle of the target GPU +* @param link The NVLink link index on the target GPU +* @param pNvLinkDeviceType Pointer in which the output remote device type is returned +* +* @return +* - \ref NVML_SUCCESS if \a pNvLinkDeviceType has been set +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_NOT_SUPPORTED if NVLink is not supported +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid, or +* \a pNvLinkDeviceType is NULL +* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is +* otherwise inaccessible +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlEvents Event Handling Methods + * This chapter describes methods that NVML can perform against each device to register and wait for + * some event to occur. + * @{ + */ +/***************************************************************************************************/ + +/** + * Create an empty set of events. + * Event set should be freed by \ref nvmlEventSetFree + * + * For Fermi &tm; or newer fully supported devices. + * @param set Reference in which to return the event handle + * + * @return + * - \ref NVML_SUCCESS if the event has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventSetFree + */ +nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); + +/** + * Starts recording of events on a specified devices and add the events to specified \ref nvmlEventSet_t + * + * For Fermi &tm; or newer fully supported devices. 
+ * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) + * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode) + * + * For Linux only. + * + * \b IMPORTANT: Operations on \a set are not thread safe + * + * This call starts recording of events on specific device. + * All events that occurred before this call are not recorded. + * Checking if some event occurred can be done with \ref nvmlEventSetWait_v2 + * + * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. + * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes + * are registered in that case. + * + * @param device The identifier of the target device + * @param eventTypes Bitmask of \ref nvmlEventType to record + * @param set Set to which add new event types + * + * @return + * - \ref NVML_SUCCESS if the event has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested event types + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceGetSupportedEventTypes + * @see nvmlEventSetWait + * @see nvmlEventSetFree + */ +nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set); + +/** + * Returns information about events supported on device + * + * For Fermi &tm; or newer fully supported devices. + * + * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. 
+ * + * @param device The identifier of the target device + * @param eventTypes Reference in which to return bitmask of supported events + * + * @return + * - \ref NVML_SUCCESS if the eventTypes has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventType is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceRegisterEvents + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); + +/** + * Waits on events and delivers events + * + * For Fermi &tm; or newer fully supported devices. + * + * If some events are ready to be delivered at the time of the call, function returns immediately. + * If there are no events ready to be delivered, function sleeps till event arrives + * but not longer than specified timeout. This function in certain conditions can return before + * specified timeout passes (e.g. when interrupt arrives) + * + * On Windows, in case of xid error, the function returns the most recent xid error type seen by the system. + * If there are multiple xid errors generated before nvmlEventSetWait is invoked then the last seen xid error + * type is returned for all xid error events. + * + * On Linux, every xid error event would return the associated event data and other information if applicable. + * + * In MIG mode, if device handle is provided, the API reports all the events for the available instances, + * only if the caller has appropriate privileges. In absence of required privileges, only the events which + * affect all the instances (i.e. whole device) are reported. + * + * This API does not currently support per-instance event reporting using MIG device handles. 
+ * + * @param set Reference to set of events to wait on + * @param data Reference in which to return event data + * @param timeoutms Maximum amount of wait time in milliseconds for registered event + * + * @return + * - \ref NVML_SUCCESS if the data has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL + * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived + * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlEventType + * @see nvmlDeviceRegisterEvents + */ +nvmlReturn_t DECLDIR nvmlEventSetWait_v2(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); + +/** + * Releases events in the set + * + * For Fermi &tm; or newer fully supported devices. + * + * @param set Reference to events to be released + * + * @return + * - \ref NVML_SUCCESS if the event has been successfully released + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceRegisterEvents + */ +nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlZPI Drain states + * This chapter describes methods that NVML can perform against each device to control their drain state + * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to + * power on/off GPUs, enable robust reset scenarios, etc. + * @{ + */ +/***************************************************************************************************/ + +/** + * Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests. 
+ * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before + * this call is made. + * Must be called as administrator. + * For Linux only. + * + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. + * + * @param pciInfo The PCI address of the GPU drain state to be modified + * @param newState The drain state that should be entered, see \ref nvmlEnableState_t + * + * @return + * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a newState is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation + * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState); + +/** + * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining + * state. + * For Linux only. + * + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. 
+ * + * @param pciInfo The PCI address of the GPU drain state to be queried + * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t + * + * @return + * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a currentState is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState); + +/** + * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver + * as long as no other processes are attached. If other processes are attached, this call will return + * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the + * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called + * to initiate the draining state is if that process was using, and is still using, a GPU before the + * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled + * prior to this call. + * + * For long-running NVML processes please note that this will change the enumeration of current GPUs. + * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2. + * Also, device handles after the removed GPU will not be valid and must be re-established. + * Must be run as administrator. + * For Linux only. + * + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. 
+ * + * @param pciInfo The PCI address of the GPU to be removed + * @param gpuState Whether the GPU is to be removed, from the OS + * see \ref nvmlDetachGpuState_t + * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t + * + * @return + * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature + * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed + */ +nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu_v2(nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); + +/** + * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that + * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. + * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes + * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order. + * + * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds + * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery. + * + * Must be run as administrator. + * For Linux only. + * + * For Pascal &tm; or newer fully supported devices. + * Some Kepler devices supported. + * + * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device + * fields are used in this call. 
+ * + * @return + * - \ref NVML_SUCCESS if counters were successfully reset + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature + * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature + * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlFieldValueQueries Field Value Queries + * This chapter describes NVML operations that are associated with retrieving Field Values from NVML + * @{ + */ +/***************************************************************************************************/ + +/** + * Request values for a list of fields for a device. This API allows multiple fields to be queried at once. + * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs + * will be populated from a single call rather than making a driver call for each fieldId. + * + * @param device The device handle of the GPU to request field values for + * @param valuesCount Number of entries in values that should be retrieved + * @param values Array of \a valuesCount structures to hold field values. + * Each value's fieldId must be populated prior to this call + * + * @return + * - \ref NVML_SUCCESS if any values in \a values were populated. 
Note that you must + * check the nvmlReturn field of each value for each individual + * status + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL + */ +nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); + + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup vGPU Enums, Constants and Structs + * @{ + */ +/** @} */ +/***************************************************************************************************/ + +/***************************************************************************************************/ +/** @defgroup nvmlVirtualGpuQueries vGPU APIs + * This chapter describes operations that are associated with NVIDIA vGPU Software products. + * @{ + */ +/***************************************************************************************************/ + +/** + * This method is used to get the virtualization mode corresponding to the GPU. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device Identifier of the target device + * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_? + * + * @return + * - \ref NVML_SUCCESS if \a pVirtualMode is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode); + +/** + * Queries if SR-IOV host operation is supported on a vGPU supported device. 
+ * + * Checks whether SR-IOV host capability is supported by the device and the + * driver, and indicates device is in SR-IOV mode if both of these conditions + * are true. + * + * @param device The identifier of the target device + * @param pHostVgpuMode Reference in which to return the current vGPU mode + * + * @return + * - \ref NVML_SUCCESS if device's vGPU mode has been successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle is 0 or \a pVgpuMode is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device doesn't support this feature. + * - \ref NVML_ERROR_UNKNOWN if any unexpected error occurred + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHostVgpuMode(nvmlDevice_t device, nvmlHostVgpuMode_t *pHostVgpuMode); + +/** + * This method is used to set the virtualization mode corresponding to the GPU. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device Identifier of the target device + * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? + * + * @return + * - \ref NVML_SUCCESS if \a pVirtualMode is set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. + * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. + */ +nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); + +/** + * Retrieve the vGPU Software licensable features. + * + * Identifies whether the system supports vGPU Software Licensing. If it does, return the list of licensable feature(s) + * and their current license status. 
+ * + * @param device Identifier of the target device + * @param pGridLicensableFeatures Pointer to structure in which vGPU software licensable features are returned + * + * @return + * - \ref NVML_SUCCESS if licensable features are successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); + +/** + * Retrieves the current utilization and process ID + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. + * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at + * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization + * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values + * are returned as "unsigned int" values. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilization set to NULL. The caller should allocate a buffer of size + * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed + * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. + * + * On successful return, the function updates \a processSamplesCount with the number of process utilization sample + * structures that were actually written. This may differ from a previously read value as instances are created or + * destroyed. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. 
Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @note On MIG-enabled GPUs, querying process utilization is not currently supported. + * + * @param device The identifier of the target device + * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned + * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. + + * @return + * - \ref NVML_SUCCESS if \a utilization has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, + unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); + +/** + * Retrieve GSP firmware version. + * + * The caller passes in buffer via \a version and corresponding GSP firmware numbered version + * is returned with the same parameter in string format. 
+ * + * @param device Device handle + * @param version The retrieved GSP firmware version + * + * @return + * - \ref NVML_SUCCESS if GSP firmware version is sucessfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or GSP \a version pointer is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char *version); + +/** + * Retrieve GSP firmware mode. + * + * The caller passes in integer pointers. GSP firmware enablement and default mode information is returned with + * corresponding parameters. The return value in \a isEnabled and \a defaultMode should be treated as boolean. + * + * @param device Device handle + * @param isEnabled Pointer to specify if GSP firmware is enabled + * @param defaultMode Pointer to specify if GSP firmware is supported by default on \a device + * + * @return + * - \ref NVML_SUCCESS if GSP firmware mode is sucessfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or any of \a isEnabled or \a defaultMode is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int *isEnabled, unsigned int *defaultMode); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlVgpu vGPU Management + * @{ + * + * This chapter describes APIs supporting NVIDIA vGPU. + */ +/***************************************************************************************************/ + +/** + * Retrieve the supported vGPU types on a physical GPU (device). + * + * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer + * pointed at by \a vgpuTypeIds. 
The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount + * is used to return the number of vGPU types written to the buffer. + * + * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. + * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0. + * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported. + * + * @param device The identifier of the target device + * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types + * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); + +/** + * Retrieve the currently creatable vGPU types on a physical GPU (device). + * + * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer + * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount + * is used to return the number of vGPU types written to the buffer. + * + * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types + * can concurrently run on a device. 
For example, if only one vGPU type is allowed at a time on a device, then the creatable + * list will be restricted to whatever vGPU type is already running on the device. + * + * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. + * To query the number of vGPU types createable for the GPU, call this function with *vgpuCount = 0. + * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable. + * + * @param device The identifier of the target device + * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types + * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); + +/** + * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). + * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuTypeClass Pointer to string array to return class in + * @param size Size of string + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size); + +/** + * Retrieve the vGPU type name. + * + * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not + * exceed 64 characters in length (including the NUL terminator). See \ref + * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuTypeName Pointer to buffer to return name + * @param size Size of buffer + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a name is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size); + +/** + * Retrieve the GPU Instance Profile ID for the given vGPU type ID. + * The API will return a valid GPU Instance Profile ID for the MIG capable vGPU types, else INVALID_GPU_INSTANCE_PROFILE_ID is + * returned. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuTypeId Handle to vGPU type + * @param gpuInstanceProfileId GPU Instance Profile ID + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_NOT_SUPPORTED if \a device is not in vGPU Host virtualization mode + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a gpuInstanceProfileId is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetGpuInstanceProfileId(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *gpuInstanceProfileId); + +/** + * Retrieve the device ID of a vGPU type. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value + * @param subsystemID Subsytem ID and subsytem vendor ID of the device contained in single 32 bit value + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceId or \a subsystemID are NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID); + +/** + * Retrieve the vGPU framebuffer size in bytes. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuTypeId Handle to vGPU type + * @param fbSize Pointer to framebuffer size in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize); + +/** + * Retrieve count of vGPU's supported display heads. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param numDisplayHeads Pointer to number of display heads + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads); + +/** + * Retrieve vGPU display head's maximum supported resolution. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param displayIndex Zero-based index of display head + * @param xdim Pointer to maximum number of pixels in X dimension + * @param ydim Pointer to maximum number of pixels in Y dimension + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex + * is out of range. 
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim); + +/** + * Retrieve license requirements for a vGPU type + * + * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form + * ",", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with* more than one type of license, + * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0". + * + * The total length of the returned string will not exceed 128 characters, including the NUL terminator. + * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuTypeId Handle to vGPU type + * @param vgpuTypeLicenseString Pointer to buffer to return license info + * @param size Size of \a vgpuTypeLicenseString buffer + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size); + +/** + * Retrieve the static frame rate limit value of the vGPU type + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuTypeId Handle to vGPU type + * @param frameRateLimit Reference to return the frame rate limit value + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a frameRateLimit is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit); + +/** + * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type + * + * For Kepler &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param vgpuTypeId Handle to vGPU type + * @param vgpuInstanceCount Pointer to get the max number of vGPU instances + * that can be created on a deicve for given vgpuTypeId + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device, + * or \a vgpuInstanceCount is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount); + +/** + * Retrieve the maximum number of vGPU instances supported per VM for given vGPU type + * + * For Kepler &tm; or newer fully supported devices. 
+ *
+ * @param vgpuTypeId Handle to vGPU type
+ * @param vgpuInstanceCountPerVm Pointer to get the max number of vGPU instances supported per VM for given \a vgpuTypeId
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuInstanceCountPerVm is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstancesPerVm(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCountPerVm);
+
+/**
+ * Retrieve the active vGPU instances on a device.
+ *
+ * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The
+ * array element count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances
+ * written to the buffer.
+ *
+ * If the supplied buffer is not large enough to accommodate the vGPU instance array, the function returns
+ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount.
+ * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return
+ * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * + * @param device The identifier of the target device + * @param vgpuCount Pointer which passes in the array size as well as get + * back the number of types + * @param vgpuInstances Pointer to array in which to return list of vGPU instances + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances); + +/** + * Retrieve the VM ID associated with a vGPU instance. + * + * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator). + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param vmId Pointer to caller-supplied buffer to hold VM ID + * @param size Size of buffer in bytes + * @param vmIdType Pointer to hold VM ID type + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vmId or \a vmIdType is NULL, or \a vgpuInstance is 0 + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType); + +/** + * Retrieve the UUID of a vGPU instance. + * + * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string, + * not exceeding 80 characters in length (including the NULL terminator). + * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID + * @param size Size of buffer in bytes + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a uuid is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size); + +/** + * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU. + * + * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version + * string will not exceed 80 characters in length (including the NUL terminator). + * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. + * + * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is + * returned as "Not Available" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the + * NVIDIA driver is loaded and initialized. + * + * For Kepler &tm; or newer fully supported devices. 
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param version Caller-supplied buffer to return driver version string
+ * @param length Size of \a version buffer
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a version has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length);
+
+/**
+ * Retrieve the framebuffer usage in bytes.
+ *
+ * Framebuffer usage is the amount of vGPU framebuffer memory that is currently in use by the VM.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance The identifier of the target instance
+ * @param fbUsage Pointer to framebuffer usage in bytes
+ *
+ * @return
+ * - \ref NVML_SUCCESS successful completion
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbUsage is NULL
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage);
+
+/**
+ * @deprecated Use \ref nvmlVgpuInstanceGetLicenseInfo_v2.
+ *
+ * Retrieve the current licensing state of the vGPU instance.
+ *
+ * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param licensed Reference to return the licensing status + * + * @return + * - \ref NVML_SUCCESS if \a licensed has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licensed is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); + +/** + * Retrieve the vGPU type of a vGPU instance. + * + * Returns the vGPU type ID of vgpu assigned to the vGPU instance. + * + * For Kepler &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param vgpuTypeId Reference to return the vgpuTypeId + * + * @return + * - \ref NVML_SUCCESS if \a vgpuTypeId has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuTypeId is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId); + +/** + * Retrieve the frame rate limit set for the vGPU instance. + * + * Returns the value of the frame rate limit set for the vGPU instance + * + * For Kepler &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param frameRateLimit Reference to return the frame rate limit + * + * @return + * - \ref NVML_SUCCESS if \a frameRateLimit has been set + * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a frameRateLimit is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit); + +/** + * Retrieve the current ECC mode of vGPU instance. + * + * @param vgpuInstance The identifier of the target vGPU instance + * @param eccMode Reference in which to return the current ECC mode + * + * @return + * - \ref NVML_SUCCESS if the vgpuInstance's ECC mode has been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEccMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *eccMode); + +/** + * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. + * + * For Maxwell &tm; or newer fully supported devices. 
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param encoderCapacity Reference to an unsigned int for the encoder capacity
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a encoderCapacity has been retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderQueryType is invalid
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity);
+
+/**
+ * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param encoderCapacity Unsigned int for the encoder capacity value
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a encoderCapacity has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderCapacity is out of range of 0-100.
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity);
+
+/**
+ * Retrieves the current encoder statistics of a vGPU Instance
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ * + * @param vgpuInstance Identifier of the target vGPU instance + * @param sessionCount Reference to an unsigned int for count of active encoder sessions + * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions + * @param averageLatency Reference to an unsigned int for encode latency in microseconds + * + * @return + * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL + * or \a vgpuInstance is 0. + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, + unsigned int *averageFps, unsigned int *averageLatency); + +/** + * Retrieves information about all active encoder sessions on a vGPU Instance. + * + * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The + * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions + * written to the buffer. + * + * If the supplied buffer is not large enough to accomodate the active session array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. + * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return + * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. + * + * For Maxwell &tm; or newer fully supported devices. 
+ *
+ * @param vgpuInstance Identifier of the target vGPU instance
+ * @param sessionCount Reference to caller supplied array size, and returns
+ * the number of sessions.
+ * @param sessionInfo Reference to caller supplied array in which the list
+ * of session information is returned.
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a sessionInfo is fetched
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is
+ returned in \a sessionCount
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL, or \a vgpuInstance is 0.
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo);
+
+/**
+* Retrieves the active frame buffer capture sessions statistics of a vGPU Instance
+*
+* For Maxwell &tm; or newer fully supported devices.
+*
+* @param vgpuInstance Identifier of the target vGPU instance
+* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats
+*
+* @return
+* - \ref NVML_SUCCESS if \a fbcStats is fetched
+* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbcStats is NULL
+* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
+* - \ref NVML_ERROR_UNKNOWN on any unexpected error
+*/
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFBCStats_t *fbcStats);
+
+/**
+* Retrieves information about active frame buffer capture sessions on a vGPU Instance.
+*
+* An array of active FBC sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
+* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
+* written to the buffer.
+*
+* If the supplied buffer is not large enough to accommodate the active session array, the function returns
+* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount.
+* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return
+* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount.
+*
+* For Maxwell &tm; or newer fully supported devices.
+*
+* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may
+* be zero if there are no new frames captured since the session started.
+*
+* @param vgpuInstance Identifier of the target vGPU instance
+* @param sessionCount Reference to caller supplied array size, and returns the number of sessions.
+* @param sessionInfo Reference in which to return the session information
+*
+* @return
+* - \ref NVML_SUCCESS if \a sessionInfo is fetched
+* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a sessionCount is NULL.
+* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
+* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount
+* - \ref NVML_ERROR_UNKNOWN on any unexpected error
+*/
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo);
+
+/**
+* Retrieve the GPU Instance ID for the given vGPU Instance.
+* The API will return a valid GPU Instance ID for MIG backed vGPU Instance, else INVALID_GPU_INSTANCE_ID is returned.
+*
+* For Kepler &tm; or newer fully supported devices.
+* +* @param vgpuInstance Identifier of the target vGPU instance +* @param gpuInstanceId GPU Instance ID +* +* @return +* - \ref NVML_SUCCESS successful completion +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a gpuInstanceId is NULL. +* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuInstanceId(nvmlVgpuInstance_t vgpuInstance, unsigned int *gpuInstanceId); + +/** +* Retrieves the PCI Id of the given vGPU Instance i.e. the PCI Id of the GPU as seen inside the VM. +* +* The vGPU PCI id is returned as "00000000:00:00.0" if NVIDIA driver is not installed on the vGPU instance. +* +* @param vgpuInstance Identifier of the target vGPU instance +* @param vgpuPciId Caller-supplied buffer to return vGPU PCI Id string +* @param length Size of the vgpuPciId buffer +* +* @return +* - \ref NVML_SUCCESS if vGPU PCI Id is sucessfully retrieved +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuPciId is NULL +* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system +* - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance +* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small, \a length is set to required length +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuPciId(nvmlVgpuInstance_t vgpuInstance, char *vgpuPciId, unsigned int *length); + +/** +* Retrieve the requested capability for a given vGPU type. Refer to the \a nvmlVgpuCapability_t structure +* for the specific capabilities that can be queried. 
The return value in \a capResult should be treated as +* a boolean, with a non-zero value indicating that the capability is supported. +* +* For Maxwell &tm; or newer fully supported devices. +* +* @param vgpuTypeId Handle to vGPU type +* @param capability Specifies the \a nvmlVgpuCapability_t to be queried +* @param capResult A boolean for the queried capability indicating that feature is supported +* +* @return +* - \ref NVML_SUCCESS successful completion +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a capability is invalid, or \a capResult is NULL +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuCapability_t capability, unsigned int *capResult); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvml vGPU Migration + * This chapter describes operations that are associated with vGPU Migration. + * @{ + */ +/***************************************************************************************************/ + +/** + * Structure representing range of vGPU versions. + */ +typedef struct nvmlVgpuVersion_st +{ + unsigned int minVersion; //!< Minimum vGPU version. + unsigned int maxVersion; //!< Maximum vGPU version. +} nvmlVgpuVersion_t; + +/** + * vGPU metadata structure. 
+ */ +typedef struct nvmlVgpuMetadata_st +{ + unsigned int version; //!< Current version of the structure + unsigned int revision; //!< Current revision of the structure + nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields + char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest + char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host + unsigned int reserved[6]; //!< Reserved for internal use + unsigned int vgpuVirtualizationCaps; //!< vGPU virtualizaion capabilities bitfileld + unsigned int guestVgpuVersion; //!< vGPU version of guest driver + unsigned int opaqueDataSize; //!< Size of opaque data field in bytes + char opaqueData[4]; //!< Opaque data +} nvmlVgpuMetadata_t; + +/** + * Physical GPU metadata structure + */ +typedef struct nvmlVgpuPgpuMetadata_st +{ + unsigned int version; //!< Current version of the structure + unsigned int revision; //!< Current revision of the structure + char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version + unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualizaion capabilities bitfileld + unsigned int reserved[5]; //!< Reserved for internal use + nvmlVgpuVersion_t hostSupportedVgpuRange; //!< vGPU version range supported by host driver + unsigned int opaqueDataSize; //!< Size of opaque data field in bytes + char opaqueData[4]; //!< Opaque data +} nvmlVgpuPgpuMetadata_t; + +/** + * vGPU VM compatibility codes + */ +typedef enum nvmlVgpuVmCompatibility_enum +{ + NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0, //!< vGPU is not runnable + NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1, //!< vGPU is runnable from a cold / powered-off state (ACPI S5) + NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2, //!< vGPU is runnable from a hibernated state (ACPI S4) + NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4, //!< vGPU is runnable from a sleeped state (ACPI S3) + NVML_VGPU_VM_COMPATIBILITY_LIVE = 
0x8 //!< vGPU is runnable from a live/paused (ACPI S0) +} nvmlVgpuVmCompatibility_t; + +/** + * vGPU-pGPU compatibility limit codes + */ +typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum +{ + NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0, //!< Compatibility is not limited. + NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1, //!< ompatibility is limited by host driver version. + NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2, //!< Compatibility is limited by guest driver version. + NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4, //!< Compatibility is limited by GPU hardware. + NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000 //!< Compatibility is limited by an undefined factor. +} nvmlVgpuPgpuCompatibilityLimitCode_t; + +/** + * vGPU-pGPU compatibility structure + */ +typedef struct nvmlVgpuPgpuCompatibility_st +{ + nvmlVgpuVmCompatibility_t vgpuVmCompatibility; //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t + nvmlVgpuPgpuCompatibilityLimitCode_t compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t +} nvmlVgpuPgpuCompatibility_t; + +/** + * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM + * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section + * containing internal state. + * + * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are + * dependent on information obtained from the guest VM, which may not yet have reached a state where that information + * is available. The current state of these dependent fields is reflected in the info structure's \ref nvmlVgpuGuestInfoState_t field. 
+ * + * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide + * it to Virtual GPU Manager when creating a vGPU for subsequent instances of the VM. + * + * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. If the vGPU Metadata structure + * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed + * in \a bufferSize. + * + * @param vgpuInstance vGPU instance handle + * @param vgpuMetadata Pointer to caller-supplied buffer into which vGPU metadata is written + * @param bufferSize Size of vgpuMetadata buffer + * + * @return + * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned + * - \ref NVML_ERROR_INSUFFICIENT_SIZE vgpuMetadata buffer is too small, required size is returned in \a bufferSize + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a vgpuInstance is 0; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0. + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize); + +/** + * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about + * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section + * containing internal state. + * + * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata + * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed + * in \a bufferSize. 
+ * + * @param device The identifier of the target device + * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written + * @param bufferSize Pointer to size of \a pgpuMetadata buffer + * + * @return + * - \ref NVML_SUCCESS GPU metadata structure was successfully returned + * - \ref NVML_ERROR_INSUFFICIENT_SIZE pgpuMetadata buffer is too small, required size is returned in \a bufferSize + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. + * - \ref NVML_ERROR_NOT_SUPPORTED vGPU is not supported by the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize); + +/** + * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a + * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the + * physical GPU. + * + * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The + * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility + * with the physical GPU is limited, a limit code indicates the factor limiting compability. + * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details). + * + * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to + * boot a given vGPU or associated VM. 
+ * + * @param vgpuMetadata Pointer to caller-supplied vGPU metadata structure + * @param pgpuMetadata Pointer to caller-supplied GPU metadata structure + * @param compatibilityInfo Pointer to caller-supplied buffer to hold compatibility info + * + * @return + * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuMetadata or \a pgpuMetadata or \a bufferSize are NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo); + +/** + * Returns the properties of the physical GPU indicated by the device in an ascii-encoded string format. + * + * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the + * string is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed + * in \a bufferSize. + * + * @param device The identifier of the target device + * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written + * @param bufferSize Pointer to size of \a pgpuMetadata buffer + * + * @return + * - \ref NVML_SUCCESS GPU metadata structure was successfully returned + * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a pgpuMetadata buffer is too small, required size is returned in \a bufferSize + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. 
+ * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the system + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char *pgpuMetadata, unsigned int *bufferSize); + +/* + * Virtual GPU (vGPU) version + * + * The NVIDIA vGPU Manager and the guest drivers are tagged with a range of supported vGPU versions. This determines the range of NVIDIA guest driver versions that + * are compatible for vGPU feature support with a given NVIDIA vGPU Manager. For vGPU feature support, the range of supported versions for the NVIDIA vGPU Manager + * and the guest driver must overlap. Otherwise, the guest driver fails to load in the VM. + * + * When the NVIDIA guest driver loads, either when the VM is booted or when the driver is installed or upgraded, a negotiation occurs between the guest driver + * and the NVIDIA vGPU Manager to select the highest mutually compatible vGPU version. The negotiated vGPU version stays the same across VM migration. + */ + +/** + * Query the ranges of supported vGPU versions. + * + * This function gets the linear range of supported vGPU versions that is preset for the NVIDIA vGPU Manager and the range set by an administrator. + * If the preset range has not been overridden by \ref nvmlSetVgpuVersion, both ranges are the same. + * + * The caller passes pointers to the following \ref nvmlVgpuVersion_t structures, into which the NVIDIA vGPU Manager writes the ranges: + * 1. \a supported structure that represents the preset range of vGPU versions supported by the NVIDIA vGPU Manager. + * 2. \a current structure that represents the range of supported vGPU versions set by an administrator. By default, this range is the same as the preset range. 
+ * + * @param supported Pointer to the structure in which the preset range of vGPU versions supported by the NVIDIA vGPU Manager is written + * @param current Pointer to the structure in which the range of supported vGPU versions set by an administrator is written + * + * @return + * - \ref NVML_SUCCESS The vGPU version range structures were successfully obtained. + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported. + * - \ref NVML_ERROR_INVALID_ARGUMENT The \a supported parameter or the \a current parameter is NULL. + * - \ref NVML_ERROR_UNKNOWN An error occurred while the data was being fetched. + */ +nvmlReturn_t DECLDIR nvmlGetVgpuVersion(nvmlVgpuVersion_t *supported, nvmlVgpuVersion_t *current); + +/** + * Override the preset range of vGPU versions supported by the NVIDIA vGPU Manager with a range set by an administrator. + * + * This function configures the NVIDIA vGPU Manager with a range of supported vGPU versions set by an administrator. This range must be a subset of the + * preset range that the NVIDIA vGPU Manager supports. The custom range set by an administrator takes precedence over the preset range and is advertised to + * the guest VM for negotiating the vGPU version. See \ref nvmlGetVgpuVersion for details of how to query the preset range of versions supported. + * + * This function takes a pointer to vGPU version range structure \ref nvmlVgpuVersion_t as input to override the preset vGPU version range that the NVIDIA vGPU Manager supports. + * + * After host system reboot or driver reload, the range of supported versions reverts to the range that is preset for the NVIDIA vGPU Manager. + * + * @note 1. The range set by the administrator must be a subset of the preset range that the NVIDIA vGPU Manager supports. Otherwise, an error is returned. + * 2. If the range of supported guest driver versions does not overlap the range set by the administrator, the guest driver fails to load. + * 3. 
If the range of supported guest driver versions overlaps the range set by the administrator, the guest driver will load with a negotiated + * vGPU version that is the maximum value in the overlapping range. + * 4. No VMs must be running on the host when this function is called. If a VM is running on the host, the call to this function fails. + * + * @param vgpuVersion Pointer to a caller-supplied range of supported vGPU versions. + * + * @return + * - \ref NVML_SUCCESS The preset range of supported vGPU versions was successfully overridden. + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported. + * - \ref NVML_ERROR_IN_USE The range was not overridden because a VM is running on the host. + * - \ref NVML_ERROR_INVALID_ARGUMENT The \a vgpuVersion parameter specifies a range that is outside the range supported by the NVIDIA vGPU Manager or if \a vgpuVersion is NULL. + */ +nvmlReturn_t DECLDIR nvmlSetVgpuVersion(nvmlVgpuVersion_t *vgpuVersion); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlUtil vGPU Utilization and Accounting + * This chapter describes operations that are associated with vGPU Utilization and Accounting. + * @{ + */ +/***************************************************************************************************/ + +/** + * Retrieves current utilization for vGPUs on a physical GPU (device). + * + * For Kepler &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running + * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer + * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the + * CPU timestamp at which the samples were recorded. 
Individual utilization values are returned as "unsigned int" values + * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to + * indicate the returned value type. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance + * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate + * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with + * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the + * buffer is sized for. + * + * On successful return, the function updates \a vgpuInstanceSampleCount with the number of vGPU utilization sample + * structures that were actually written. This may differ from a previously read value as vGPU instances are created or + * destroyed. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier for the target device + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
+ * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values + * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances + * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned + + * @return + * - \ref NVML_SUCCESS if utilization samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is + * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all + * vGPU instances currently executing on the device + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, + nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, + nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); + +/** + * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on + * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the + * caller-supplied buffer pointed at by \a utilizationSamples. 
One utilization sample structure is returned per process running + * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which + * the samples were recorded. Individual utilization values are returned as "unsigned int" values. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance + * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size + * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with + * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the + * buffer is sized for. + * + * On successful return, the function updates \a vgpuSubProcessSampleCount with the number of vGPU sub process utilization sample + * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active + * in any given sample period. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. + * + * @param device The identifier for the target device + * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
+ * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances + * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned + + * @return + * - \ref NVML_SUCCESS if utilization samples are successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount or a sample count of 0 is + * passed with a non-NULL \a utilizationSamples + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all + * vGPU instances currently executing on the device + * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, + unsigned int *vgpuProcessSamplesCount, + nvmlVgpuProcessUtilizationSample_t *utilizationSamples); +/** + * Queries the state of per process accounting mode on vGPU. + * + * For Maxwell &tm; or newer fully supported devices. 
+ * + * @param vgpuInstance The identifier of the target vGPU instance + * @param mode Reference in which to return the current accounting mode + * + * @return + * - \ref NVML_SUCCESS if the mode has been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature + * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *mode); + +/** + * Queries list of processes running on vGPU that can be queried for accounting stats. The list of processes + * returned can be in running or terminated state. + * + * For Maxwell &tm; or newer fully supported devices. + * + * To just query the maximum number of processes that can be queried, call this function with *count = 0 and + * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. + * + * For more details see \ref nvmlVgpuInstanceGetAccountingStats. + * + * @note In case of PID collision some processes might not be accessible before the circular buffer is full. 
+ * + * @param vgpuInstance The identifier of the target vGPU instance + * @param count Reference in which to provide the \a pids array size, and + * to return the number of elements ready to be queried + * @param pids Reference in which to return list of process ids + * + * @return + * - \ref NVML_SUCCESS if pids were successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a count is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to expected value) + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlVgpuInstanceGetAccountingPids + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int *count, unsigned int *pids); + +/** + * Queries process's accounting stats. + * + * For Maxwell &tm; or newer fully supported devices. + * + * Accounting stats capture GPU utilization and other statistics across the lifetime of a process, and + * can be queried during life time of the process or after its termination. + * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and + * updated to actual running time after its termination. + * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old + * processes. + * + * See \ref nvmlAccountingStats_t for description of each returned metric. + * List of processes that can be queried can be retrieved from \ref nvmlVgpuInstanceGetAccountingPids. + * + * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. + * @note Only compute and graphics applications stats can be queried. 
Monitoring applications stats can't be + * queried since they don't contribute to GPU utilization. + * @note In case of pid collision stats of only the latest process (that terminated last) will be reported + * + * @param vgpuInstance The identifier of the target vGPU instance + * @param pid Process Id of the target process to query stats for + * @param stats Reference in which to return the process's accounting stats + * + * @return + * - \ref NVML_SUCCESS if stats have been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a stats is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * or \a stats is not found + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t *stats); + +/** + * Clears accounting information of the vGPU instance that have already terminated. + * + * For Maxwell &tm; or newer fully supported devices. + * Requires root/admin permissions. + * + * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. + * @note Only compute and graphics applications stats are reported and can be cleared since monitoring applications + * stats don't contribute to GPU utilization. 
+ * + * @param vgpuInstance The identifier of the target vGPU instance + * + * @return + * - \ref NVML_SUCCESS if accounting information has been cleared + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceClearAccountingPids(nvmlVgpuInstance_t vgpuInstance); + +/** + * Query the license information of the vGPU instance. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param licenseInfo Pointer to vGPU license information structure + * + * @return + * - \ref NVML_SUCCESS if information is successfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licenseInfo is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running on the vGPU instance + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo_v2(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t *licenseInfo); +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlExcludedGpuQueries Excluded GPU Queries + * This chapter describes NVML operations that are associated with excluded GPUs. 
+ * @{ + */ +/***************************************************************************************************/ + +/** + * Excluded GPU device information + **/ +typedef struct nvmlExcludedDeviceInfo_st +{ + nvmlPciInfo_t pciInfo; //!< The PCI information for the excluded GPU + char uuid[NVML_DEVICE_UUID_BUFFER_SIZE]; //!< The ASCII string UUID for the excluded GPU +} nvmlExcludedDeviceInfo_t; + + /** + * Retrieves the number of excluded GPU devices in the system. + * + * For all products. + * + * @param deviceCount Reference in which to return the number of excluded devices + * + * @return + * - \ref NVML_SUCCESS if \a deviceCount has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL + */ +nvmlReturn_t DECLDIR nvmlGetExcludedDeviceCount(unsigned int *deviceCount); + +/** + * Acquire the device information for an excluded GPU device, based on its index. + * + * For all products. + * + * Valid indices are derived from the \a deviceCount returned by + * \ref nvmlGetExcludedDeviceCount(). For example, if \a deviceCount is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * @param index The index of the target GPU, >= 0 and < \a deviceCount + * @param info Reference in which to return the device information + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a info is NULL + * + * @see nvmlGetExcludedDeviceCount + */ +nvmlReturn_t DECLDIR nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlExcludedDeviceInfo_t *info); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlMultiInstanceGPU Multi Instance GPU Management + * This chapter describes NVML operations that are associated with Multi Instance GPU management. 
+ * @{ + */ +/***************************************************************************************************/ + +/** + * Disable Multi Instance GPU mode. + */ +#define NVML_DEVICE_MIG_DISABLE 0x0 + +/** + * Enable Multi Instance GPU mode. + */ +#define NVML_DEVICE_MIG_ENABLE 0x1 + +/** + * GPU instance profiles. + * + * These macros should be passed to \ref nvmlDeviceGetGpuInstanceProfileInfo to retrieve the + * detailed information about a GPU instance such as profile ID, engine counts. + */ +#define NVML_GPU_INSTANCE_PROFILE_1_SLICE 0x0 +#define NVML_GPU_INSTANCE_PROFILE_2_SLICE 0x1 +#define NVML_GPU_INSTANCE_PROFILE_3_SLICE 0x2 +#define NVML_GPU_INSTANCE_PROFILE_4_SLICE 0x3 +#define NVML_GPU_INSTANCE_PROFILE_7_SLICE 0x4 +#define NVML_GPU_INSTANCE_PROFILE_8_SLICE 0x5 +#define NVML_GPU_INSTANCE_PROFILE_6_SLICE 0x6 +#define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1 0x7 +#define NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1 0x8 +#define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 0x9 +#define NVML_GPU_INSTANCE_PROFILE_COUNT 0xA + +typedef struct nvmlGpuInstancePlacement_st +{ + unsigned int start; //!< Index of first occupied memory slice + unsigned int size; //!< Number of memory slices occupied +} nvmlGpuInstancePlacement_t; + +/** + * GPU instance profile information. 
+ */ +typedef struct nvmlGpuInstanceProfileInfo_st +{ + unsigned int id; //!< Unique profile ID within the device + unsigned int isP2pSupported; //!< Peer-to-Peer support + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< GPU instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int copyEngineCount; //!< Copy Engine count + unsigned int decoderCount; //!< Decoder Engine count + unsigned int encoderCount; //!< Encoder Engine count + unsigned int jpegCount; //!< JPEG Engine count + unsigned int ofaCount; //!< OFA Engine count + unsigned long long memorySizeMB; //!< Memory size in MBytes +} nvmlGpuInstanceProfileInfo_t; + +/** + * GPU instance profile information (v2). + * + * Version 2 adds the \ref nvmlGpuInstanceProfileInfo_v2_t.version field + * to the start of the structure, and the \ref nvmlGpuInstanceProfileInfo_v2_t.name + * field to the end. This structure is not backwards-compatible with + * \ref nvmlGpuInstanceProfileInfo_t. + */ +typedef struct nvmlGpuInstanceProfileInfo_v2_st +{ + unsigned int version; //!< Structure version identifier (set to \ref nvmlGpuInstanceProfileInfo_v2) + unsigned int id; //!< Unique profile ID within the device + unsigned int isP2pSupported; //!< Peer-to-Peer support + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< GPU instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int copyEngineCount; //!< Copy Engine count + unsigned int decoderCount; //!< Decoder Engine count + unsigned int encoderCount; //!< Encoder Engine count + unsigned int jpegCount; //!< JPEG Engine count + unsigned int ofaCount; //!< OFA Engine count + unsigned long long memorySizeMB; //!< Memory size in MBytes + char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name +} nvmlGpuInstanceProfileInfo_v2_t; + +/** + * Version identifier value for \ref nvmlGpuInstanceProfileInfo_v2_t.version. 
+ */ +#define nvmlGpuInstanceProfileInfo_v2 NVML_STRUCT_VERSION(GpuInstanceProfileInfo, 2) + +typedef struct nvmlGpuInstanceInfo_st +{ + nvmlDevice_t device; //!< Parent device + unsigned int id; //!< Unique instance ID within the device + unsigned int profileId; //!< Unique profile ID within the device + nvmlGpuInstancePlacement_t placement; //!< Placement for this instance +} nvmlGpuInstanceInfo_t; + +typedef struct nvmlGpuInstance_st* nvmlGpuInstance_t; + +/** + * Compute instance profiles. + * + * These macros should be passed to \ref nvmlGpuInstanceGetComputeInstanceProfileInfo to retrieve the + * detailed information about a compute instance such as profile ID, engine counts + */ +#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE 0x0 +#define NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE 0x1 +#define NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE 0x2 +#define NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE 0x3 +#define NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE 0x4 +#define NVML_COMPUTE_INSTANCE_PROFILE_8_SLICE 0x5 +#define NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE 0x6 +#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 0x7 +#define NVML_COMPUTE_INSTANCE_PROFILE_COUNT 0x8 + +#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED 0x0 //!< All the engines except multiprocessors would be shared +#define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT 0x1 + +typedef struct nvmlComputeInstancePlacement_st +{ + unsigned int start; //!< Index of first occupied compute slice + unsigned int size; //!< Number of compute slices occupied +} nvmlComputeInstancePlacement_t; + +/** + * Compute instance profile information. 
+ */ +typedef struct nvmlComputeInstanceProfileInfo_st +{ + unsigned int id; //!< Unique profile ID within the GPU instance + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< Compute instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count + unsigned int sharedDecoderCount; //!< Shared Decoder Engine count + unsigned int sharedEncoderCount; //!< Shared Encoder Engine count + unsigned int sharedJpegCount; //!< Shared JPEG Engine count + unsigned int sharedOfaCount; //!< Shared OFA Engine count +} nvmlComputeInstanceProfileInfo_t; + +/** + * Compute instance profile information (v2). + * + * Version 2 adds the \ref nvmlComputeInstanceProfileInfo_v2_t.version field + * to the start of the structure, and the \ref nvmlComputeInstanceProfileInfo_v2_t.name + * field to the end. This structure is not backwards-compatible with + * \ref nvmlComputeInstanceProfileInfo_t. + */ +typedef struct nvmlComputeInstanceProfileInfo_v2_st +{ + unsigned int version; //!< Structure version identifier (set to \ref nvmlComputeInstanceProfileInfo_v2) + unsigned int id; //!< Unique profile ID within the GPU instance + unsigned int sliceCount; //!< GPU Slice count + unsigned int instanceCount; //!< Compute instance count + unsigned int multiprocessorCount; //!< Streaming Multiprocessor count + unsigned int sharedCopyEngineCount; //!< Shared Copy Engine count + unsigned int sharedDecoderCount; //!< Shared Decoder Engine count + unsigned int sharedEncoderCount; //!< Shared Encoder Engine count + unsigned int sharedJpegCount; //!< Shared JPEG Engine count + unsigned int sharedOfaCount; //!< Shared OFA Engine count + char name[NVML_DEVICE_NAME_V2_BUFFER_SIZE]; //!< Profile name +} nvmlComputeInstanceProfileInfo_v2_t; + +/** + * Version identifier value for \ref nvmlComputeInstanceProfileInfo_v2_t.version. 
+ */ +#define nvmlComputeInstanceProfileInfo_v2 NVML_STRUCT_VERSION(ComputeInstanceProfileInfo, 2) + +typedef struct nvmlComputeInstanceInfo_st +{ + nvmlDevice_t device; //!< Parent device + nvmlGpuInstance_t gpuInstance; //!< Parent GPU instance + unsigned int id; //!< Unique instance ID within the GPU instance + unsigned int profileId; //!< Unique profile ID within the GPU instance + nvmlComputeInstancePlacement_t placement; //!< Placement for this instance within the GPU instance's compute slice range {0, sliceCount} +} nvmlComputeInstanceInfo_t; + +typedef struct nvmlComputeInstance_st* nvmlComputeInstance_t; + +/** + * Set MIG mode for the device. + * + * For Ampere &tm; or newer fully supported devices. + * Requires root user. + * + * This mode determines whether a GPU instance can be created. + * + * This API may unbind or reset the device to activate the requested mode. Thus, the attributes associated with the + * device, such as minor number, might change. The caller of this API is expected to query such attributes again. + * + * On certain platforms like pass-through virtualization, where reset functionality may not be exposed directly, VM + * reboot is required. \a activationStatus would return \ref NVML_ERROR_RESET_REQUIRED for such cases. + * + * \a activationStatus would return the appropriate error code upon unsuccessful activation. For example, if device + * unbind fails because the device isn't idle, \ref NVML_ERROR_IN_USE would be returned. The caller of this API + * is expected to idle the device and retry setting the \a mode. + * + * @note On Windows, only disabling MIG mode is supported. \a activationStatus would return \ref + * NVML_ERROR_NOT_SUPPORTED as GPU reset is not supported on Windows through this API. 
+ * + * @param device The identifier of the target device + * @param mode The mode to be set, \ref NVML_DEVICE_MIG_DISABLE or + * \ref NVML_DEVICE_MIG_ENABLE + * @param activationStatus The activationStatus status + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device,\a mode or \a activationStatus are invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode + */ +nvmlReturn_t DECLDIR nvmlDeviceSetMigMode(nvmlDevice_t device, unsigned int mode, nvmlReturn_t *activationStatus); + +/** + * Get MIG mode for the device. + * + * For Ampere &tm; or newer fully supported devices. + * + * Changing MIG modes may require device unbind or reset. The "pending" MIG mode refers to the target mode following the + * next activation trigger. + * + * @param device The identifier of the target device + * @param currentMode Returns the current mode, \ref NVML_DEVICE_MIG_DISABLE or + * \ref NVML_DEVICE_MIG_ENABLE + * @param pendingMode Returns the pending mode, \ref NVML_DEVICE_MIG_DISABLE or + * \ref NVML_DEVICE_MIG_ENABLE + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a currentMode or \a pendingMode are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support MIG mode + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMigMode(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode); + +/** + * Get GPU instance profile information. + * + * Information provided by this API is immutable throughout the lifetime of a MIG mode. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. 
+ * + * @param device The identifier of the target device + * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* + * @param info Returns detailed profile information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile or \a info are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice_t device, unsigned int profile, + nvmlGpuInstanceProfileInfo_t *info); + +/** + * Versioned wrapper around \ref nvmlDeviceGetGpuInstanceProfileInfo that accepts a versioned + * \ref nvmlGpuInstanceProfileInfo_v2_t or later output structure. + * + * @note The caller must set the \ref nvmlGpuInstanceProfileInfo_v2_t.version field to the + * appropriate version prior to calling this function. For example: + * \code + * nvmlGpuInstanceProfileInfo_v2_t profileInfo = + * { .version = nvmlGpuInstanceProfileInfo_v2 }; + * nvmlReturn_t result = nvmlDeviceGetGpuInstanceProfileInfoV(device, + * profile, + * &profileInfo); + * \endcode + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. 
+ * + * @param device The identifier of the target device + * @param profile One of the NVML_GPU_INSTANCE_PROFILE_* + * @param info Returns detailed profile information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a info, or \a info->version are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceProfileInfoV(nvmlDevice_t device, unsigned int profile, + nvmlGpuInstanceProfileInfo_v2_t *info); + +/** + * Get GPU instance placements. + * + * A placement represents the location of a GPU instance within a device. This API only returns all the possible + * placements for the given profile. + * A created GPU instance occupies memory slices described by its placement. Creation of new GPU instance will + * fail if there is overlap with the already occupied memory slices. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param placements Returns placements allowed for the profile. Can be NULL to discover number + * of allowed placements for this profile. If non-NULL must be large enough + * to accommodate the placements supported by the profile. + * @param count Returns number of allowed placemenets for the profile. 
+ * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements_v2(nvmlDevice_t device, unsigned int profileId, + nvmlGpuInstancePlacement_t *placements, + unsigned int *count); + +/** + * Get GPU instance profile capacity. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param count Returns remaining instance count for the profile ID + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceRemainingCapacity(nvmlDevice_t device, unsigned int profileId, + unsigned int *count); + +/** + * Create GPU instance. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would + * become invalid. The GPU instance must be recreated to acquire a valid handle. 
+ * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param gpuInstance Returns the GPU instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a profileId or \a gpuInstance are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created + */ +nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstance(nvmlDevice_t device, unsigned int profileId, + nvmlGpuInstance_t *gpuInstance); + +/** + * Create GPU instance with the specified placement. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * If the parent device is unbound, reset or the GPU instance is destroyed explicitly, the GPU instance handle would + * become invalid. The GPU instance must be recreated to acquire a valid handle. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. See \ref nvmlDeviceGetGpuInstanceProfileInfo + * @param placement The requested placement. 
See \ref nvmlDeviceGetGpuInstancePossiblePlacements_v2 + * @param gpuInstance Returns the GPU instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profile, \a profileId, \a placement or \a gpuInstance + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested GPU instance could not be created + */ +nvmlReturn_t DECLDIR nvmlDeviceCreateGpuInstanceWithPlacement(nvmlDevice_t device, unsigned int profileId, + const nvmlGpuInstancePlacement_t *placement, + nvmlGpuInstance_t *gpuInstance); +/** + * Destroy GPU instance. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The GPU instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or in vGPU guest + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_IN_USE If the GPU instance is in use. This error would be returned if processes + * (e.g. CUDA application) or compute instances are active on the + * GPU instance. + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceDestroy(nvmlGpuInstance_t gpuInstance); + +/** + * Get GPU instances for given profile ID. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param device The identifier of the target device + * @param profileId The GPU instance profile ID. 
See \ref nvmlDeviceGetGpuInstanceProfileInfo
+ * @param gpuInstances Returns pre-existing GPU instances, the buffer must be large enough to
+ * accommodate the instances supported by the profile.
+ * See \ref nvmlDeviceGetGpuInstanceProfileInfo
+ * @param count The count of returned GPU instances
+ *
+ * @return
+ * - \ref NVML_SUCCESS Upon success
+ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a profileId, \a gpuInstances or \a count are invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled
+ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstances(nvmlDevice_t device, unsigned int profileId,
+ nvmlGpuInstance_t *gpuInstances, unsigned int *count);
+
+/**
+ * Get GPU instance for given instance ID.
+ *
+ * For Ampere &tm; or newer fully supported devices.
+ * Supported on Linux only.
+ * Requires privileged user.
+ *
+ * @param device The identifier of the target device
+ * @param id The GPU instance ID
+ * @param gpuInstance Returns GPU instance
+ *
+ * @return
+ * - \ref NVML_SUCCESS Upon success
+ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a id or \a gpuInstance are invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled
+ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation
+ * - \ref NVML_ERROR_NOT_FOUND If the GPU instance is not found.
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceById(nvmlDevice_t device, unsigned int id, nvmlGpuInstance_t *gpuInstance);
+
+/**
+ * Get GPU instance information.
+ *
+ * For Ampere &tm; or newer fully supported devices.
+ * Supported on Linux only.
+ * + * @param gpuInstance The GPU instance handle + * @param info Return GPU instance information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance or \a info are invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetInfo(nvmlGpuInstance_t gpuInstance, nvmlGpuInstanceInfo_t *info); + +/** + * Get compute instance profile information. + * + * Information provided by this API is immutable throughout the lifetime of a MIG mode. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* + * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* + * @param info Returns detailed profile information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile or \a info are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfo(nvmlGpuInstance_t gpuInstance, unsigned int profile, + unsigned int engProfile, + nvmlComputeInstanceProfileInfo_t *info); + +/** + * Versioned wrapper around \ref nvmlGpuInstanceGetComputeInstanceProfileInfo that accepts a versioned + * \ref nvmlComputeInstanceProfileInfo_v2_t or later output structure. + * + * @note The caller must set the \ref nvmlGpuInstanceProfileInfo_v2_t.version field to the + * appropriate version prior to calling this function. 
For example: + * \code + * nvmlComputeInstanceProfileInfo_v2_t profileInfo = + * { .version = nvmlComputeInstanceProfileInfo_v2 }; + * nvmlReturn_t result = nvmlGpuInstanceGetComputeInstanceProfileInfoV(gpuInstance, + * profile, + * engProfile, + * &profileInfo); + * \endcode + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profile One of the NVML_COMPUTE_INSTANCE_PROFILE_* + * @param engProfile One of the NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_* + * @param info Returns detailed profile information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a engProfile, \a info, or \a info->version are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profile isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfoV(nvmlGpuInstance_t gpuInstance, unsigned int profile, + unsigned int engProfile, + nvmlComputeInstanceProfileInfo_v2_t *info); + +/** + * Get compute instance profile capacity. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. 
+ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param count Returns remaining instance count for the profile ID + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId or \a availableCount are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_t gpuInstance, + unsigned int profileId, unsigned int *count); + +/** + * Create compute instance. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed + * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire + * a valid handle. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. 
+ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param computeInstance Returns the compute instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a profileId or \a computeInstance + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested compute instance could not be created + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, unsigned int profileId, + nvmlComputeInstance_t *computeInstance); + +/** + * Destroy compute instance. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param computeInstance The compute instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance is invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_IN_USE If the compute instance is in use. This error would be returned if + * processes (e.g. CUDA application) are active on the compute instance. + */ +nvmlReturn_t DECLDIR nvmlComputeInstanceDestroy(nvmlComputeInstance_t computeInstance); + +/** + * Get compute instances for given profile ID. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. 
+ *
+ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo
+ * @param computeInstances Returns pre-existing compute instances, the buffer must be large enough to
+ * accommodate the instances supported by the profile.
+ * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo
+ * @param count The count of returned compute instances
+ *
+ * @return
+ * - \ref NVML_SUCCESS Upon success
+ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId, \a computeInstances or \a count
+ * are invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported
+ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation
+ */
+nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstances(nvmlGpuInstance_t gpuInstance, unsigned int profileId,
+ nvmlComputeInstance_t *computeInstances, unsigned int *count);
+
+/**
+ * Get compute instance for given instance ID.
+ *
+ * For Ampere &tm; or newer fully supported devices.
+ * Supported on Linux only.
+ * Requires privileged user.
+ *
+ * @param gpuInstance The identifier of the target GPU instance
+ * @param id The compute instance ID
+ * @param computeInstance Returns compute instance
+ *
+ * @return
+ * - \ref NVML_SUCCESS Upon success
+ * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT If \a device, \a ID or \a computeInstance are invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled
+ * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation
+ * - \ref NVML_ERROR_NOT_FOUND If the compute instance is not found.
+ */
+nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceById(nvmlGpuInstance_t gpuInstance, unsigned int id,
+ nvmlComputeInstance_t *computeInstance);
+
+/**
+ * Get compute instance information.
+ *
+ * For Ampere &tm; or newer fully supported devices.
+ * Supported on Linux only. + * + * @param computeInstance The compute instance handle + * @param info Return compute instance information + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a computeInstance or \a info are invalid + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo_v2(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); + +/** + * Test if the given handle refers to a MIG device. + * + * A MIG device handle is an NVML abstraction which maps to a MIG compute instance. + * These overloaded references can be used (with some restrictions) interchangeably + * with a GPU device handle to execute queries at a per-compute instance granularity. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device NVML handle to test + * @param isMigDevice True when handle refers to a MIG device + * + * @return + * - \ref NVML_SUCCESS if \a device status was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device handle or \a isMigDevice reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int *isMigDevice); + +/** + * Get GPU instance ID for the given MIG device handle. + * + * GPU instance IDs are unique per device and remain valid until the GPU instance is destroyed. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. 
+ * + * @param device Target MIG device handle + * @param id GPU instance ID + * + * @return + * - \ref NVML_SUCCESS if instance ID was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstanceId(nvmlDevice_t device, unsigned int *id); + +/** + * Get compute instance ID for the given MIG device handle. + * + * Compute instance IDs are unique per GPU instance and remain valid until the compute instance + * is destroyed. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device Target MIG device handle + * @param id Compute instance ID + * + * @return + * - \ref NVML_SUCCESS if instance ID was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a id reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetComputeInstanceId(nvmlDevice_t device, unsigned int *id); + +/** + * Get the maximum number of MIG devices that can exist under a given parent NVML device. + * + * Returns zero if MIG is not supported or enabled. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. 
+ * + * @param device Target device handle + * @param count Count of MIG devices + * + * @return + * - \ref NVML_SUCCESS if \a count was successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a count reference is invalid + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int *count); + +/** + * Get MIG device handle for the given index under its parent NVML device. + * + * If the compute instance is destroyed either explicitly or by destroying, + * resetting or unbinding the parent GPU instance or the GPU device itself + * the MIG device handle would remain invalid and must be requested again + * using this API. Handles may be reused and their properties can change in + * the process. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * + * @param device Reference to the parent GPU device handle + * @param index Index of the MIG device + * @param migDevice Reference to the MIG device handle + * + * @return + * - \ref NVML_SUCCESS if \a migDevice handle was successfully created + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a index or \a migDevice reference is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_NOT_FOUND if no valid MIG device was found at \a index + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index, + nvmlDevice_t *migDevice); + +/** + * Get parent device handle from a MIG device handle. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. 
+ *
+ * @param migDevice MIG device handle
+ * @param device Device handle
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a device handle was successfully created
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a migDevice or \a device is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t migDevice, nvmlDevice_t *device);
+
+/**
+ * Get the type of the GPU Bus (PCIe, PCI, ...)
+ *
+ * @param device The identifier of the target device
+ * @param type The PCI Bus type
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the bus \a type is successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a type is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t *type);
+
+/**
+ * Retrieve performance monitor samples from the associated subdevice.
+ *
+ * @param device
+ * @param pDynamicPstatesInfo
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pDynamicPstatesInfo has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pDynamicPstatesInfo is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamicPstatesInfo_t *pDynamicPstatesInfo);
+
+/**
+ * Sets the speed of a specified fan.
+ * + * WARNING: This function changes the fan control policy to manual. It means that YOU have to monitor + * the temperature and adjust the fan speed accordingly. + * If you set the fan speed too low you can burn your GPU! + * Use nvmlDeviceSetDefaultFanSpeed_v2 to restore default control policy. + * + * For all cuda-capable discrete products with fans that are Maxwell or Newer. + * + * device The identifier of the target device + * fan The index of the fan, starting at zero + * speed The target speed of the fan [0-100] in % of max speed + * + * return + * NVML_SUCCESS if the fan speed has been set + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if the device is not valid, or the speed is outside acceptable ranges, + * or if the fan index doesn't reference an actual fan. + * NVML_ERROR_NOT_SUPPORTED if the device is older than Maxwell. + * NVML_ERROR_UNKNOWN if there was an unexpected error. + */ +nvmlReturn_t DECLDIR nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed); + +/** + * Retrieve the GPCCLK VF offset value + * @param[in] device The identifier of the target device + * @param[out] offset The retrieved GPCCLK VF offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int *offset); + +/** + * Set the GPCCLK VF offset value + * @param[in] device The identifier of the target device + * @param[in] offset The GPCCLK VF offset value to set + * + * @return + * - \ref NVML_SUCCESS if \a offset has been set + * - \ref NVML_ERROR_UNINITIALIZED if the 
library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset); + +/** + * Retrieve the MemClk (Memory Clock) VF offset value. + * @param[in] device The identifier of the target device + * @param[out] offset The retrieved MemClk VF offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int *offset); + +/** + * Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges. 
+ * @param[in] device The identifier of the target device + * @param[in] offset The MemClk VF offset value to set + * + * @return + * - \ref NVML_SUCCESS if \a offset has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset); + +/** + * Retrieve min and max clocks of some clock domain for a given PState + * + * @param device The identifier of the target device + * @param type Clock domain + * @param pstate PState to query + * @param minClockMHz Reference in which to return min clock frequency + * @param maxClockMHz Reference in which to return max clock frequency + * + * @return + * - \ref NVML_SUCCESS if everything worked + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or both + * \a minClockMHz and \a maxClockMHz are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType_t type, nvmlPstates_t pstate, + unsigned int * minClockMHz, unsigned int * maxClockMHz); + +/** + * Get all supported Performance States (P-States) for the device. + * + * The returned array would contain a contiguous list of valid P-States supported by + * the device. If the number of supported P-States is fewer than the size of the array + * supplied missing elements would contain \a NVML_PSTATE_UNKNOWN. + * + * The number of elements in the returned list will never exceed \a NVML_MAX_GPU_PERF_PSTATES. 
+ *
+ * @param device The identifier of the target device
+ * @param pstates Container to return the list of performance states
+ * supported by device
+ * @param size Size of the supplied \a pstates array in bytes
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pstates array has been retrieved
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if the container supplied was not large enough to
+ * hold the resulting list
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a pstates is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support performance state readings
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device,
+ nvmlPstates_t *pstates, unsigned int size);
+
+/**
+ * Retrieve the GPCCLK min max VF offset value.
+ * @param[in] device The identifier of the target device
+ * @param[out] minOffset The retrieved GPCCLK VF min offset value
+ * @param[out] maxOffset The retrieved GPCCLK VF max offset value
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a offset has been successfully queried
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device,
+ int *minOffset, int *maxOffset);
+
+/**
+ * Retrieve the MemClk (Memory Clock) min max VF offset value.
+ * @param[in] device The identifier of the target device + * @param[out] minOffset The retrieved MemClk VF min offset value + * @param[out] maxOffset The retrieved MemClk VF max offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, + int *minOffset, int *maxOffset); + +/** + * Get Conf Computing System capabilities. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param capabilities System CC capabilities + * + * @return + * - \ref NVML_SUCCESS if \a capabilities were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a capabilities is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeCapabilities(nvmlConfComputeSystemCaps_t *capabilities); + +/** + * Get Conf Computing System State. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param state System CC State + * + * @return + * - \ref NVML_SUCCESS if \a state were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a state is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeState(nvmlConfComputeSystemState_t *state); + +/** + * Get Conf Computing Protected and Unprotected Memory Sizes. 
+ * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device Device handle + * @param memInfo Protected/Unprotected Memory sizes + * + * @return + * - \ref NVML_SUCCESS if \a memInfo were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a memInfo or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeMemSizeInfo(nvmlDevice_t device, nvmlConfComputeMemSizeInfo_t *memInfo); + +/** + * Set Conf Computing Protected Memory Size. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device Device Handle + * @param sizeKiB Protected Memory size to be set in KiB + * + * @return + * - \ref NVML_SUCCESS if \a sizeKiB successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlDeviceSetConfComputeProtectedMemSize(nvmlDevice_t device, unsigned long long sizeKiB); + +/** + * Set Conf Computing GPUs ready state. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. 
+ * + * @param isAcceptingWork GPU accepting new work, NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or + * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE + * + * return + * - \ref NVML_SUCCESS if \a current GPUs ready state is successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemSetConfComputeGpusReadyState(unsigned int isAcceptingWork); + +/** + * Get Conf Computing GPUs ready state. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param isAcceptingWork Returns GPU current work accepting state, + * NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE or + * NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE + * + * return + * - \ref NVML_SUCCESS if \a current GPUs ready state were successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a isAcceptingWork is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlSystemGetConfComputeGpusReadyState(unsigned int *isAcceptingWork); + +/** + * Get Conf Computing protected memory usage. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. 
+ * + * @param device The identifier of the target device + * @param memory Reference in which to return the memory information + * + * @return + * - \ref NVML_SUCCESS if \a memory has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeProtectedMemoryUsage(nvmlDevice_t device, nvmlMemory_t *memory); + +/** + * Get Conf Computing Gpu certificate details. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. + * + * @param device The identifier of the target device + * @param gpuCert Reference in which to return the gpu certificate information + * + * @return + * - \ref NVML_SUCCESS if \a gpu certificate info has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuCertificate(nvmlDevice_t device, + nvmlConfComputeGpuCertificate_t *gpuCert); + +/** + * Get Conf Computing Gpu attestation report. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux, Windows TCC. 
+ * + * @param device The identifier of the target device + * @param gpuAtstReport Reference in which to return the gpu attestation report + * + * @return + * - \ref NVML_SUCCESS if \a gpu attestation report has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetConfComputeGpuAttestationReport(nvmlDevice_t device, + nvmlConfComputeGpuAttestationReport_t *gpuAtstReport); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup GPM NVML GPM + * @{ + */ +/***************************************************************************************************/ +/** @defgroup nvmlGpmEnums GPM Enums + * @{ + */ +/***************************************************************************************************/ + +/* GPM Metric Identifiers */ +typedef enum +{ + NVML_GPM_METRIC_GRAPHICS_UTIL = 1, /* Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 */ + NVML_GPM_METRIC_SM_UTIL = 2, /* Percentage of SMs that were busy. 0.0 - 100.0 */ + NVML_GPM_METRIC_SM_OCCUPANCY = 3, /* Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 */ + NVML_GPM_METRIC_INTEGER_UTIL = 4, /* Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5, /* Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6, /* Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7, /* Percentage of time the GPU's SMs were doing HMMA tensor operations. 
0.0 - 100.0 */ + NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9, /* Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_DRAM_BW_UTIL = 10, /* Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 */ + NVML_GPM_METRIC_FP64_UTIL = 11, /* Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 */ + NVML_GPM_METRIC_FP32_UTIL = 12, /* Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 */ + NVML_GPM_METRIC_FP16_UTIL = 13, /* Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 */ + NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20, /* PCIe traffic from this GPU in MiB/sec */ + NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21, /* PCIe traffic to this GPU in MiB/sec */ + NVML_GPM_METRIC_NVDEC_0_UTIL = 30, /* Percent utilization of NVDEC 0. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_1_UTIL = 31, /* Percent utilization of NVDEC 1. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_2_UTIL = 32, /* Percent utilization of NVDEC 2. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_3_UTIL = 33, /* Percent utilization of NVDEC 3. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_4_UTIL = 34, /* Percent utilization of NVDEC 4. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_5_UTIL = 35, /* Percent utilization of NVDEC 5. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_6_UTIL = 36, /* Percent utilization of NVDEC 6. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_7_UTIL = 37, /* Percent utilization of NVDEC 7. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_0_UTIL = 40, /* Percent utilization of NVJPG 0. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_1_UTIL = 41, /* Percent utilization of NVJPG 1. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_2_UTIL = 42, /* Percent utilization of NVJPG 2. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_3_UTIL = 43, /* Percent utilization of NVJPG 3. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_4_UTIL = 44, /* Percent utilization of NVJPG 4. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_5_UTIL = 45, /* Percent utilization of NVJPG 5. 
0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_6_UTIL = 46, /* Percent utilization of NVJPG 6. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_7_UTIL = 47, /* Percent utilization of NVJPG 7. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVOFA_0_UTIL = 50, /* Percent utilization of NVOFA 0. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60, /* NvLink read bandwidth for all links in MiB/sec */ + NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61, /* NvLink write bandwidth for all links in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62, /* NvLink read bandwidth for link 0 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63, /* NvLink write bandwidth for link 0 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64, /* NvLink read bandwidth for link 1 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65, /* NvLink write bandwidth for link 1 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66, /* NvLink read bandwidth for link 2 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67, /* NvLink write bandwidth for link 2 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68, /* NvLink read bandwidth for link 3 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69, /* NvLink write bandwidth for link 3 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70, /* NvLink read bandwidth for link 4 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71, /* NvLink write bandwidth for link 4 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72, /* NvLink read bandwidth for link 5 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73, /* NvLink write bandwidth for link 5 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74, /* NvLink read bandwidth for link 6 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75, /* NvLink write bandwidth for link 6 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76, /* NvLink read bandwidth for link 7 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77, /* NvLink write bandwidth for link 7 in 
MiB/sec */ + NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78, /* NvLink read bandwidth for link 8 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79, /* NvLink write bandwidth for link 8 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80, /* NvLink read bandwidth for link 9 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81, /* NvLink write bandwidth for link 9 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82, /* NvLink read bandwidth for link 10 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83, /* NvLink write bandwidth for link 10 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84, /* NvLink read bandwidth for link 11 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85, /* NvLink write bandwidth for link 11 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86, /* NvLink read bandwidth for link 12 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87, /* NvLink write bandwidth for link 12 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 88, /* NvLink read bandwidth for link 13 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89, /* NvLink write bandwidth for link 13 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90, /* NvLink read bandwidth for link 14 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91, /* NvLink write bandwidth for link 14 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92, /* NvLink read bandwidth for link 15 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93, /* NvLink write bandwidth for link 15 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94, /* NvLink read bandwidth for link 16 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95, /* NvLink write bandwidth for link 16 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96, /* NvLink read bandwidth for link 17 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97, /* NvLink write bandwidth for link 17 in MiB/sec */ + NVML_GPM_METRIC_MAX = 98, /* Maximum value 
above +1. Note that changing this + should also change NVML_GPM_METRICS_GET_VERSION + due to struct size change */ +} nvmlGpmMetricId_t; + +/** @} */ // @defgroup nvmlGpmEnums + + +/***************************************************************************************************/ +/** @defgroup nvmlGpmStructs GPM Structs + * @{ + */ +/***************************************************************************************************/ + +/* Handle to an allocated GPM sample allocated with nvmlGpmSampleAlloc() + Free this with nvmlGpmSampleFree() */ +typedef struct nvmlGpmSample_st* nvmlGpmSample_t; + +typedef struct +{ + unsigned int metricId; /* IN: NVML_GPM_METRIC_? #define of which metric to retrieve */ + nvmlReturn_t nvmlReturn; /* OUT: Status of this metric. If this is nonzero, then value is not valid */ + double value; /* OUT: Value of this metric. Is only valid if nvmlReturn is 0 (NVML_SUCCESS) */ + struct + { + char *shortName; + char *longName; + char *unit; + } metricInfo; /* OUT: Metric name and unit. Those can be NULL if not defined */ +} nvmlGpmMetric_t; + +typedef struct +{ + unsigned int version; /* IN: Set to NVML_GPM_METRICS_GET_VERSION */ + unsigned int numMetrics; /* IN: How many metrics to retrieve in metrics[] */ + nvmlGpmSample_t sample1; /* IN: Sample buffer */ + nvmlGpmSample_t sample2; /* IN: Sample buffer */ + nvmlGpmMetric_t metrics[NVML_GPM_METRIC_MAX]; /* IN/OUT: Array of metrics. Set metricId on call. 
+ see nvmlReturn and value on return */ +} nvmlGpmMetricsGet_t; + +#define NVML_GPM_METRICS_GET_VERSION 1 + +typedef struct +{ + unsigned int version; /* IN: Set to NVML_GPM_SUPPORT_VERSION */ + unsigned int isSupportedDevice; /* OUT: Indicates device support */ +} nvmlGpmSupport_t; + +#define NVML_GPM_SUPPORT_VERSION 1 + +/** @} */ // @defgroup nvmlGPMStructs + +/***************************************************************************************************/ +/** @defgroup nvmlGpmFunctions GPM Functions + * @{ + */ +/***************************************************************************************************/ + +/** + * Calculate GPM metrics from two samples. + * + * + * @param metricsGet IN/OUT: populated nvmlGpmMetricsGet_t struct + * + * %HOPPER_OR_NEWER% + * + * @return + * - \ref NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? enum on error + */ +nvmlReturn_t DECLDIR nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet); + + +/** + * Free an allocated sample buffer that was allocated with \ref nvmlGpmSampleAlloc() + * + * %HOPPER_OR_NEWER% + * + * @param gpmSample Sample to free + * + * @return + * - \ref NVML_SUCCESS on success + * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided + */ +nvmlReturn_t DECLDIR nvmlGpmSampleFree(nvmlGpmSample_t gpmSample); + + +/** + * Allocate a sample buffer to be used with NVML GPM . You will need to allocate + * at least two of these buffers to use with the NVML GPM feature + * + * %HOPPER_OR_NEWER% + * + * @param gpmSample Where the allocated sample will be stored + * + * @return + * - \ref NVML_SUCCESS on success + * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided + * - \ref NVML_ERROR_MEMORY if system memory is insufficient + */ +nvmlReturn_t DECLDIR nvmlGpmSampleAlloc(nvmlGpmSample_t *gpmSample); + +/** + * Read a sample of GPM metrics into the provided \a gpmSample buffer. 
After + * two samples are gathered, you can call nvmlGpmMetricGet on those samples to + * retrive metrics + * + * %HOPPER_OR_NEWER% + * + * @param device Device to get samples for + * @param gpmSample Buffer to read samples into + * + * @return + * - \ref NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? enum on error + */ +nvmlReturn_t DECLDIR nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSample); + +/** + * Indicate whether the supplied device supports GPM + * + * @param device NVML device to query for + * @param gpmSupport Structure to indicate GPM support. Indicates + * GPM support per system for the supplied device + * + * @return + * - NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? enum if there is an error in processing the query + */ + +/** + * Read a sample of GPM metrics into the provided \a gpmSample buffer for a MIG GPU Instance. + * + * After two samples are gathered, you can call nvmlGpmMetricGet on those + * samples to retrive metrics + * + * %HOPPER_OR_NEWER% + * + * @param device Device to get samples for + * @param gpuInstanceId MIG GPU Instance ID + * @param gpmSample Buffer to read samples into + * + * @return + * - \ref NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? 
enum on error + */ +nvmlReturn_t DECLDIR nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId, nvmlGpmSample_t gpmSample); + +nvmlReturn_t DECLDIR nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t *gpmSupport); + +/** @} */ // @defgroup nvmlGpmFunctions +/** @} */ // @defgroup GPM + +/** + * NVML API versioning support + */ + +#ifdef NVML_NO_UNVERSIONED_FUNC_DEFS +nvmlReturn_t DECLDIR nvmlInit(void); +nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device); +nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci); +nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v2(nvmlDevice_t device, nvmlPciInfo_t *pci); +nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v2(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); +nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v3(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); +nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu(nvmlPciInfo_t *pciInfo); +nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); +nvmlReturn_t DECLDIR nvmlDeviceGetAttributes(nvmlDevice_t device, nvmlDeviceAttributes_t *attributes); +nvmlReturn_t DECLDIR nvmlComputeInstanceGetInfo(nvmlComputeInstance_t computeInstance, nvmlComputeInstanceInfo_t *info); +nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); +nvmlReturn_t DECLDIR 
nvmlDeviceGetComputeRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); +nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); +nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); +nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v1_t *infos); +nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); +nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstancePlacement_t *placements, unsigned int *count); +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t *licenseInfo); +#endif // #ifdef NVML_NO_UNVERSIONED_FUNC_DEFS + +#if defined(NVML_NO_UNVERSIONED_FUNC_DEFS) +// We don't define APIs to run new versions if this guard is present so there is +// no need to undef +#elif defined(__NVML_API_VERSION_INTERNAL) +#undef nvmlDeviceGetGraphicsRunningProcesses +#undef nvmlDeviceGetComputeRunningProcesses +#undef nvmlDeviceGetMPSComputeRunningProcesses +#undef nvmlDeviceGetAttributes +#undef nvmlComputeInstanceGetInfo +#undef nvmlEventSetWait +#undef nvmlDeviceGetGridLicensableFeatures +#undef nvmlDeviceRemoveGpu +#undef nvmlDeviceGetNvLinkRemotePciInfo +#undef nvmlDeviceGetPciInfo +#undef nvmlDeviceGetCount +#undef nvmlDeviceGetHandleByIndex +#undef nvmlDeviceGetHandleByPciBusId +#undef nvmlInit +#undef nvmlBlacklistDeviceInfo_t +#undef nvmlGetBlacklistDeviceCount +#undef nvmlGetBlacklistDeviceInfoByIndex +#undef nvmlDeviceGetGpuInstancePossiblePlacements +#undef nvmlVgpuInstanceGetLicenseInfo +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git 
a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/policy.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/policy.go new file mode 100644 index 0000000000..ec14da1c14 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/policy.go @@ -0,0 +1,398 @@ +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" + +// wrapper for go callback function +extern int violationNotify(void* p); +*/ +import "C" +import ( + "context" + "encoding/binary" + "fmt" + "log" + "sync" + "time" + "unsafe" +) + +type policyCondition string + +const ( + DbePolicy = policyCondition("Double-bit ECC error") + PCIePolicy = policyCondition("PCI error") + MaxRtPgPolicy = policyCondition("Max Retired Pages Limit") + ThermalPolicy = policyCondition("Thermal Limit") + PowerPolicy = policyCondition("Power Limit") + NvlinkPolicy = policyCondition("Nvlink Error") + XidPolicy = policyCondition("XID Error") +) + +type PolicyViolation struct { + Condition policyCondition + Timestamp time.Time + Data interface{} +} + +type policyIndex int + +const ( + dbePolicyIndex policyIndex = iota + pciePolicyIndex + maxRtPgPolicyIndex + thermalPolicyIndex + powerPolicyIndex + nvlinkPolicyIndex + xidPolicyIndex +) + +type policyConditionParam struct { + typ uint32 + value uint32 +} + +type dbePolicyCondition struct { + Location string + NumErrors uint +} + +type pciPolicyCondition struct { + ReplayCounter uint +} + +type retiredPagesPolicyCondition struct { + SbePages uint + DbePages uint +} + +type thermalPolicyCondition struct { + ThermalViolation uint +} + +type powerPolicyCondition struct { + PowerViolation uint +} + +type nvlinkPolicyCondition struct { + FieldId uint16 + Counter uint +} + +type xidPolicyCondition struct { + ErrNum uint +} + +var ( + policyChanOnce sync.Once + policyMapOnce sync.Once + + // callbacks maps PolicyViolation channels with policy + // captures C callback() value for each violation condition + callbacks map[string]chan PolicyViolation + + // paramMap maps 
C.dcgmPolicy_t.parms index and limits + // to be used in setPolicy() for setting user selected policies + paramMap map[policyIndex]policyConditionParam +) + +func makePolicyChannels() { + policyChanOnce.Do(func() { + callbacks = make(map[string]chan PolicyViolation) + callbacks["dbe"] = make(chan PolicyViolation, 1) + callbacks["pcie"] = make(chan PolicyViolation, 1) + callbacks["maxrtpg"] = make(chan PolicyViolation, 1) + callbacks["thermal"] = make(chan PolicyViolation, 1) + callbacks["power"] = make(chan PolicyViolation, 1) + callbacks["nvlink"] = make(chan PolicyViolation, 1) + callbacks["xid"] = make(chan PolicyViolation, 1) + }) +} + +func makePolicyParmsMap() { + const ( + policyFieldTypeBool = 0 + policyFieldTypeLong = 1 + policyBoolValue = 1 + policyMaxRtPgThreshold = 10 + policyThermalThreshold = 100 + policyPowerThreshold = 250 + ) + + policyMapOnce.Do(func() { + paramMap = make(map[policyIndex]policyConditionParam) + paramMap[dbePolicyIndex] = policyConditionParam{ + typ: policyFieldTypeBool, + value: policyBoolValue, + } + + paramMap[pciePolicyIndex] = policyConditionParam{ + typ: policyFieldTypeBool, + value: policyBoolValue, + } + + paramMap[maxRtPgPolicyIndex] = policyConditionParam{ + typ: policyFieldTypeLong, + value: policyMaxRtPgThreshold, + } + + paramMap[thermalPolicyIndex] = policyConditionParam{ + typ: policyFieldTypeLong, + value: policyThermalThreshold, + } + + paramMap[powerPolicyIndex] = policyConditionParam{ + typ: policyFieldTypeLong, + value: policyPowerThreshold, + } + + paramMap[nvlinkPolicyIndex] = policyConditionParam{ + typ: policyFieldTypeBool, + value: policyBoolValue, + } + + paramMap[xidPolicyIndex] = policyConditionParam{ + typ: policyFieldTypeBool, + value: policyBoolValue, + } + }) +} + +// ViolationRegistration is a go callback function for dcgmPolicyRegister() wrapped in C.violationNotify() +// +//export ViolationRegistration +func ViolationRegistration(data unsafe.Pointer) int { + var con policyCondition + var timestamp 
time.Time + var val interface{} + + response := *(*C.dcgmPolicyCallbackResponse_t)(unsafe.Pointer(data)) + + switch response.condition { + case C.DCGM_POLICY_COND_DBE: + dbe := (*C.dcgmPolicyConditionDbe_t)(unsafe.Pointer(&response.val)) + con = DbePolicy + timestamp = createTimeStamp(dbe.timestamp) + val = dbePolicyCondition{ + Location: dbeLocation(int(dbe.location)), + NumErrors: *uintPtr(dbe.numerrors), + } + case C.DCGM_POLICY_COND_PCI: + pci := (*C.dcgmPolicyConditionPci_t)(unsafe.Pointer(&response.val)) + con = PCIePolicy + timestamp = createTimeStamp(pci.timestamp) + val = pciPolicyCondition{ + ReplayCounter: *uintPtr(pci.counter), + } + case C.DCGM_POLICY_COND_MAX_PAGES_RETIRED: + mpr := (*C.dcgmPolicyConditionMpr_t)(unsafe.Pointer(&response.val)) + con = MaxRtPgPolicy + timestamp = createTimeStamp(mpr.timestamp) + val = retiredPagesPolicyCondition{ + SbePages: *uintPtr(mpr.sbepages), + DbePages: *uintPtr(mpr.dbepages), + } + case C.DCGM_POLICY_COND_THERMAL: + thermal := (*C.dcgmPolicyConditionThermal_t)(unsafe.Pointer(&response.val)) + con = ThermalPolicy + timestamp = createTimeStamp(thermal.timestamp) + val = thermalPolicyCondition{ + ThermalViolation: *uintPtr(thermal.thermalViolation), + } + case C.DCGM_POLICY_COND_POWER: + pwr := (*C.dcgmPolicyConditionPower_t)(unsafe.Pointer(&response.val)) + con = PowerPolicy + timestamp = createTimeStamp(pwr.timestamp) + val = powerPolicyCondition{ + PowerViolation: *uintPtr(pwr.powerViolation), + } + case C.DCGM_POLICY_COND_NVLINK: + nvlink := (*C.dcgmPolicyConditionNvlink_t)(unsafe.Pointer(&response.val)) + con = NvlinkPolicy + timestamp = createTimeStamp(nvlink.timestamp) + val = nvlinkPolicyCondition{ + FieldId: uint16(nvlink.fieldId), + Counter: *uintPtr(nvlink.counter), + } + case C.DCGM_POLICY_COND_XID: + xid := (*C.dcgmPolicyConditionXID_t)(unsafe.Pointer(&response.val)) + con = XidPolicy + timestamp = createTimeStamp(xid.timestamp) + val = xidPolicyCondition{ + ErrNum: *uintPtr(xid.errnum), + } + } + + 
err := PolicyViolation{ + Condition: con, + Timestamp: timestamp, + Data: val, + } + + switch con { + case DbePolicy: + callbacks["dbe"] <- err + case PCIePolicy: + callbacks["pcie"] <- err + case MaxRtPgPolicy: + callbacks["maxrtpg"] <- err + case ThermalPolicy: + callbacks["thermal"] <- err + case PowerPolicy: + callbacks["power"] <- err + case NvlinkPolicy: + callbacks["nvlink"] <- err + case XidPolicy: + callbacks["xid"] <- err + } + return 0 +} + +func setPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t, paramList []policyIndex) (err error) { + var policy C.dcgmPolicy_t + policy.version = makeVersion1(unsafe.Sizeof(policy)) + policy.mode = C.dcgmPolicyMode_t(C.DCGM_OPERATION_MODE_AUTO) + policy.action = C.DCGM_POLICY_ACTION_NONE + policy.isolation = C.DCGM_POLICY_ISOLATION_NONE + policy.validation = C.DCGM_POLICY_VALID_NONE + policy.condition = condition + + // iterate on paramMap for given policy conditions + for _, key := range paramList { + conditionParam, exists := paramMap[policyIndex(key)] + if !exists { + return fmt.Errorf("Error: Invalid Policy condition, %v does not exist", key) + } + // set policy condition parameters + // set condition type (bool or longlong) + policy.parms[key].tag = conditionParam.typ + + // set condition val (violation threshold) + // policy.parms.val is a C union type + // cgo docs: Go doesn't have support for C's union type + // C union types are represented as a Go byte array + binary.LittleEndian.PutUint32(policy.parms[key].val[:], conditionParam.value) + } + + var statusHandle C.dcgmStatus_t + + result := C.dcgmPolicySet(handle.handle, groupId.handle, &policy, statusHandle) + if err = errorString(result); err != nil { + return fmt.Errorf("Error setting policies: %s", err) + } + + log.Println("Policy successfully set.") + + return +} + +func registerPolicy(ctx context.Context, groupId GroupHandle, typ ...policyCondition) (<-chan PolicyViolation, error) { + // init policy globals for internal API + 
makePolicyChannels() + makePolicyParmsMap() + + // make a list of policy conditions for setting their parameters + var paramKeys []policyIndex + // get all conditions to be set in setPolicy() + var condition C.dcgmPolicyCondition_t = 0 + for _, t := range typ { + switch t { + case DbePolicy: + paramKeys = append(paramKeys, dbePolicyIndex) + condition |= C.DCGM_POLICY_COND_DBE + case PCIePolicy: + paramKeys = append(paramKeys, pciePolicyIndex) + condition |= C.DCGM_POLICY_COND_PCI + case MaxRtPgPolicy: + paramKeys = append(paramKeys, maxRtPgPolicyIndex) + condition |= C.DCGM_POLICY_COND_MAX_PAGES_RETIRED + case ThermalPolicy: + paramKeys = append(paramKeys, thermalPolicyIndex) + condition |= C.DCGM_POLICY_COND_THERMAL + case PowerPolicy: + paramKeys = append(paramKeys, powerPolicyIndex) + condition |= C.DCGM_POLICY_COND_POWER + case NvlinkPolicy: + paramKeys = append(paramKeys, nvlinkPolicyIndex) + condition |= C.DCGM_POLICY_COND_NVLINK + case XidPolicy: + paramKeys = append(paramKeys, xidPolicyIndex) + condition |= C.DCGM_POLICY_COND_XID + } + } + + var err error + if err = setPolicy(groupId, condition, paramKeys); err != nil { + return nil, err + } + + var finishCallback unsafe.Pointer + result := C.dcgmPolicyRegister(handle.handle, groupId.handle, C.dcgmPolicyCondition_t(condition), C.fpRecvUpdates(C.violationNotify), C.fpRecvUpdates(finishCallback)) + + if err = errorString(result); err != nil { + return nil, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + log.Println("Listening for violations...") + + violation := make(chan PolicyViolation, len(typ)) + go func() { + defer func() { + log.Println("unregister policy violation...") + close(violation) + unregisterPolicy(groupId, condition) + }() + for { + select { + case dbe := <-callbacks["dbe"]: + violation <- dbe + case pcie := <-callbacks["pcie"]: + violation <- pcie + case maxrtpg := <-callbacks["maxrtpg"]: + violation <- maxrtpg + case thermal := <-callbacks["thermal"]: + violation <- 
thermal + case power := <-callbacks["power"]: + violation <- power + case nvlink := <-callbacks["nvlink"]: + violation <- nvlink + case xid := <-callbacks["xid"]: + violation <- xid + case <-ctx.Done(): + return + } + } + }() + + return violation, err +} + +func unregisterPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t) { + result := C.dcgmPolicyUnregister(handle.handle, groupId.handle, condition) + + if err := errorString(result); err != nil { + log.Println(fmt.Errorf("error unregistering policy: %s", err)) + } +} + +func createTimeStamp(t C.longlong) time.Time { + tm := int64(t) / 1000000 + ts := time.Unix(tm, 0) + return ts +} + +func dbeLocation(location int) string { + switch location { + case 0: + return "L1" + case 1: + return "L2" + case 2: + return "Device" + case 3: + return "Register" + case 4: + return "Texture" + } + return "N/A" +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/process_info.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/process_info.go new file mode 100644 index 0000000000..c5a710d03b --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/process_info.go @@ -0,0 +1,208 @@ +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "fmt" + "io/ioutil" + "math/rand" + "os" + "strings" + "time" + "unsafe" +) + +type Time uint64 + +func (t Time) String() string { + if t == 0 { + return "Running" + } + tm := time.Unix(int64(t), 0) + return tm.String() +} + +type ProcessUtilInfo struct { + StartTime Time + EndTime Time + EnergyConsumed *uint64 // Joules + SmUtil *float64 + MemUtil *float64 +} + +// ViolationTime measures amount of time (in ms) GPU was at reduced clocks +type ViolationTime struct { + Power *uint64 + Thermal *uint64 + Reliability *uint64 + BoardLimit *uint64 + LowUtilization *uint64 + SyncBoost *uint64 +} + +type XIDErrorInfo struct { + NumErrors int + Timestamp []uint64 +} + +type ProcessInfo struct { + GPU uint + PID uint + Name string + ProcessUtilization 
ProcessUtilInfo + PCI PCIStatusInfo + Memory MemoryInfo + GpuUtilization UtilizationInfo + Clocks ClockInfo + Violations ViolationTime + XIDErrors XIDErrorInfo +} + +// WatchPidFieldsEx is the same as WatchPidFields, but allows for modifying the update frequency, max samples, max +// sample age, and the GPUs on which to enable watches. +func WatchPidFieldsEx(updateFreq, maxKeepAge time.Duration, maxKeepSamples int, gpus ...uint) (GroupHandle, error) { + return watchPidFields(updateFreq, maxKeepAge, maxKeepSamples, gpus...) +} + +func watchPidFields(updateFreq, maxKeepAge time.Duration, maxKeepSamples int, gpus ...uint) (groupId GroupHandle, err error) { + groupName := fmt.Sprintf("watchPids%d", rand.Uint64()) + group, err := CreateGroup(groupName) + if err != nil { + return + } + numGpus := len(gpus) + + if numGpus == 0 { + gpus, err = getSupportedDevices() + if err != nil { + return + } + } + + for _, gpu := range gpus { + err = AddToGroup(group, gpu) + if err != nil { + return + } + + } + + result := C.dcgmWatchPidFields(handle.handle, group.handle, C.longlong(updateFreq.Microseconds()), C.double(maxKeepAge.Seconds()), C.int(maxKeepSamples)) + + if err = errorString(result); err != nil { + return groupId, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + _ = UpdateAllFields() + return group, nil +} + +func getProcessInfo(groupId GroupHandle, pid uint) (processInfo []ProcessInfo, err error) { + var pidInfo C.dcgmPidInfo_t + pidInfo.version = makeVersion2(unsafe.Sizeof(pidInfo)) + pidInfo.pid = C.uint(pid) + + result := C.dcgmGetPidInfo(handle.handle, groupId.handle, &pidInfo) + + if err = errorString(result); err != nil { + return processInfo, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + name, err := processName(pid) + if err != nil { + return processInfo, fmt.Errorf("Error getting process name: %s", err) + } + + for i := 0; i < int(pidInfo.numGpus); i++ { + + var energy uint64 + e := 
*uint64Ptr(pidInfo.gpus[i].energyConsumed) + if !IsInt64Blank(int64(e)) { + energy = e / 1000 // mWs to joules + } + + processUtil := ProcessUtilInfo{ + StartTime: Time(uint64(pidInfo.gpus[i].startTime) / 1000000), + EndTime: Time(uint64(pidInfo.gpus[i].endTime) / 1000000), + EnergyConsumed: &energy, + SmUtil: roundFloat(dblToFloat(pidInfo.gpus[i].processUtilization.smUtil)), + MemUtil: roundFloat(dblToFloat(pidInfo.gpus[i].processUtilization.memUtil)), + } + + // TODO figure out how to deal with blanks + pci := PCIStatusInfo{ + Throughput: PCIThroughputInfo{ + Rx: *int64Ptr(pidInfo.gpus[i].pcieRxBandwidth.average), + Tx: *int64Ptr(pidInfo.gpus[i].pcieTxBandwidth.average), + Replays: *int64Ptr(pidInfo.gpus[i].pcieReplays), + }, + } + + memory := MemoryInfo{ + GlobalUsed: *int64Ptr(pidInfo.gpus[i].maxGpuMemoryUsed), // max gpu memory used for this process + ECCErrors: ECCErrorsInfo{ + SingleBit: *int64Ptr(C.longlong(pidInfo.gpus[i].eccSingleBit)), + DoubleBit: *int64Ptr(C.longlong(pidInfo.gpus[i].eccDoubleBit)), + }, + } + + gpuUtil := UtilizationInfo{ + GPU: int64(pidInfo.gpus[i].smUtilization.average), + Memory: int64(pidInfo.gpus[i].memoryUtilization.average), + } + + violations := ViolationTime{ + Power: uint64Ptr(pidInfo.gpus[i].powerViolationTime), + Thermal: uint64Ptr(pidInfo.gpus[i].thermalViolationTime), + Reliability: uint64Ptr(pidInfo.gpus[i].reliabilityViolationTime), + BoardLimit: uint64Ptr(pidInfo.gpus[i].boardLimitViolationTime), + LowUtilization: uint64Ptr(pidInfo.gpus[i].lowUtilizationTime), + SyncBoost: uint64Ptr(pidInfo.gpus[i].syncBoostTime), + } + + clocks := ClockInfo{ + Cores: *int64Ptr(C.longlong(pidInfo.gpus[i].smClock.average)), + Memory: *int64Ptr(C.longlong(pidInfo.gpus[i].memoryClock.average)), + } + + numErrs := int(pidInfo.gpus[i].numXidCriticalErrors) + ts := make([]uint64, numErrs) + for i := 0; i < numErrs; i++ { + ts[i] = uint64(pidInfo.gpus[i].xidCriticalErrorsTs[i]) + } + xidErrs := XIDErrorInfo{ + NumErrors: numErrs, + 
Timestamp: ts, + } + + pInfo := ProcessInfo{ + GPU: uint(pidInfo.summary.gpuId), + PID: uint(pidInfo.pid), + Name: name, + ProcessUtilization: processUtil, + PCI: pci, + Memory: memory, + GpuUtilization: gpuUtil, + Clocks: clocks, + Violations: violations, + XIDErrors: xidErrs, + } + processInfo = append(processInfo, pInfo) + } + return +} + +func processName(pid uint) (string, error) { + f := fmt.Sprintf("/proc/%d/comm", pid) + b, err := ioutil.ReadFile(f) + if err != nil { + // TOCTOU: process terminated + if os.IsNotExist(err) { + return "", nil + } + return "", err + } + return strings.TrimSuffix(string(b), "\n"), nil +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/profile.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/profile.go new file mode 100644 index 0000000000..d3686f4db0 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/profile.go @@ -0,0 +1,47 @@ +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "unsafe" +) + +type MetricGroup struct { + Major uint + Minor uint + FieldIds []uint +} + +func getSupportedMetricGroups(gpuId uint) (groups []MetricGroup, err error) { + + var groupInfo C.dcgmProfGetMetricGroups_t + groupInfo.version = makeVersion3(unsafe.Sizeof(groupInfo)) + + groupInfo.gpuId = C.uint(gpuId) + + result := C.dcgmProfGetSupportedMetricGroups(handle.handle, &groupInfo) + + if err = errorString(result); err != nil { + return groups, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + var count = uint(groupInfo.numMetricGroups) + + for i := uint(0); i < count; i++ { + var group MetricGroup + group.Major = uint(groupInfo.metricGroups[i].majorId) + group.Minor = uint(groupInfo.metricGroups[i].minorId) + + var fieldCount = uint(groupInfo.metricGroups[i].numFieldIds) + + for j := uint(0); j < fieldCount; j++ { + group.FieldIds = append(group.FieldIds, uint(groupInfo.metricGroups[i].fieldIds[j])) + } + groups = append(groups, group) + } + + return groups, nil 
+} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/structs.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/structs.go new file mode 100644 index 0000000000..5b075efd90 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/structs.go @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgm + +type MigProfile int + +const ( + MigProfileNone MigProfile = 0 /*!< No profile (for GPUs) */ + MigProfileGPUInstanceSlice1 MigProfile = 1 /*!< GPU instance slice 1 */ + MigProfileGPUInstanceSlice2 MigProfile = 2 /*!< GPU instance slice 2 */ + MigProfileGPUInstanceSlice3 MigProfile = 3 /*!< GPU instance slice 3 */ + MigProfileGPUInstanceSlice4 MigProfile = 4 /*!< GPU instance slice 4 */ + MigProfileGPUInstanceSlice7 MigProfile = 5 /*!< GPU instance slice 7 */ + MigProfileGPUInstanceSlice8 MigProfile = 6 /*!< GPU instance slice 8 */ + MigProfileGPUInstanceSlice6 MigProfile = 7 /*!< GPU instance slice 6 */ + MigProfileGPUInstanceSlice1Rev1 MigProfile = 8 /*!< GPU instance slice 1 revision 1 */ + MigProfileGPUInstanceSlice2Rev1 MigProfile = 9 /*!< GPU instance slice 2 revision 1 */ + MigProfileGPUInstanceSlice1Rev2 MigProfile = 10 /*!< GPU instance slice 1 revision 2 */ + MigProfileComputeInstanceSlice1 MigProfile = 30 /*!< compute instance slice 1 */ + MigProfileComputeInstanceSlice2 MigProfile = 31 /*!< compute instance slice 2 */ + 
MigProfileComputeInstanceSlice3 MigProfile = 32 /*!< compute instance slice 3 */ + MigProfileComputeInstanceSlice4 MigProfile = 33 /*!< compute instance slice 4*/ + MigProfileComputeInstanceSlice7 MigProfile = 34 /*!< compute instance slice 7 */ + MigProfileComputeInstanceSlice8 MigProfile = 35 /*!< compute instance slice 8 */ + MigProfileComputeInstanceSlice6 MigProfile = 36 /*!< compute instance slice 6 */ + MigProfileComputeInstanceSlice1Rev1 MigProfile = 37 /*!< compute instance slice 1 revision 1 */ +) diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/test_utils.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/test_utils.go new file mode 100644 index 0000000000..ae46bd2a70 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/test_utils.go @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dcgm + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func setupTest(t *testing.T) func(t *testing.T) { + cleanup, err := Init(Embedded) + assert.NoError(t, err) + + return func(t *testing.T) { + defer cleanup() + } +} + +func runOnlyWithLiveGPUs(t *testing.T) { + t.Helper() + gpus, err := getSupportedDevices() + assert.NoError(t, err) + if len(gpus) < 1 { + t.Skip("Skipping test that requires live GPUs. 
None were found") + } +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/topology.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/topology.go new file mode 100644 index 0000000000..24f5de40ed --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/topology.go @@ -0,0 +1,186 @@ +package dcgm + +/* +#include "dcgm_agent.h" +#include "dcgm_structs.h" +*/ +import "C" +import ( + "fmt" + "unsafe" +) + +type P2PLinkType uint + +const ( + P2PLinkUnknown P2PLinkType = iota + P2PLinkCrossCPU + P2PLinkSameCPU + P2PLinkHostBridge + P2PLinkMultiSwitch + P2PLinkSingleSwitch + P2PLinkSameBoard + SingleNVLINKLink + TwoNVLINKLinks + ThreeNVLINKLinks + FourNVLINKLinks +) + +func (l P2PLinkType) PCIPaths() string { + switch l { + case P2PLinkSameBoard: + return "PSB" + case P2PLinkSingleSwitch: + return "PIX" + case P2PLinkMultiSwitch: + return "PXB" + case P2PLinkHostBridge: + return "PHB" + case P2PLinkSameCPU: + return "NODE" + case P2PLinkCrossCPU: + return "SYS" + case SingleNVLINKLink: + return "NV1" + case TwoNVLINKLinks: + return "NV2" + case ThreeNVLINKLinks: + return "NV3" + case FourNVLINKLinks: + return "NV4" + case P2PLinkUnknown: + } + return "N/A" +} + +type P2PLink struct { + GPU uint + BusID string + Link P2PLinkType +} + +func getP2PLink(path uint) P2PLinkType { + switch path { + case C.DCGM_TOPOLOGY_BOARD: + return P2PLinkSameBoard + case C.DCGM_TOPOLOGY_SINGLE: + return P2PLinkSingleSwitch + case C.DCGM_TOPOLOGY_MULTIPLE: + return P2PLinkMultiSwitch + case C.DCGM_TOPOLOGY_HOSTBRIDGE: + return P2PLinkHostBridge + case C.DCGM_TOPOLOGY_CPU: + return P2PLinkSameCPU + case C.DCGM_TOPOLOGY_SYSTEM: + return P2PLinkCrossCPU + case C.DCGM_TOPOLOGY_NVLINK1: + return SingleNVLINKLink + case C.DCGM_TOPOLOGY_NVLINK2: + return TwoNVLINKLinks + case C.DCGM_TOPOLOGY_NVLINK3: + return ThreeNVLINKLinks + case C.DCGM_TOPOLOGY_NVLINK4: + return FourNVLINKLinks + } + return P2PLinkUnknown +} + +func getBusid(gpuid uint) (string, error) { + var device 
C.dcgmDeviceAttributes_v3 + device.version = makeVersion3(unsafe.Sizeof(device)) + + result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device) + if err := errorString(result); err != nil { + return "", fmt.Errorf("Error getting device busid: %s", err) + } + return *stringPtr(&device.identifiers.pciBusId[0]), nil +} + +func getDeviceTopology(gpuid uint) (links []P2PLink, err error) { + var topology C.dcgmDeviceTopology_v1 + topology.version = makeVersion1(unsafe.Sizeof(topology)) + + result := C.dcgmGetDeviceTopology(handle.handle, C.uint(gpuid), &topology) + if result == C.DCGM_ST_NOT_SUPPORTED { + return links, nil + } + if result != C.DCGM_ST_OK { + return links, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + busid, err := getBusid(gpuid) + if err != nil { + return + } + + for i := uint(0); i < uint(topology.numGpus); i++ { + gpu := topology.gpuPaths[i].gpuId + p2pLink := P2PLink{ + GPU: uint(gpu), + BusID: busid, + Link: getP2PLink(uint(topology.gpuPaths[i].path)), + } + links = append(links, p2pLink) + } + return +} + +type Link_State uint + +const ( + LS_NOT_SUPPORTED Link_State = iota // Link is unsupported (Default for GPUs) + LS_DISABLED // Link is supported but disabled (Default for NvSwitches) + LS_DOWN // Link link is down (inactive) + LS_UP // Link link is up (active) +) + +type NvLinkStatus struct { + ParentId uint + ParentType Field_Entity_Group + State Link_State + Index uint +} + +func getNvLinkLinkStatus() ([]NvLinkStatus, error) { + var linkStatus C.dcgmNvLinkStatus_v3 + linkStatus.version = makeVersion3(unsafe.Sizeof(linkStatus)) + + var links []NvLinkStatus + + result := C.dcgmGetNvLinkLinkStatus(handle.handle, &linkStatus) + if result == C.DCGM_ST_NOT_SUPPORTED { + return links, nil + } + + if result != C.DCGM_ST_OK { + return nil, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + for i := uint(0); i < uint(linkStatus.numGpus); i++ { + for j := 0; j < 
int(C.DCGM_NVLINK_MAX_LINKS_PER_GPU); j++ { + link := NvLinkStatus{ + uint(linkStatus.gpus[i].entityId), + FE_GPU, + Link_State(linkStatus.gpus[i].linkState[j]), + uint(j), + } + + links = append(links, link) + } + } + + for i := uint(0); i < uint(linkStatus.numNvSwitches); i++ { + for j := 0; j < C.DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH; j++ { + link := NvLinkStatus{ + uint(linkStatus.nvSwitches[i].entityId), + FE_SWITCH, + Link_State(linkStatus.nvSwitches[i].linkState[j]), + uint(j), + } + + links = append(links, link) + } + } + + return links, nil +} diff --git a/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/utils.go b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/utils.go new file mode 100644 index 0000000000..dd7170bbbb --- /dev/null +++ b/vendor/github.com/NVIDIA/go-dcgm/pkg/dcgm/utils.go @@ -0,0 +1,175 @@ +package dcgm + +/* +#include +#include "dcgm_structs.h" +*/ +import "C" + +import ( + "fmt" + "math" + "unsafe" +) + +const ( + dcgmInt32Blank = 0x7ffffff0 // 2147483632 + dcgmInt64Blank = 0x7ffffffffffffff0 // 9223372036854775792 +) + +func uintPtr(c C.uint) *uint { + i := uint(c) + return &i +} + +func uintPtrInt(c C.int) *uint { + i := uint(c) + return &i +} + +func uintPtrUnsafe(p unsafe.Pointer) *uint { + if p == nil { + return nil + } + uintP := (*uint)(unsafe.Pointer(p)) + val := *uintP + return &val +} + +func uint64Ptr(c C.longlong) *uint64 { + i := uint64(c) + return &i +} + +func int64Ptr(c C.longlong) *int64 { + i := int64(c) + return &i +} + +func uint64PtrUint(c C.uint) *uint64 { + i := uint64(c) + return &i +} + +func uint64PtrUnsafe(p unsafe.Pointer) *uint64 { + if p == nil { + return nil + } + uintP := (*uint64)(unsafe.Pointer(p)) + val := *uintP + return &val +} + +func toInt64(c C.longlong) int64 { + i := int64(c) + return i +} + +func dblToUint(val C.double) *uint { + i := uint(val) + return &i +} + +func dblToFloat(val C.double) *float64 { + i := float64(val) + return &i +} + +func dblToFloatUnsafe(val unsafe.Pointer) *float64 { + if val == nil { 
+ return nil + } + dblP := (*C.double)(unsafe.Pointer(val)) + floatP := float64(*dblP) + return &floatP +} + +func stringPtr(c *C.char) *string { + s := C.GoString(c) + return &s +} + +type DcgmError struct { + msg string // description of error + Code C.dcgmReturn_t // dcgmReturn_t value of error +} + +func (e *DcgmError) Error() string { return e.msg } + +func errorString(result C.dcgmReturn_t) error { + if result == C.DCGM_ST_OK { + return nil + } + err := C.GoString(C.errorString(result)) + return fmt.Errorf("%v", err) +} + +func freeCString(cStr *C.char) { + C.free(unsafe.Pointer(cStr)) +} + +func IsInt32Blank(value int) bool { + if value >= dcgmInt32Blank { + return true + } + return false +} + +func IsInt64Blank(value int64) bool { + if value >= dcgmInt64Blank { + return true + } + return false +} + +func blank64(val *int64) *int64 { + if val != nil && IsInt64Blank(*val) { + return nil + } + return val +} + +func blank32(val *uint) *uint { + if val != nil && IsInt32Blank(int(*val)) { + return nil + } + return val +} + +func makeVersion1(struct_type uintptr) C.uint { + version := C.uint(struct_type | 1<<24) + return version +} + +func makeVersion2(struct_type uintptr) C.uint { + version := C.uint(struct_type | 2<<24) + return version +} + +func makeVersion3(struct_type uintptr) C.uint { + version := C.uint(struct_type | 3<<24) + return version +} + +func makeVersion4(struct_type uintptr) C.uint { + version := C.uint(struct_type | 4<<24) + return version +} + +func makeVersion8(struct_type uintptr) C.uint { + version := C.uint(struct_type | 8<<24) + return version +} + +func makeVersion9(struct_type uintptr) C.uint { + version := C.uint(struct_type | 9<<24) + return version +} + +func roundFloat(f *float64) *float64 { + var val float64 + if f != nil { + val = math.Round(*f) + } + return &val +} diff --git a/vendor/github.com/bits-and-blooms/bitset/.gitignore b/vendor/github.com/bits-and-blooms/bitset/.gitignore new file mode 100644 index 0000000000..5c204d28b0 
--- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/.gitignore @@ -0,0 +1,26 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe +*.test +*.prof + +target diff --git a/vendor/github.com/bits-and-blooms/bitset/.travis.yml b/vendor/github.com/bits-and-blooms/bitset/.travis.yml new file mode 100644 index 0000000000..094aa5ce07 --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/.travis.yml @@ -0,0 +1,37 @@ +language: go + +sudo: false + +branches: + except: + - release + +branches: + only: + - master + - travis + +go: + - "1.11.x" + - tip + +matrix: + allow_failures: + - go: tip + +before_install: + - if [ -n "$GH_USER" ]; then git config --global github.user ${GH_USER}; fi; + - if [ -n "$GH_TOKEN" ]; then git config --global github.token ${GH_TOKEN}; fi; + - go get github.com/mattn/goveralls + +before_script: + - make deps + +script: + - make qa + +after_failure: + - cat ./target/test/report.xml + +after_success: + - if [ "$TRAVIS_GO_VERSION" = "1.11.1" ]; then $HOME/gopath/bin/goveralls -covermode=count -coverprofile=target/report/coverage.out -service=travis-ci; fi; diff --git a/vendor/github.com/bits-and-blooms/bitset/LICENSE b/vendor/github.com/bits-and-blooms/bitset/LICENSE new file mode 100644 index 0000000000..59cab8a939 --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2014 Will Fitzgerald. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/vendor/github.com/bits-and-blooms/bitset/README.md b/vendor/github.com/bits-and-blooms/bitset/README.md new file mode 100644 index 0000000000..fe7bca65eb --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/README.md @@ -0,0 +1,159 @@ +# bitset + +*Go language library to map between non-negative integers and boolean values* + +[![Test](https://github.com/bits-and-blooms/bitset/workflows/Test/badge.svg)](https://github.com/willf/bitset/actions?query=workflow%3ATest) +[![Go Report Card](https://goreportcard.com/badge/github.com/willf/bitset)](https://goreportcard.com/report/github.com/willf/bitset) +[![PkgGoDev](https://pkg.go.dev/badge/github.com/bits-and-blooms/bitset?tab=doc)](https://pkg.go.dev/github.com/bits-and-blooms/bitset?tab=doc) + + +This library is part of the [awesome go collection](https://github.com/avelino/awesome-go). It is used in production by several important systems: + +* [beego](https://github.com/beego/beego) +* [CubeFS](https://github.com/cubefs/cubefs) +* [Amazon EKS Distro](https://github.com/aws/eks-distro) +* [sourcegraph](https://github.com/sourcegraph/sourcegraph) +* [torrent](https://github.com/anacrolix/torrent) + + +## Description + +Package bitset implements bitsets, a mapping between non-negative integers and boolean values. +It should be more efficient than map[uint] bool. + +It provides methods for setting, clearing, flipping, and testing individual integers. + +But it also provides set intersection, union, difference, complement, and symmetric operations, as well as tests to check whether any, all, or no bits are set, and querying a bitset's current length and number of positive bits. + +BitSets are expanded to the size of the largest set bit; the memory allocation is approximately Max bits, where Max is the largest set bit. BitSets are never shrunk. On creation, a hint can be given for the number of bits that will be used. 
+ +Many of the methods, including Set, Clear, and Flip, return a BitSet pointer, which allows for chaining. + +### Example use: + +```go +package main + +import ( + "fmt" + "math/rand" + + "github.com/bits-and-blooms/bitset" +) + +func main() { + fmt.Printf("Hello from BitSet!\n") + var b bitset.BitSet + // play some Go Fish + for i := 0; i < 100; i++ { + card1 := uint(rand.Intn(52)) + card2 := uint(rand.Intn(52)) + b.Set(card1) + if b.Test(card2) { + fmt.Println("Go Fish!") + } + b.Clear(card1) + } + + // Chaining + b.Set(10).Set(11) + + for i, e := b.NextSet(0); e; i, e = b.NextSet(i + 1) { + fmt.Println("The following bit is set:", i) + } + if b.Intersection(bitset.New(100).Set(10)).Count() == 1 { + fmt.Println("Intersection works.") + } else { + fmt.Println("Intersection doesn't work???") + } +} +``` + + +Package documentation is at: https://pkg.go.dev/github.com/bits-and-blooms/bitset?tab=doc + +## Serialization + + +You may serialize a bitset safely and portably to a stream +of bytes as follows: +```Go + const length = 9585 + const oneEvery = 97 + bs := bitset.New(length) + // Add some bits + for i := uint(0); i < length; i += oneEvery { + bs = bs.Set(i) + } + + var buf bytes.Buffer + n, err := bs.WriteTo(&buf) + if err != nil { + // failure + } + // Here n == buf.Len() +``` +You can later deserialize the result as follows: + +```Go + // Read back from buf + bs = bitset.New() + n, err = bs.ReadFrom(&buf) + if err != nil { + // error + } + // n is the number of bytes read +``` + +The `ReadFrom` function attempts to read the data into the existing +BitSet instance, to minimize memory allocations. + + +*Performance tip*: +When reading and writing to a file or a network connection, you may get better performance by +wrapping your streams with `bufio` instances. 
+ +E.g., +```Go + f, err := os.Create("myfile") + w := bufio.NewWriter(f) +``` +```Go + f, err := os.Open("myfile") + r := bufio.NewReader(f) +``` + +## Memory Usage + +The memory usage of a bitset using `N` bits is at least `N/8` bytes. The number of bits in a bitset is at least as large as one plus the greatest bit index you have accessed. Thus it is possible to run out of memory while using a bitset. If you have lots of bits, you might prefer compressed bitsets, like the [Roaring bitmaps](http://roaringbitmap.org) and its [Go implementation](https://github.com/RoaringBitmap/roaring). + +The `roaring` library allows you to go back and forth between compressed Roaring bitmaps and the conventional bitset instances: +```Go + mybitset := roaringbitmap.ToBitSet() + newroaringbitmap := roaring.FromBitSet(mybitset) +``` + + +## Implementation Note + +Go 1.9 introduced a native `math/bits` library. We provide backward compatibility to Go 1.7, which might be removed. + +It is possible that a later version will match the `math/bits` return signature for counts (which is `int`, rather than our library's `uint64`). If so, the version will be bumped. + +## Installation + +```bash +go get github.com/bits-and-blooms/bitset +``` + +## Contributing + +If you wish to contribute to this project, please branch and issue a pull request against master ("[GitHub Flow](https://guides.github.com/introduction/flow/)") + +## Running all tests + +Before committing the code, please check if it passes tests, has adequate coverage, etc. +```bash +go test +go test -cover +``` diff --git a/vendor/github.com/bits-and-blooms/bitset/SECURITY.md b/vendor/github.com/bits-and-blooms/bitset/SECURITY.md new file mode 100644 index 0000000000..f888420c3b --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/SECURITY.md @@ -0,0 +1,5 @@ +# Security Policy + +## Reporting a Vulnerability + +You can report privately a vulnerability by email at daniel@lemire.me (current maintainer). 
diff --git a/vendor/github.com/bits-and-blooms/bitset/azure-pipelines.yml b/vendor/github.com/bits-and-blooms/bitset/azure-pipelines.yml new file mode 100644 index 0000000000..f9b2959184 --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/azure-pipelines.yml @@ -0,0 +1,39 @@ +# Go +# Build your Go project. +# Add steps that test, save build artifacts, deploy, and more: +# https://docs.microsoft.com/azure/devops/pipelines/languages/go + +trigger: +- master + +pool: + vmImage: 'Ubuntu-16.04' + +variables: + GOBIN: '$(GOPATH)/bin' # Go binaries path + GOROOT: '/usr/local/go1.11' # Go installation path + GOPATH: '$(system.defaultWorkingDirectory)/gopath' # Go workspace path + modulePath: '$(GOPATH)/src/github.com/$(build.repository.name)' # Path to the module's code + +steps: +- script: | + mkdir -p '$(GOBIN)' + mkdir -p '$(GOPATH)/pkg' + mkdir -p '$(modulePath)' + shopt -s extglob + shopt -s dotglob + mv !(gopath) '$(modulePath)' + echo '##vso[task.prependpath]$(GOBIN)' + echo '##vso[task.prependpath]$(GOROOT)/bin' + displayName: 'Set up the Go workspace' + +- script: | + go version + go get -v -t -d ./... + if [ -f Gopkg.toml ]; then + curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh + dep ensure + fi + go build -v . + workingDirectory: '$(modulePath)' + displayName: 'Get dependencies, then build' diff --git a/vendor/github.com/bits-and-blooms/bitset/bitset.go b/vendor/github.com/bits-and-blooms/bitset/bitset.go new file mode 100644 index 0000000000..9f38ed3a9d --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/bitset.go @@ -0,0 +1,1184 @@ +/* +Package bitset implements bitsets, a mapping +between non-negative integers and boolean values. It should be more +efficient than map[uint] bool. + +It provides methods for setting, clearing, flipping, and testing +individual integers. 
+ +But it also provides set intersection, union, difference, +complement, and symmetric operations, as well as tests to +check whether any, all, or no bits are set, and querying a +bitset's current length and number of positive bits. + +BitSets are expanded to the size of the largest set bit; the +memory allocation is approximately Max bits, where Max is +the largest set bit. BitSets are never shrunk. On creation, +a hint can be given for the number of bits that will be used. + +Many of the methods, including Set,Clear, and Flip, return +a BitSet pointer, which allows for chaining. + +Example use: + + import "bitset" + var b BitSet + b.Set(10).Set(11) + if b.Test(1000) { + b.Clear(1000) + } + if B.Intersection(bitset.New(100).Set(10)).Count() > 1 { + fmt.Println("Intersection works.") + } + +As an alternative to BitSets, one should check out the 'big' package, +which provides a (less set-theoretical) view of bitsets. +*/ +package bitset + +import ( + "bytes" + "encoding/base64" + "encoding/binary" + "encoding/json" + "errors" + "fmt" + "io" + "strconv" +) + +// the wordSize of a bit set +const wordSize = uint(64) + +// the wordSize of a bit set in bytes +const wordBytes = wordSize / 8 + +// log2WordSize is lg(wordSize) +const log2WordSize = uint(6) + +// allBits has every bit set +const allBits uint64 = 0xffffffffffffffff + +// default binary BigEndian +var binaryOrder binary.ByteOrder = binary.BigEndian + +// default json encoding base64.URLEncoding +var base64Encoding = base64.URLEncoding + +// Base64StdEncoding Marshal/Unmarshal BitSet with base64.StdEncoding(Default: base64.URLEncoding) +func Base64StdEncoding() { base64Encoding = base64.StdEncoding } + +// LittleEndian Marshal/Unmarshal Binary as Little Endian(Default: binary.BigEndian) +func LittleEndian() { binaryOrder = binary.LittleEndian } + +// A BitSet is a set of bits. The zero value of a BitSet is an empty set of length 0. 
+type BitSet struct { + length uint + set []uint64 +} + +// Error is used to distinguish errors (panics) generated in this package. +type Error string + +// safeSet will fixup b.set to be non-nil and return the field value +func (b *BitSet) safeSet() []uint64 { + if b.set == nil { + b.set = make([]uint64, wordsNeeded(0)) + } + return b.set +} + +// SetBitsetFrom fills the bitset with an array of integers without creating a new BitSet instance +func (b *BitSet) SetBitsetFrom(buf []uint64) { + b.length = uint(len(buf)) * 64 + b.set = buf +} + +// From is a constructor used to create a BitSet from an array of words +func From(buf []uint64) *BitSet { + return FromWithLength(uint(len(buf))*64, buf) +} + +// FromWithLength constructs from an array of words and length. +func FromWithLength(len uint, set []uint64) *BitSet { + return &BitSet{len, set} +} + +// Bytes returns the bitset as array of words +func (b *BitSet) Bytes() []uint64 { + return b.set +} + +// wordsNeeded calculates the number of words needed for i bits +func wordsNeeded(i uint) int { + if i > (Cap() - wordSize + 1) { + return int(Cap() >> log2WordSize) + } + return int((i + (wordSize - 1)) >> log2WordSize) +} + +// wordsNeededUnbound calculates the number of words needed for i bits, possibly exceeding the capacity. +// This function is useful if you know that the capacity cannot be exceeded (e.g., you have an existing bitmap). 
+func wordsNeededUnbound(i uint) int { + return int((i + (wordSize - 1)) >> log2WordSize) +} + +// wordsIndex calculates the index of words in a `uint64` +func wordsIndex(i uint) uint { + return i & (wordSize - 1) +} + +// New creates a new BitSet with a hint that length bits will be required +func New(length uint) (bset *BitSet) { + defer func() { + if r := recover(); r != nil { + bset = &BitSet{ + 0, + make([]uint64, 0), + } + } + }() + + bset = &BitSet{ + length, + make([]uint64, wordsNeeded(length)), + } + + return bset +} + +// Cap returns the total possible capacity, or number of bits +func Cap() uint { + return ^uint(0) +} + +// Len returns the number of bits in the BitSet. +// Note the difference to method Count, see example. +func (b *BitSet) Len() uint { + return b.length +} + +// extendSet adds additional words to incorporate new bits if needed +func (b *BitSet) extendSet(i uint) { + if i >= Cap() { + panic("You are exceeding the capacity") + } + nsize := wordsNeeded(i + 1) + if b.set == nil { + b.set = make([]uint64, nsize) + } else if cap(b.set) >= nsize { + b.set = b.set[:nsize] // fast resize + } else if len(b.set) < nsize { + newset := make([]uint64, nsize, 2*nsize) // increase capacity 2x + copy(newset, b.set) + b.set = newset + } + b.length = i + 1 +} + +// Test whether bit i is set. +func (b *BitSet) Test(i uint) bool { + if i >= b.length { + return false + } + return b.set[i>>log2WordSize]&(1<= Cap(), this function will panic. +// Warning: using a very large value for 'i' +// may lead to a memory shortage and a panic: the caller is responsible +// for providing sensible parameters in line with their memory capacity. 
+func (b *BitSet) Set(i uint) *BitSet { + if i >= b.length { // if we need more bits, make 'em + b.extendSet(i) + } + b.set[i>>log2WordSize] |= 1 << wordsIndex(i) + return b +} + +// Clear bit i to 0 +func (b *BitSet) Clear(i uint) *BitSet { + if i >= b.length { + return b + } + b.set[i>>log2WordSize] &^= 1 << wordsIndex(i) + return b +} + +// SetTo sets bit i to value. +// If i>= Cap(), this function will panic. +// Warning: using a very large value for 'i' +// may lead to a memory shortage and a panic: the caller is responsible +// for providing sensible parameters in line with their memory capacity. +func (b *BitSet) SetTo(i uint, value bool) *BitSet { + if value { + return b.Set(i) + } + return b.Clear(i) +} + +// Flip bit at i. +// If i>= Cap(), this function will panic. +// Warning: using a very large value for 'i' +// may lead to a memory shortage and a panic: the caller is responsible +// for providing sensible parameters in line with their memory capacity. +func (b *BitSet) Flip(i uint) *BitSet { + if i >= b.length { + return b.Set(i) + } + b.set[i>>log2WordSize] ^= 1 << wordsIndex(i) + return b +} + +// FlipRange bit in [start, end). +// If end>= Cap(), this function will panic. +// Warning: using a very large value for 'end' +// may lead to a memory shortage and a panic: the caller is responsible +// for providing sensible parameters in line with their memory capacity. 
+func (b *BitSet) FlipRange(start, end uint) *BitSet { + if start >= end { + return b + } + if end-1 >= b.length { // if we need more bits, make 'em + b.extendSet(end - 1) + } + var startWord uint = start >> log2WordSize + var endWord uint = end >> log2WordSize + b.set[startWord] ^= ^(^uint64(0) << wordsIndex(start)) + if endWord > 0 { + // bounds check elimination + data := b.set + _ = data[endWord-1] + for i := startWord; i < endWord; i++ { + data[i] = ^data[i] + } + } + if end&(wordSize-1) != 0 { + b.set[endWord] ^= ^uint64(0) >> wordsIndex(-end) + } + return b +} + +// Shrink shrinks BitSet so that the provided value is the last possible +// set value. It clears all bits > the provided index and reduces the size +// and length of the set. +// +// Note that the parameter value is not the new length in bits: it is the +// maximal value that can be stored in the bitset after the function call. +// The new length in bits is the parameter value + 1. Thus it is not possible +// to use this function to set the length to 0, the minimal value of the length +// after this function call is 1. +// +// A new slice is allocated to store the new bits, so you may see an increase in +// memory usage until the GC runs. Normally this should not be a problem, but if you +// have an extremely large BitSet its important to understand that the old BitSet will +// remain in memory until the GC frees it. +func (b *BitSet) Shrink(lastbitindex uint) *BitSet { + length := lastbitindex + 1 + idx := wordsNeeded(length) + if idx > len(b.set) { + return b + } + shrunk := make([]uint64, idx) + copy(shrunk, b.set[:idx]) + b.set = shrunk + b.length = length + lastWordUsedBits := length % 64 + if lastWordUsedBits != 0 { + b.set[idx-1] &= allBits >> uint64(64-wordsIndex(lastWordUsedBits)) + } + return b +} + +// Compact shrinks BitSet to so that we preserve all set bits, while minimizing +// memory usage. Compact calls Shrink. 
+func (b *BitSet) Compact() *BitSet { + idx := len(b.set) - 1 + for ; idx >= 0 && b.set[idx] == 0; idx-- { + } + newlength := uint((idx + 1) << log2WordSize) + if newlength >= b.length { + return b // nothing to do + } + if newlength > 0 { + return b.Shrink(newlength - 1) + } + // We preserve one word + return b.Shrink(63) +} + +// InsertAt takes an index which indicates where a bit should be +// inserted. Then it shifts all the bits in the set to the left by 1, starting +// from the given index position, and sets the index position to 0. +// +// Depending on the size of your BitSet, and where you are inserting the new entry, +// this method could be extremely slow and in some cases might cause the entire BitSet +// to be recopied. +func (b *BitSet) InsertAt(idx uint) *BitSet { + insertAtElement := idx >> log2WordSize + + // if length of set is a multiple of wordSize we need to allocate more space first + if b.isLenExactMultiple() { + b.set = append(b.set, uint64(0)) + } + + var i uint + for i = uint(len(b.set) - 1); i > insertAtElement; i-- { + // all elements above the position where we want to insert can simply by shifted + b.set[i] <<= 1 + + // we take the most significant bit of the previous element and set it as + // the least significant bit of the current element + b.set[i] |= (b.set[i-1] & 0x8000000000000000) >> 63 + } + + // generate a mask to extract the data that we need to shift left + // within the element where we insert a bit + dataMask := uint64(1)< 0x40000 { + buffer.WriteString("...") + break + } + buffer.WriteString(strconv.FormatInt(int64(i), 10)) + i, e = b.NextSet(i + 1) + if e { + buffer.WriteString(",") + } + } + buffer.WriteString("}") + return buffer.String() +} + +// DeleteAt deletes the bit at the given index position from +// within the bitset +// All the bits residing on the left of the deleted bit get +// shifted right by 1 +// The running time of this operation may potentially be +// relatively slow, O(length) +func (b *BitSet) 
DeleteAt(i uint) *BitSet { + // the index of the slice element where we'll delete a bit + deleteAtElement := i >> log2WordSize + + // generate a mask for the data that needs to be shifted right + // within that slice element that gets modified + dataMask := ^((uint64(1) << wordsIndex(i)) - 1) + + // extract the data that we'll shift right from the slice element + data := b.set[deleteAtElement] & dataMask + + // set the masked area to 0 while leaving the rest as it is + b.set[deleteAtElement] &= ^dataMask + + // shift the previously extracted data to the right and then + // set it in the previously masked area + b.set[deleteAtElement] |= (data >> 1) & dataMask + + // loop over all the consecutive slice elements to copy each + // lowest bit into the highest position of the previous element, + // then shift the entire content to the right by 1 + for i := int(deleteAtElement) + 1; i < len(b.set); i++ { + b.set[i-1] |= (b.set[i] & 1) << 63 + b.set[i] >>= 1 + } + + b.length = b.length - 1 + + return b +} + +// NextSet returns the next bit set from the specified index, +// including possibly the current index +// along with an error code (true = valid, false = no set bit found) +// for i,e := v.NextSet(0); e; i,e = v.NextSet(i + 1) {...} +// +// Users concerned with performance may want to use NextSetMany to +// retrieve several values at once. +func (b *BitSet) NextSet(i uint) (uint, bool) { + x := int(i >> log2WordSize) + if x >= len(b.set) { + return 0, false + } + w := b.set[x] + w = w >> wordsIndex(i) + if w != 0 { + return i + trailingZeroes64(w), true + } + x++ + // bounds check elimination in the loop + if x < 0 { + return 0, false + } + for x < len(b.set) { + if b.set[x] != 0 { + return uint(x)*wordSize + trailingZeroes64(b.set[x]), true + } + x++ + + } + return 0, false +} + +// NextSetMany returns many next bit sets from the specified index, +// including possibly the current index and up to cap(buffer). 
+// If the returned slice has len zero, then no more set bits were found +// +// buffer := make([]uint, 256) // this should be reused +// j := uint(0) +// j, buffer = bitmap.NextSetMany(j, buffer) +// for ; len(buffer) > 0; j, buffer = bitmap.NextSetMany(j,buffer) { +// for k := range buffer { +// do something with buffer[k] +// } +// j += 1 +// } +// +// It is possible to retrieve all set bits as follow: +// +// indices := make([]uint, bitmap.Count()) +// bitmap.NextSetMany(0, indices) +// +// However if bitmap.Count() is large, it might be preferable to +// use several calls to NextSetMany, for performance reasons. +func (b *BitSet) NextSetMany(i uint, buffer []uint) (uint, []uint) { + myanswer := buffer + capacity := cap(buffer) + x := int(i >> log2WordSize) + if x >= len(b.set) || capacity == 0 { + return 0, myanswer[:0] + } + skip := wordsIndex(i) + word := b.set[x] >> skip + myanswer = myanswer[:capacity] + size := int(0) + for word != 0 { + r := trailingZeroes64(word) + t := word & ((^word) + 1) + myanswer[size] = r + i + size++ + if size == capacity { + goto End + } + word = word ^ t + } + x++ + for idx, word := range b.set[x:] { + for word != 0 { + r := trailingZeroes64(word) + t := word & ((^word) + 1) + myanswer[size] = r + (uint(x+idx) << 6) + size++ + if size == capacity { + goto End + } + word = word ^ t + } + } +End: + if size > 0 { + return myanswer[size-1], myanswer[:size] + } + return 0, myanswer[:0] +} + +// NextClear returns the next clear bit from the specified index, +// including possibly the current index +// along with an error code (true = valid, false = no bit found i.e. 
all bits are set) +func (b *BitSet) NextClear(i uint) (uint, bool) { + x := int(i >> log2WordSize) + if x >= len(b.set) { + return 0, false + } + w := b.set[x] + w = w >> wordsIndex(i) + wA := allBits >> wordsIndex(i) + index := i + trailingZeroes64(^w) + if w != wA && index < b.length { + return index, true + } + x++ + // bounds check elimination in the loop + if x < 0 { + return 0, false + } + for x < len(b.set) { + if b.set[x] != allBits { + index = uint(x)*wordSize + trailingZeroes64(^b.set[x]) + if index < b.length { + return index, true + } + } + x++ + } + return 0, false +} + +// ClearAll clears the entire BitSet +func (b *BitSet) ClearAll() *BitSet { + if b != nil && b.set != nil { + for i := range b.set { + b.set[i] = 0 + } + } + return b +} + +// SetAll sets the entire BitSet +func (b *BitSet) SetAll() *BitSet { + if b != nil && b.set != nil { + for i := range b.set { + b.set[i] = allBits + } + + b.cleanLastWord() + } + return b +} + +// wordCount returns the number of words used in a bit set +func (b *BitSet) wordCount() int { + return wordsNeededUnbound(b.length) +} + +// Clone this BitSet +func (b *BitSet) Clone() *BitSet { + c := New(b.length) + if b.set != nil { // Clone should not modify current object + copy(c.set, b.set) + } + return c +} + +// Copy into a destination BitSet using the Go array copy semantics: +// the number of bits copied is the minimum of the number of bits in the current +// BitSet (Len()) and the destination Bitset. +// We return the number of bits copied in the destination BitSet. +func (b *BitSet) Copy(c *BitSet) (count uint) { + if c == nil { + return + } + if b.set != nil { // Copy should not modify current object + copy(c.set, b.set) + } + count = c.length + if b.length < c.length { + count = b.length + } + // Cleaning the last word is needed to keep the invariant that other functions, such as Count, require + // that any bits in the last word that would exceed the length of the bitmask are set to 0. 
+ c.cleanLastWord() + return +} + +// CopyFull copies into a destination BitSet such that the destination is +// identical to the source after the operation, allocating memory if necessary. +func (b *BitSet) CopyFull(c *BitSet) { + if c == nil { + return + } + c.length = b.length + if len(b.set) == 0 { + if c.set != nil { + c.set = c.set[:0] + } + } else { + if cap(c.set) < len(b.set) { + c.set = make([]uint64, len(b.set)) + } else { + c.set = c.set[:len(b.set)] + } + copy(c.set, b.set) + } +} + +// Count (number of set bits). +// Also known as "popcount" or "population count". +func (b *BitSet) Count() uint { + if b != nil && b.set != nil { + return uint(popcntSlice(b.set)) + } + return 0 +} + +// Equal tests the equivalence of two BitSets. +// False if they are of different sizes, otherwise true +// only if all the same bits are set +func (b *BitSet) Equal(c *BitSet) bool { + if c == nil || b == nil { + return c == b + } + if b.length != c.length { + return false + } + if b.length == 0 { // if they have both length == 0, then could have nil set + return true + } + wn := b.wordCount() + // bounds check elimination + if wn <= 0 { + return true + } + _ = b.set[wn-1] + _ = c.set[wn-1] + for p := 0; p < wn; p++ { + if c.set[p] != b.set[p] { + return false + } + } + return true +} + +func panicIfNull(b *BitSet) { + if b == nil { + panic(Error("BitSet must not be null")) + } +} + +// Difference of base set and other set +// This is the BitSet equivalent of &^ (and not) +func (b *BitSet) Difference(compare *BitSet) (result *BitSet) { + panicIfNull(b) + panicIfNull(compare) + result = b.Clone() // clone b (in case b is bigger than compare) + l := compare.wordCount() + if l > b.wordCount() { + l = b.wordCount() + } + for i := 0; i < l; i++ { + result.set[i] = b.set[i] &^ compare.set[i] + } + return +} + +// DifferenceCardinality computes the cardinality of the differnce +func (b *BitSet) DifferenceCardinality(compare *BitSet) uint { + panicIfNull(b) + panicIfNull(compare) 
+ l := compare.wordCount() + if l > b.wordCount() { + l = b.wordCount() + } + cnt := uint64(0) + cnt += popcntMaskSlice(b.set[:l], compare.set[:l]) + cnt += popcntSlice(b.set[l:]) + return uint(cnt) +} + +// InPlaceDifference computes the difference of base set and other set +// This is the BitSet equivalent of &^ (and not) +func (b *BitSet) InPlaceDifference(compare *BitSet) { + panicIfNull(b) + panicIfNull(compare) + l := compare.wordCount() + if l > b.wordCount() { + l = b.wordCount() + } + if l <= 0 { + return + } + // bounds check elimination + data, cmpData := b.set, compare.set + _ = data[l-1] + _ = cmpData[l-1] + for i := 0; i < l; i++ { + data[i] &^= cmpData[i] + } +} + +// Convenience function: return two bitsets ordered by +// increasing length. Note: neither can be nil +func sortByLength(a *BitSet, b *BitSet) (ap *BitSet, bp *BitSet) { + if a.length <= b.length { + ap, bp = a, b + } else { + ap, bp = b, a + } + return +} + +// Intersection of base set and other set +// This is the BitSet equivalent of & (and) +func (b *BitSet) Intersection(compare *BitSet) (result *BitSet) { + panicIfNull(b) + panicIfNull(compare) + b, compare = sortByLength(b, compare) + result = New(b.length) + for i, word := range b.set { + result.set[i] = word & compare.set[i] + } + return +} + +// IntersectionCardinality computes the cardinality of the union +func (b *BitSet) IntersectionCardinality(compare *BitSet) uint { + panicIfNull(b) + panicIfNull(compare) + b, compare = sortByLength(b, compare) + cnt := popcntAndSlice(b.set, compare.set) + return uint(cnt) +} + +// InPlaceIntersection destructively computes the intersection of +// base set and the compare set. 
+// This is the BitSet equivalent of & (and) +func (b *BitSet) InPlaceIntersection(compare *BitSet) { + panicIfNull(b) + panicIfNull(compare) + l := compare.wordCount() + if l > b.wordCount() { + l = b.wordCount() + } + if l > 0 { + // bounds check elimination + data, cmpData := b.set, compare.set + _ = data[l-1] + _ = cmpData[l-1] + + for i := 0; i < l; i++ { + data[i] &= cmpData[i] + } + } + if l >= 0 { + for i := l; i < len(b.set); i++ { + b.set[i] = 0 + } + } + if compare.length > 0 { + if compare.length-1 >= b.length { + b.extendSet(compare.length - 1) + } + } +} + +// Union of base set and other set +// This is the BitSet equivalent of | (or) +func (b *BitSet) Union(compare *BitSet) (result *BitSet) { + panicIfNull(b) + panicIfNull(compare) + b, compare = sortByLength(b, compare) + result = compare.Clone() + for i, word := range b.set { + result.set[i] = word | compare.set[i] + } + return +} + +// UnionCardinality computes the cardinality of the uniton of the base set +// and the compare set. +func (b *BitSet) UnionCardinality(compare *BitSet) uint { + panicIfNull(b) + panicIfNull(compare) + b, compare = sortByLength(b, compare) + cnt := popcntOrSlice(b.set, compare.set) + if len(compare.set) > len(b.set) { + cnt += popcntSlice(compare.set[len(b.set):]) + } + return uint(cnt) +} + +// InPlaceUnion creates the destructive union of base set and compare set. +// This is the BitSet equivalent of | (or). 
+func (b *BitSet) InPlaceUnion(compare *BitSet) { + panicIfNull(b) + panicIfNull(compare) + l := compare.wordCount() + if l > b.wordCount() { + l = b.wordCount() + } + if compare.length > 0 && compare.length-1 >= b.length { + b.extendSet(compare.length - 1) + } + if l > 0 { + // bounds check elimination + data, cmpData := b.set, compare.set + _ = data[l-1] + _ = cmpData[l-1] + + for i := 0; i < l; i++ { + data[i] |= cmpData[i] + } + } + if len(compare.set) > l { + for i := l; i < len(compare.set); i++ { + b.set[i] = compare.set[i] + } + } +} + +// SymmetricDifference of base set and other set +// This is the BitSet equivalent of ^ (xor) +func (b *BitSet) SymmetricDifference(compare *BitSet) (result *BitSet) { + panicIfNull(b) + panicIfNull(compare) + b, compare = sortByLength(b, compare) + // compare is bigger, so clone it + result = compare.Clone() + for i, word := range b.set { + result.set[i] = word ^ compare.set[i] + } + return +} + +// SymmetricDifferenceCardinality computes the cardinality of the symmetric difference +func (b *BitSet) SymmetricDifferenceCardinality(compare *BitSet) uint { + panicIfNull(b) + panicIfNull(compare) + b, compare = sortByLength(b, compare) + cnt := popcntXorSlice(b.set, compare.set) + if len(compare.set) > len(b.set) { + cnt += popcntSlice(compare.set[len(b.set):]) + } + return uint(cnt) +} + +// InPlaceSymmetricDifference creates the destructive SymmetricDifference of base set and other set +// This is the BitSet equivalent of ^ (xor) +func (b *BitSet) InPlaceSymmetricDifference(compare *BitSet) { + panicIfNull(b) + panicIfNull(compare) + l := compare.wordCount() + if l > b.wordCount() { + l = b.wordCount() + } + if compare.length > 0 && compare.length-1 >= b.length { + b.extendSet(compare.length - 1) + } + if l > 0 { + // bounds check elimination + data, cmpData := b.set, compare.set + _ = data[l-1] + _ = cmpData[l-1] + for i := 0; i < l; i++ { + data[i] ^= cmpData[i] + } + } + if len(compare.set) > l { + for i := l; i < 
len(compare.set); i++ { + b.set[i] = compare.set[i] + } + } +} + +// Is the length an exact multiple of word sizes? +func (b *BitSet) isLenExactMultiple() bool { + return wordsIndex(b.length) == 0 +} + +// Clean last word by setting unused bits to 0 +func (b *BitSet) cleanLastWord() { + if !b.isLenExactMultiple() { + b.set[len(b.set)-1] &= allBits >> (wordSize - wordsIndex(b.length)) + } +} + +// Complement computes the (local) complement of a bitset (up to length bits) +func (b *BitSet) Complement() (result *BitSet) { + panicIfNull(b) + result = New(b.length) + for i, word := range b.set { + result.set[i] = ^word + } + result.cleanLastWord() + return +} + +// All returns true if all bits are set, false otherwise. Returns true for +// empty sets. +func (b *BitSet) All() bool { + panicIfNull(b) + return b.Count() == b.length +} + +// None returns true if no bit is set, false otherwise. Returns true for +// empty sets. +func (b *BitSet) None() bool { + panicIfNull(b) + if b != nil && b.set != nil { + for _, word := range b.set { + if word > 0 { + return false + } + } + } + return true +} + +// Any returns true if any bit is set, false otherwise +func (b *BitSet) Any() bool { + panicIfNull(b) + return !b.None() +} + +// IsSuperSet returns true if this is a superset of the other set +func (b *BitSet) IsSuperSet(other *BitSet) bool { + l := other.wordCount() + if b.wordCount() < l { + l = b.wordCount() + } + for i, word := range other.set[:l] { + if b.set[i]&word != word { + return false + } + } + return popcntSlice(other.set[l:]) == 0 +} + +// IsStrictSuperSet returns true if this is a strict superset of the other set +func (b *BitSet) IsStrictSuperSet(other *BitSet) bool { + return b.Count() > other.Count() && b.IsSuperSet(other) +} + +// DumpAsBits dumps a bit set as a string of bits. Following the usual convention in Go, +// the least significant bits are printed last (index 0 is at the end of the string). 
+func (b *BitSet) DumpAsBits() string { + if b.set == nil { + return "." + } + buffer := bytes.NewBufferString("") + i := len(b.set) - 1 + for ; i >= 0; i-- { + fmt.Fprintf(buffer, "%064b.", b.set[i]) + } + return buffer.String() +} + +// BinaryStorageSize returns the binary storage requirements (see WriteTo) in bytes. +func (b *BitSet) BinaryStorageSize() int { + return int(wordBytes + wordBytes*uint(b.wordCount())) +} + +func readUint64Array(reader io.Reader, data []uint64) error { + length := len(data) + bufferSize := 128 + buffer := make([]byte, bufferSize*int(wordBytes)) + for i := 0; i < length; i += bufferSize { + end := i + bufferSize + if end > length { + end = length + buffer = buffer[:wordBytes*uint(end-i)] + } + chunk := data[i:end] + if _, err := io.ReadFull(reader, buffer); err != nil { + return err + } + for i := range chunk { + chunk[i] = uint64(binaryOrder.Uint64(buffer[8*i:])) + } + } + return nil +} + +func writeUint64Array(writer io.Writer, data []uint64) error { + bufferSize := 128 + buffer := make([]byte, bufferSize*int(wordBytes)) + for i := 0; i < len(data); i += bufferSize { + end := i + bufferSize + if end > len(data) { + end = len(data) + buffer = buffer[:wordBytes*uint(end-i)] + } + chunk := data[i:end] + for i, x := range chunk { + binaryOrder.PutUint64(buffer[8*i:], x) + } + _, err := writer.Write(buffer) + if err != nil { + return err + } + } + return nil +} + +// WriteTo writes a BitSet to a stream. The format is: +// 1. uint64 length +// 2. []uint64 set +// Upon success, the number of bytes written is returned. +// +// Performance: if this function is used to write to a disk or network +// connection, it might be beneficial to wrap the stream in a bufio.Writer. 
+// E.g., +// +// f, err := os.Create("myfile") +// w := bufio.NewWriter(f) +func (b *BitSet) WriteTo(stream io.Writer) (int64, error) { + length := uint64(b.length) + // Write length + err := binary.Write(stream, binaryOrder, &length) + if err != nil { + // Upon failure, we do not guarantee that we + // return the number of bytes written. + return int64(0), err + } + err = writeUint64Array(stream, b.set[:b.wordCount()]) + if err != nil { + // Upon failure, we do not guarantee that we + // return the number of bytes written. + return int64(wordBytes), err + } + return int64(b.BinaryStorageSize()), nil +} + +// ReadFrom reads a BitSet from a stream written using WriteTo +// The format is: +// 1. uint64 length +// 2. []uint64 set +// Upon success, the number of bytes read is returned. +// If the current BitSet is not large enough to hold the data, +// it is extended. In case of error, the BitSet is either +// left unchanged or made empty if the error occurs too late +// to preserve the content. +// +// Performance: if this function is used to read from a disk or network +// connection, it might be beneficial to wrap the stream in a bufio.Reader. +// E.g., +// +// f, err := os.Open("myfile") +// r := bufio.NewReader(f) +func (b *BitSet) ReadFrom(stream io.Reader) (int64, error) { + var length uint64 + err := binary.Read(stream, binaryOrder, &length) + if err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + return 0, err + } + newlength := uint(length) + + if uint64(newlength) != length { + return 0, errors.New("unmarshalling error: type mismatch") + } + nWords := wordsNeeded(uint(newlength)) + if cap(b.set) >= nWords { + b.set = b.set[:nWords] + } else { + b.set = make([]uint64, nWords) + } + + b.length = newlength + + err = readUint64Array(stream, b.set) + if err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } + // We do not want to leave the BitSet partially filled as + // it is error prone. 
+ b.set = b.set[:0] + b.length = 0 + return 0, err + } + + return int64(b.BinaryStorageSize()), nil +} + +// MarshalBinary encodes a BitSet into a binary form and returns the result. +func (b *BitSet) MarshalBinary() ([]byte, error) { + var buf bytes.Buffer + _, err := b.WriteTo(&buf) + if err != nil { + return []byte{}, err + } + + return buf.Bytes(), err +} + +// UnmarshalBinary decodes the binary form generated by MarshalBinary. +func (b *BitSet) UnmarshalBinary(data []byte) error { + buf := bytes.NewReader(data) + _, err := b.ReadFrom(buf) + return err +} + +// MarshalJSON marshals a BitSet as a JSON structure +func (b BitSet) MarshalJSON() ([]byte, error) { + buffer := bytes.NewBuffer(make([]byte, 0, b.BinaryStorageSize())) + _, err := b.WriteTo(buffer) + if err != nil { + return nil, err + } + + // URLEncode all bytes + return json.Marshal(base64Encoding.EncodeToString(buffer.Bytes())) +} + +// UnmarshalJSON unmarshals a BitSet from JSON created using MarshalJSON +func (b *BitSet) UnmarshalJSON(data []byte) error { + // Unmarshal as string + var s string + err := json.Unmarshal(data, &s) + if err != nil { + return err + } + + // URLDecode string + buf, err := base64Encoding.DecodeString(s) + if err != nil { + return err + } + + _, err = b.ReadFrom(bytes.NewReader(buf)) + return err +} + +// Rank returns the nunber of set bits up to and including the index +// that are set in the bitset. +// See https://en.wikipedia.org/wiki/Ranking#Ranking_in_statistics +func (b *BitSet) Rank(index uint) uint { + if index >= b.length { + return b.Count() + } + leftover := (index + 1) & 63 + answer := uint(popcntSlice(b.set[:(index+1)>>6])) + if leftover != 0 { + answer += uint(popcount(b.set[(index+1)>>6] << (64 - leftover))) + } + return answer +} + +// Select returns the index of the jth set bit, where j is the argument. +// The caller is responsible to ensure that 0 <= j < Count(): when j is +// out of range, the function returns the length of the bitset (b.length). 
+// +// Note that this function differs in convention from the Rank function which +// returns 1 when ranking the smallest value. We follow the conventional +// textbook definition of Select and Rank. +func (b *BitSet) Select(index uint) uint { + leftover := index + for idx, word := range b.set { + w := uint(popcount(word)) + if w > leftover { + return uint(idx)*64 + select64(word, leftover) + } + leftover -= w + } + return b.length +} diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt.go b/vendor/github.com/bits-and-blooms/bitset/popcnt.go new file mode 100644 index 0000000000..76577a8382 --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/popcnt.go @@ -0,0 +1,53 @@ +package bitset + +// bit population count, take from +// https://code.google.com/p/go/issues/detail?id=4988#c11 +// credit: https://code.google.com/u/arnehormann/ +func popcount(x uint64) (n uint64) { + x -= (x >> 1) & 0x5555555555555555 + x = (x>>2)&0x3333333333333333 + x&0x3333333333333333 + x += x >> 4 + x &= 0x0f0f0f0f0f0f0f0f + x *= 0x0101010101010101 + return x >> 56 +} + +func popcntSliceGo(s []uint64) uint64 { + cnt := uint64(0) + for _, x := range s { + cnt += popcount(x) + } + return cnt +} + +func popcntMaskSliceGo(s, m []uint64) uint64 { + cnt := uint64(0) + for i := range s { + cnt += popcount(s[i] &^ m[i]) + } + return cnt +} + +func popcntAndSliceGo(s, m []uint64) uint64 { + cnt := uint64(0) + for i := range s { + cnt += popcount(s[i] & m[i]) + } + return cnt +} + +func popcntOrSliceGo(s, m []uint64) uint64 { + cnt := uint64(0) + for i := range s { + cnt += popcount(s[i] | m[i]) + } + return cnt +} + +func popcntXorSliceGo(s, m []uint64) uint64 { + cnt := uint64(0) + for i := range s { + cnt += popcount(s[i] ^ m[i]) + } + return cnt +} diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go b/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go new file mode 100644 index 0000000000..7855c04b5b --- /dev/null +++ 
b/vendor/github.com/bits-and-blooms/bitset/popcnt_19.go @@ -0,0 +1,62 @@ +//go:build go1.9 +// +build go1.9 + +package bitset + +import "math/bits" + +func popcntSlice(s []uint64) uint64 { + var cnt int + for _, x := range s { + cnt += bits.OnesCount64(x) + } + return uint64(cnt) +} + +func popcntMaskSlice(s, m []uint64) uint64 { + var cnt int + // this explicit check eliminates a bounds check in the loop + if len(m) < len(s) { + panic("mask slice is too short") + } + for i := range s { + cnt += bits.OnesCount64(s[i] &^ m[i]) + } + return uint64(cnt) +} + +func popcntAndSlice(s, m []uint64) uint64 { + var cnt int + // this explicit check eliminates a bounds check in the loop + if len(m) < len(s) { + panic("mask slice is too short") + } + for i := range s { + cnt += bits.OnesCount64(s[i] & m[i]) + } + return uint64(cnt) +} + +func popcntOrSlice(s, m []uint64) uint64 { + var cnt int + // this explicit check eliminates a bounds check in the loop + if len(m) < len(s) { + panic("mask slice is too short") + } + for i := range s { + cnt += bits.OnesCount64(s[i] | m[i]) + } + return uint64(cnt) +} + +func popcntXorSlice(s, m []uint64) uint64 { + var cnt int + // this explicit check eliminates a bounds check in the loop + if len(m) < len(s) { + panic("mask slice is too short") + } + for i := range s { + cnt += bits.OnesCount64(s[i] ^ m[i]) + } + return uint64(cnt) +} diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go b/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go new file mode 100644 index 0000000000..116e044407 --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.go @@ -0,0 +1,68 @@ +//go:build !go1.9 && amd64 && !appengine +// +build !go1.9,amd64,!appengine + +package bitset + +// *** the following functions are defined in popcnt_amd64.s + +//go:noescape + +func hasAsm() bool + +// useAsm is a flag used to select the GO or ASM implementation of the popcnt function +var useAsm = hasAsm() + +//go:noescape + +func 
popcntSliceAsm(s []uint64) uint64 + +//go:noescape + +func popcntMaskSliceAsm(s, m []uint64) uint64 + +//go:noescape + +func popcntAndSliceAsm(s, m []uint64) uint64 + +//go:noescape + +func popcntOrSliceAsm(s, m []uint64) uint64 + +//go:noescape + +func popcntXorSliceAsm(s, m []uint64) uint64 + +func popcntSlice(s []uint64) uint64 { + if useAsm { + return popcntSliceAsm(s) + } + return popcntSliceGo(s) +} + +func popcntMaskSlice(s, m []uint64) uint64 { + if useAsm { + return popcntMaskSliceAsm(s, m) + } + return popcntMaskSliceGo(s, m) +} + +func popcntAndSlice(s, m []uint64) uint64 { + if useAsm { + return popcntAndSliceAsm(s, m) + } + return popcntAndSliceGo(s, m) +} + +func popcntOrSlice(s, m []uint64) uint64 { + if useAsm { + return popcntOrSliceAsm(s, m) + } + return popcntOrSliceGo(s, m) +} + +func popcntXorSlice(s, m []uint64) uint64 { + if useAsm { + return popcntXorSliceAsm(s, m) + } + return popcntXorSliceGo(s, m) +} diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.s b/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.s new file mode 100644 index 0000000000..666c0dcc17 --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/popcnt_amd64.s @@ -0,0 +1,104 @@ +// +build !go1.9 +// +build amd64,!appengine + +TEXT ·hasAsm(SB),4,$0-1 +MOVQ $1, AX +CPUID +SHRQ $23, CX +ANDQ $1, CX +MOVB CX, ret+0(FP) +RET + +#define POPCNTQ_DX_DX BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0xd2 + +TEXT ·popcntSliceAsm(SB),4,$0-32 +XORQ AX, AX +MOVQ s+0(FP), SI +MOVQ s_len+8(FP), CX +TESTQ CX, CX +JZ popcntSliceEnd +popcntSliceLoop: +BYTE $0xf3; BYTE $0x48; BYTE $0x0f; BYTE $0xb8; BYTE $0x16 // POPCNTQ (SI), DX +ADDQ DX, AX +ADDQ $8, SI +LOOP popcntSliceLoop +popcntSliceEnd: +MOVQ AX, ret+24(FP) +RET + +TEXT ·popcntMaskSliceAsm(SB),4,$0-56 +XORQ AX, AX +MOVQ s+0(FP), SI +MOVQ s_len+8(FP), CX +TESTQ CX, CX +JZ popcntMaskSliceEnd +MOVQ m+24(FP), DI +popcntMaskSliceLoop: +MOVQ (DI), DX +NOTQ DX +ANDQ (SI), DX +POPCNTQ_DX_DX +ADDQ DX, AX +ADDQ $8, 
SI +ADDQ $8, DI +LOOP popcntMaskSliceLoop +popcntMaskSliceEnd: +MOVQ AX, ret+48(FP) +RET + +TEXT ·popcntAndSliceAsm(SB),4,$0-56 +XORQ AX, AX +MOVQ s+0(FP), SI +MOVQ s_len+8(FP), CX +TESTQ CX, CX +JZ popcntAndSliceEnd +MOVQ m+24(FP), DI +popcntAndSliceLoop: +MOVQ (DI), DX +ANDQ (SI), DX +POPCNTQ_DX_DX +ADDQ DX, AX +ADDQ $8, SI +ADDQ $8, DI +LOOP popcntAndSliceLoop +popcntAndSliceEnd: +MOVQ AX, ret+48(FP) +RET + +TEXT ·popcntOrSliceAsm(SB),4,$0-56 +XORQ AX, AX +MOVQ s+0(FP), SI +MOVQ s_len+8(FP), CX +TESTQ CX, CX +JZ popcntOrSliceEnd +MOVQ m+24(FP), DI +popcntOrSliceLoop: +MOVQ (DI), DX +ORQ (SI), DX +POPCNTQ_DX_DX +ADDQ DX, AX +ADDQ $8, SI +ADDQ $8, DI +LOOP popcntOrSliceLoop +popcntOrSliceEnd: +MOVQ AX, ret+48(FP) +RET + +TEXT ·popcntXorSliceAsm(SB),4,$0-56 +XORQ AX, AX +MOVQ s+0(FP), SI +MOVQ s_len+8(FP), CX +TESTQ CX, CX +JZ popcntXorSliceEnd +MOVQ m+24(FP), DI +popcntXorSliceLoop: +MOVQ (DI), DX +XORQ (SI), DX +POPCNTQ_DX_DX +ADDQ DX, AX +ADDQ $8, SI +ADDQ $8, DI +LOOP popcntXorSliceLoop +popcntXorSliceEnd: +MOVQ AX, ret+48(FP) +RET diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go b/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go new file mode 100644 index 0000000000..9e0ad464e0 --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/popcnt_generic.go @@ -0,0 +1,25 @@ +//go:build !go1.9 && (!amd64 || appengine) +// +build !go1.9 +// +build !amd64 appengine + +package bitset + +func popcntSlice(s []uint64) uint64 { + return popcntSliceGo(s) +} + +func popcntMaskSlice(s, m []uint64) uint64 { + return popcntMaskSliceGo(s, m) +} + +func popcntAndSlice(s, m []uint64) uint64 { + return popcntAndSliceGo(s, m) +} + +func popcntOrSlice(s, m []uint64) uint64 { + return popcntOrSliceGo(s, m) +} + +func popcntXorSlice(s, m []uint64) uint64 { + return popcntXorSliceGo(s, m) +} diff --git a/vendor/github.com/bits-and-blooms/bitset/select.go b/vendor/github.com/bits-and-blooms/bitset/select.go new file mode 100644 index 
0000000000..f15e74a2c9 --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/select.go @@ -0,0 +1,45 @@ +package bitset + +func select64(w uint64, j uint) uint { + seen := 0 + // Divide 64bit + part := w & 0xFFFFFFFF + n := uint(popcount(part)) + if n <= j { + part = w >> 32 + seen += 32 + j -= n + } + ww := part + + // Divide 32bit + part = ww & 0xFFFF + + n = uint(popcount(part)) + if n <= j { + part = ww >> 16 + seen += 16 + j -= n + } + ww = part + + // Divide 16bit + part = ww & 0xFF + n = uint(popcount(part)) + if n <= j { + part = ww >> 8 + seen += 8 + j -= n + } + ww = part + + // Lookup in final byte + counter := 0 + for ; counter < 8; counter++ { + j -= uint((ww >> counter) & 1) + if j+1 == 0 { + break + } + } + return uint(seen + counter) +} diff --git a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go b/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go new file mode 100644 index 0000000000..12336e76af --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_18.go @@ -0,0 +1,15 @@ +//go:build !go1.9 +// +build !go1.9 + +package bitset + +var deBruijn = [...]byte{ + 0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4, + 62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5, + 63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11, + 54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6, +} + +func trailingZeroes64(v uint64) uint { + return uint(deBruijn[((v&-v)*0x03f79d71b4ca8b09)>>58]) +} diff --git a/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go b/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go new file mode 100644 index 0000000000..cfb0a84091 --- /dev/null +++ b/vendor/github.com/bits-and-blooms/bitset/trailing_zeros_19.go @@ -0,0 +1,10 @@ +//go:build go1.9 +// +build go1.9 + +package bitset + +import "math/bits" + +func trailingZeroes64(v uint64) uint { + return uint(bits.TrailingZeros64(v)) +} diff --git 
a/vendor/github.com/pmezard/go-difflib/LICENSE b/vendor/github.com/pmezard/go-difflib/LICENSE new file mode 100644 index 0000000000..c67dad612a --- /dev/null +++ b/vendor/github.com/pmezard/go-difflib/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2013, Patrick Mezard +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + The names of its contributors may not be used to endorse or promote +products derived from this software without specific prior written +permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/vendor/github.com/pmezard/go-difflib/difflib/difflib.go b/vendor/github.com/pmezard/go-difflib/difflib/difflib.go new file mode 100644 index 0000000000..003e99fadb --- /dev/null +++ b/vendor/github.com/pmezard/go-difflib/difflib/difflib.go @@ -0,0 +1,772 @@ +// Package difflib is a partial port of Python difflib module. +// +// It provides tools to compare sequences of strings and generate textual diffs. +// +// The following class and functions have been ported: +// +// - SequenceMatcher +// +// - unified_diff +// +// - context_diff +// +// Getting unified diffs was the main goal of the port. Keep in mind this code +// is mostly suitable to output text differences in a human friendly way, there +// are no guarantees generated diffs are consumable by patch(1). +package difflib + +import ( + "bufio" + "bytes" + "fmt" + "io" + "strings" +) + +func min(a, b int) int { + if a < b { + return a + } + return b +} + +func max(a, b int) int { + if a > b { + return a + } + return b +} + +func calculateRatio(matches, length int) float64 { + if length > 0 { + return 2.0 * float64(matches) / float64(length) + } + return 1.0 +} + +type Match struct { + A int + B int + Size int +} + +type OpCode struct { + Tag byte + I1 int + I2 int + J1 int + J2 int +} + +// SequenceMatcher compares sequence of strings. The basic +// algorithm predates, and is a little fancier than, an algorithm +// published in the late 1980's by Ratcliff and Obershelp under the +// hyperbolic name "gestalt pattern matching". The basic idea is to find +// the longest contiguous matching subsequence that contains no "junk" +// elements (R-O doesn't address junk). The same idea is then applied +// recursively to the pieces of the sequences to the left and to the right +// of the matching subsequence. This does not yield minimal edit +// sequences, but does tend to yield matches that "look right" to people. +// +// SequenceMatcher tries to compute a "human-friendly diff" between two +// sequences. 
Unlike e.g. UNIX(tm) diff, the fundamental notion is the +// longest *contiguous* & junk-free matching subsequence. That's what +// catches peoples' eyes. The Windows(tm) windiff has another interesting +// notion, pairing up elements that appear uniquely in each sequence. +// That, and the method here, appear to yield more intuitive difference +// reports than does diff. This method appears to be the least vulnerable +// to synching up on blocks of "junk lines", though (like blank lines in +// ordinary text files, or maybe "

" lines in HTML files). That may be +// because this is the only method of the 3 that has a *concept* of +// "junk" . +// +// Timing: Basic R-O is cubic time worst case and quadratic time expected +// case. SequenceMatcher is quadratic time for the worst case and has +// expected-case behavior dependent in a complicated way on how many +// elements the sequences have in common; best case time is linear. +type SequenceMatcher struct { + a []string + b []string + b2j map[string][]int + IsJunk func(string) bool + autoJunk bool + bJunk map[string]struct{} + matchingBlocks []Match + fullBCount map[string]int + bPopular map[string]struct{} + opCodes []OpCode +} + +func NewMatcher(a, b []string) *SequenceMatcher { + m := SequenceMatcher{autoJunk: true} + m.SetSeqs(a, b) + return &m +} + +func NewMatcherWithJunk(a, b []string, autoJunk bool, + isJunk func(string) bool) *SequenceMatcher { + + m := SequenceMatcher{IsJunk: isJunk, autoJunk: autoJunk} + m.SetSeqs(a, b) + return &m +} + +// Set two sequences to be compared. +func (m *SequenceMatcher) SetSeqs(a, b []string) { + m.SetSeq1(a) + m.SetSeq2(b) +} + +// Set the first sequence to be compared. The second sequence to be compared is +// not changed. +// +// SequenceMatcher computes and caches detailed information about the second +// sequence, so if you want to compare one sequence S against many sequences, +// use .SetSeq2(s) once and call .SetSeq1(x) repeatedly for each of the other +// sequences. +// +// See also SetSeqs() and SetSeq2(). +func (m *SequenceMatcher) SetSeq1(a []string) { + if &a == &m.a { + return + } + m.a = a + m.matchingBlocks = nil + m.opCodes = nil +} + +// Set the second sequence to be compared. The first sequence to be compared is +// not changed. 
+func (m *SequenceMatcher) SetSeq2(b []string) { + if &b == &m.b { + return + } + m.b = b + m.matchingBlocks = nil + m.opCodes = nil + m.fullBCount = nil + m.chainB() +} + +func (m *SequenceMatcher) chainB() { + // Populate line -> index mapping + b2j := map[string][]int{} + for i, s := range m.b { + indices := b2j[s] + indices = append(indices, i) + b2j[s] = indices + } + + // Purge junk elements + m.bJunk = map[string]struct{}{} + if m.IsJunk != nil { + junk := m.bJunk + for s, _ := range b2j { + if m.IsJunk(s) { + junk[s] = struct{}{} + } + } + for s, _ := range junk { + delete(b2j, s) + } + } + + // Purge remaining popular elements + popular := map[string]struct{}{} + n := len(m.b) + if m.autoJunk && n >= 200 { + ntest := n/100 + 1 + for s, indices := range b2j { + if len(indices) > ntest { + popular[s] = struct{}{} + } + } + for s, _ := range popular { + delete(b2j, s) + } + } + m.bPopular = popular + m.b2j = b2j +} + +func (m *SequenceMatcher) isBJunk(s string) bool { + _, ok := m.bJunk[s] + return ok +} + +// Find longest matching block in a[alo:ahi] and b[blo:bhi]. +// +// If IsJunk is not defined: +// +// Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where +// alo <= i <= i+k <= ahi +// blo <= j <= j+k <= bhi +// and for all (i',j',k') meeting those conditions, +// k >= k' +// i <= i' +// and if i == i', j <= j' +// +// In other words, of all maximal matching blocks, return one that +// starts earliest in a, and of all those maximal matching blocks that +// start earliest in a, return the one that starts earliest in b. +// +// If IsJunk is defined, first the longest matching block is +// determined as above, but with the additional restriction that no +// junk element appears in the block. Then that block is extended as +// far as possible by matching (only) junk elements on both sides. So +// the resulting block never matches on junk except as identical junk +// happens to be adjacent to an "interesting" match. 
+// +// If no blocks match, return (alo, blo, 0). +func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match { + // CAUTION: stripping common prefix or suffix would be incorrect. + // E.g., + // ab + // acab + // Longest matching block is "ab", but if common prefix is + // stripped, it's "a" (tied with "b"). UNIX(tm) diff does so + // strip, so ends up claiming that ab is changed to acab by + // inserting "ca" in the middle. That's minimal but unintuitive: + // "it's obvious" that someone inserted "ac" at the front. + // Windiff ends up at the same place as diff, but by pairing up + // the unique 'b's and then matching the first two 'a's. + besti, bestj, bestsize := alo, blo, 0 + + // find longest junk-free match + // during an iteration of the loop, j2len[j] = length of longest + // junk-free match ending with a[i-1] and b[j] + j2len := map[int]int{} + for i := alo; i != ahi; i++ { + // look at all instances of a[i] in b; note that because + // b2j has no junk keys, the loop is skipped if a[i] is junk + newj2len := map[int]int{} + for _, j := range m.b2j[m.a[i]] { + // a[i] matches b[j] + if j < blo { + continue + } + if j >= bhi { + break + } + k := j2len[j-1] + 1 + newj2len[j] = k + if k > bestsize { + besti, bestj, bestsize = i-k+1, j-k+1, k + } + } + j2len = newj2len + } + + // Extend the best by non-junk elements on each end. In particular, + // "popular" non-junk elements aren't in b2j, which greatly speeds + // the inner loop above, but also means "the best" match so far + // doesn't contain any junk *or* popular non-junk elements. 
+ for besti > alo && bestj > blo && !m.isBJunk(m.b[bestj-1]) && + m.a[besti-1] == m.b[bestj-1] { + besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 + } + for besti+bestsize < ahi && bestj+bestsize < bhi && + !m.isBJunk(m.b[bestj+bestsize]) && + m.a[besti+bestsize] == m.b[bestj+bestsize] { + bestsize += 1 + } + + // Now that we have a wholly interesting match (albeit possibly + // empty!), we may as well suck up the matching junk on each + // side of it too. Can't think of a good reason not to, and it + // saves post-processing the (possibly considerable) expense of + // figuring out what to do with it. In the case of an empty + // interesting match, this is clearly the right thing to do, + // because no other kind of match is possible in the regions. + for besti > alo && bestj > blo && m.isBJunk(m.b[bestj-1]) && + m.a[besti-1] == m.b[bestj-1] { + besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 + } + for besti+bestsize < ahi && bestj+bestsize < bhi && + m.isBJunk(m.b[bestj+bestsize]) && + m.a[besti+bestsize] == m.b[bestj+bestsize] { + bestsize += 1 + } + + return Match{A: besti, B: bestj, Size: bestsize} +} + +// Return list of triples describing matching subsequences. +// +// Each triple is of the form (i, j, n), and means that +// a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in +// i and in j. It's also guaranteed that if (i, j, n) and (i', j', n') are +// adjacent triples in the list, and the second is not the last triple in the +// list, then i+n != i' or j+n != j'. IOW, adjacent triples never describe +// adjacent equal blocks. +// +// The last triple is a dummy, (len(a), len(b), 0), and is the only +// triple with n==0. 
+func (m *SequenceMatcher) GetMatchingBlocks() []Match { + if m.matchingBlocks != nil { + return m.matchingBlocks + } + + var matchBlocks func(alo, ahi, blo, bhi int, matched []Match) []Match + matchBlocks = func(alo, ahi, blo, bhi int, matched []Match) []Match { + match := m.findLongestMatch(alo, ahi, blo, bhi) + i, j, k := match.A, match.B, match.Size + if match.Size > 0 { + if alo < i && blo < j { + matched = matchBlocks(alo, i, blo, j, matched) + } + matched = append(matched, match) + if i+k < ahi && j+k < bhi { + matched = matchBlocks(i+k, ahi, j+k, bhi, matched) + } + } + return matched + } + matched := matchBlocks(0, len(m.a), 0, len(m.b), nil) + + // It's possible that we have adjacent equal blocks in the + // matching_blocks list now. + nonAdjacent := []Match{} + i1, j1, k1 := 0, 0, 0 + for _, b := range matched { + // Is this block adjacent to i1, j1, k1? + i2, j2, k2 := b.A, b.B, b.Size + if i1+k1 == i2 && j1+k1 == j2 { + // Yes, so collapse them -- this just increases the length of + // the first block by the length of the second, and the first + // block so lengthened remains the block to compare against. + k1 += k2 + } else { + // Not adjacent. Remember the first block (k1==0 means it's + // the dummy we started with), and make the second block the + // new block to compare against. + if k1 > 0 { + nonAdjacent = append(nonAdjacent, Match{i1, j1, k1}) + } + i1, j1, k1 = i2, j2, k2 + } + } + if k1 > 0 { + nonAdjacent = append(nonAdjacent, Match{i1, j1, k1}) + } + + nonAdjacent = append(nonAdjacent, Match{len(m.a), len(m.b), 0}) + m.matchingBlocks = nonAdjacent + return m.matchingBlocks +} + +// Return list of 5-tuples describing how to turn a into b. +// +// Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple +// has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the +// tuple preceding it, and likewise for j1 == the previous j2. 
+// +// The tags are characters, with these meanings: +// +// 'r' (replace): a[i1:i2] should be replaced by b[j1:j2] +// +// 'd' (delete): a[i1:i2] should be deleted, j1==j2 in this case. +// +// 'i' (insert): b[j1:j2] should be inserted at a[i1:i1], i1==i2 in this case. +// +// 'e' (equal): a[i1:i2] == b[j1:j2] +func (m *SequenceMatcher) GetOpCodes() []OpCode { + if m.opCodes != nil { + return m.opCodes + } + i, j := 0, 0 + matching := m.GetMatchingBlocks() + opCodes := make([]OpCode, 0, len(matching)) + for _, m := range matching { + // invariant: we've pumped out correct diffs to change + // a[:i] into b[:j], and the next matching block is + // a[ai:ai+size] == b[bj:bj+size]. So we need to pump + // out a diff to change a[i:ai] into b[j:bj], pump out + // the matching block, and move (i,j) beyond the match + ai, bj, size := m.A, m.B, m.Size + tag := byte(0) + if i < ai && j < bj { + tag = 'r' + } else if i < ai { + tag = 'd' + } else if j < bj { + tag = 'i' + } + if tag > 0 { + opCodes = append(opCodes, OpCode{tag, i, ai, j, bj}) + } + i, j = ai+size, bj+size + // the list of matching blocks is terminated by a + // sentinel with size 0 + if size > 0 { + opCodes = append(opCodes, OpCode{'e', ai, i, bj, j}) + } + } + m.opCodes = opCodes + return m.opCodes +} + +// Isolate change clusters by eliminating ranges with no changes. +// +// Return a generator of groups with up to n lines of context. +// Each group is in the same format as returned by GetOpCodes(). +func (m *SequenceMatcher) GetGroupedOpCodes(n int) [][]OpCode { + if n < 0 { + n = 3 + } + codes := m.GetOpCodes() + if len(codes) == 0 { + codes = []OpCode{OpCode{'e', 0, 1, 0, 1}} + } + // Fixup leading and trailing groups if they show no changes. 
+ if codes[0].Tag == 'e' { + c := codes[0] + i1, i2, j1, j2 := c.I1, c.I2, c.J1, c.J2 + codes[0] = OpCode{c.Tag, max(i1, i2-n), i2, max(j1, j2-n), j2} + } + if codes[len(codes)-1].Tag == 'e' { + c := codes[len(codes)-1] + i1, i2, j1, j2 := c.I1, c.I2, c.J1, c.J2 + codes[len(codes)-1] = OpCode{c.Tag, i1, min(i2, i1+n), j1, min(j2, j1+n)} + } + nn := n + n + groups := [][]OpCode{} + group := []OpCode{} + for _, c := range codes { + i1, i2, j1, j2 := c.I1, c.I2, c.J1, c.J2 + // End the current group and start a new one whenever + // there is a large range with no changes. + if c.Tag == 'e' && i2-i1 > nn { + group = append(group, OpCode{c.Tag, i1, min(i2, i1+n), + j1, min(j2, j1+n)}) + groups = append(groups, group) + group = []OpCode{} + i1, j1 = max(i1, i2-n), max(j1, j2-n) + } + group = append(group, OpCode{c.Tag, i1, i2, j1, j2}) + } + if len(group) > 0 && !(len(group) == 1 && group[0].Tag == 'e') { + groups = append(groups, group) + } + return groups +} + +// Return a measure of the sequences' similarity (float in [0,1]). +// +// Where T is the total number of elements in both sequences, and +// M is the number of matches, this is 2.0*M / T. +// Note that this is 1 if the sequences are identical, and 0 if +// they have nothing in common. +// +// .Ratio() is expensive to compute if you haven't already computed +// .GetMatchingBlocks() or .GetOpCodes(), in which case you may +// want to try .QuickRatio() or .RealQuickRation() first to get an +// upper bound. +func (m *SequenceMatcher) Ratio() float64 { + matches := 0 + for _, m := range m.GetMatchingBlocks() { + matches += m.Size + } + return calculateRatio(matches, len(m.a)+len(m.b)) +} + +// Return an upper bound on ratio() relatively quickly. +// +// This isn't defined beyond that it is an upper bound on .Ratio(), and +// is faster to compute. 
+func (m *SequenceMatcher) QuickRatio() float64 { + // viewing a and b as multisets, set matches to the cardinality + // of their intersection; this counts the number of matches + // without regard to order, so is clearly an upper bound + if m.fullBCount == nil { + m.fullBCount = map[string]int{} + for _, s := range m.b { + m.fullBCount[s] = m.fullBCount[s] + 1 + } + } + + // avail[x] is the number of times x appears in 'b' less the + // number of times we've seen it in 'a' so far ... kinda + avail := map[string]int{} + matches := 0 + for _, s := range m.a { + n, ok := avail[s] + if !ok { + n = m.fullBCount[s] + } + avail[s] = n - 1 + if n > 0 { + matches += 1 + } + } + return calculateRatio(matches, len(m.a)+len(m.b)) +} + +// Return an upper bound on ratio() very quickly. +// +// This isn't defined beyond that it is an upper bound on .Ratio(), and +// is faster to compute than either .Ratio() or .QuickRatio(). +func (m *SequenceMatcher) RealQuickRatio() float64 { + la, lb := len(m.a), len(m.b) + return calculateRatio(min(la, lb), la+lb) +} + +// Convert range to the "ed" format +func formatRangeUnified(start, stop int) string { + // Per the diff spec at http://www.unix.org/single_unix_specification/ + beginning := start + 1 // lines start numbering with one + length := stop - start + if length == 1 { + return fmt.Sprintf("%d", beginning) + } + if length == 0 { + beginning -= 1 // empty ranges begin at line just before the range + } + return fmt.Sprintf("%d,%d", beginning, length) +} + +// Unified diff parameters +type UnifiedDiff struct { + A []string // First sequence lines + FromFile string // First file name + FromDate string // First file time + B []string // Second sequence lines + ToFile string // Second file name + ToDate string // Second file time + Eol string // Headers end of line, defaults to LF + Context int // Number of context lines +} + +// Compare two sequences of lines; generate the delta as a unified diff. 
+// +// Unified diffs are a compact way of showing line changes and a few +// lines of context. The number of context lines is set by 'n' which +// defaults to three. +// +// By default, the diff control lines (those with ---, +++, or @@) are +// created with a trailing newline. This is helpful so that inputs +// created from file.readlines() result in diffs that are suitable for +// file.writelines() since both the inputs and outputs have trailing +// newlines. +// +// For inputs that do not have trailing newlines, set the lineterm +// argument to "" so that the output will be uniformly newline free. +// +// The unidiff format normally has a header for filenames and modification +// times. Any or all of these may be specified using strings for +// 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. +// The modification times are normally expressed in the ISO 8601 format. +func WriteUnifiedDiff(writer io.Writer, diff UnifiedDiff) error { + buf := bufio.NewWriter(writer) + defer buf.Flush() + wf := func(format string, args ...interface{}) error { + _, err := buf.WriteString(fmt.Sprintf(format, args...)) + return err + } + ws := func(s string) error { + _, err := buf.WriteString(s) + return err + } + + if len(diff.Eol) == 0 { + diff.Eol = "\n" + } + + started := false + m := NewMatcher(diff.A, diff.B) + for _, g := range m.GetGroupedOpCodes(diff.Context) { + if !started { + started = true + fromDate := "" + if len(diff.FromDate) > 0 { + fromDate = "\t" + diff.FromDate + } + toDate := "" + if len(diff.ToDate) > 0 { + toDate = "\t" + diff.ToDate + } + if diff.FromFile != "" || diff.ToFile != "" { + err := wf("--- %s%s%s", diff.FromFile, fromDate, diff.Eol) + if err != nil { + return err + } + err = wf("+++ %s%s%s", diff.ToFile, toDate, diff.Eol) + if err != nil { + return err + } + } + } + first, last := g[0], g[len(g)-1] + range1 := formatRangeUnified(first.I1, last.I2) + range2 := formatRangeUnified(first.J1, last.J2) + if err := wf("@@ -%s +%s @@%s", range1, 
range2, diff.Eol); err != nil { + return err + } + for _, c := range g { + i1, i2, j1, j2 := c.I1, c.I2, c.J1, c.J2 + if c.Tag == 'e' { + for _, line := range diff.A[i1:i2] { + if err := ws(" " + line); err != nil { + return err + } + } + continue + } + if c.Tag == 'r' || c.Tag == 'd' { + for _, line := range diff.A[i1:i2] { + if err := ws("-" + line); err != nil { + return err + } + } + } + if c.Tag == 'r' || c.Tag == 'i' { + for _, line := range diff.B[j1:j2] { + if err := ws("+" + line); err != nil { + return err + } + } + } + } + } + return nil +} + +// Like WriteUnifiedDiff but returns the diff a string. +func GetUnifiedDiffString(diff UnifiedDiff) (string, error) { + w := &bytes.Buffer{} + err := WriteUnifiedDiff(w, diff) + return string(w.Bytes()), err +} + +// Convert range to the "ed" format. +func formatRangeContext(start, stop int) string { + // Per the diff spec at http://www.unix.org/single_unix_specification/ + beginning := start + 1 // lines start numbering with one + length := stop - start + if length == 0 { + beginning -= 1 // empty ranges begin at line just before the range + } + if length <= 1 { + return fmt.Sprintf("%d", beginning) + } + return fmt.Sprintf("%d,%d", beginning, beginning+length-1) +} + +type ContextDiff UnifiedDiff + +// Compare two sequences of lines; generate the delta as a context diff. +// +// Context diffs are a compact way of showing line changes and a few +// lines of context. The number of context lines is set by diff.Context +// which defaults to three. +// +// By default, the diff control lines (those with *** or ---) are +// created with a trailing newline. +// +// For inputs that do not have trailing newlines, set the diff.Eol +// argument to "" so that the output will be uniformly newline free. +// +// The context diff format normally has a header for filenames and +// modification times. Any or all of these may be specified using +// strings for diff.FromFile, diff.ToFile, diff.FromDate, diff.ToDate. 
+// The modification times are normally expressed in the ISO 8601 format. +// If not specified, the strings default to blanks. +func WriteContextDiff(writer io.Writer, diff ContextDiff) error { + buf := bufio.NewWriter(writer) + defer buf.Flush() + var diffErr error + wf := func(format string, args ...interface{}) { + _, err := buf.WriteString(fmt.Sprintf(format, args...)) + if diffErr == nil && err != nil { + diffErr = err + } + } + ws := func(s string) { + _, err := buf.WriteString(s) + if diffErr == nil && err != nil { + diffErr = err + } + } + + if len(diff.Eol) == 0 { + diff.Eol = "\n" + } + + prefix := map[byte]string{ + 'i': "+ ", + 'd': "- ", + 'r': "! ", + 'e': " ", + } + + started := false + m := NewMatcher(diff.A, diff.B) + for _, g := range m.GetGroupedOpCodes(diff.Context) { + if !started { + started = true + fromDate := "" + if len(diff.FromDate) > 0 { + fromDate = "\t" + diff.FromDate + } + toDate := "" + if len(diff.ToDate) > 0 { + toDate = "\t" + diff.ToDate + } + if diff.FromFile != "" || diff.ToFile != "" { + wf("*** %s%s%s", diff.FromFile, fromDate, diff.Eol) + wf("--- %s%s%s", diff.ToFile, toDate, diff.Eol) + } + } + + first, last := g[0], g[len(g)-1] + ws("***************" + diff.Eol) + + range1 := formatRangeContext(first.I1, last.I2) + wf("*** %s ****%s", range1, diff.Eol) + for _, c := range g { + if c.Tag == 'r' || c.Tag == 'd' { + for _, cc := range g { + if cc.Tag == 'i' { + continue + } + for _, line := range diff.A[cc.I1:cc.I2] { + ws(prefix[cc.Tag] + line) + } + } + break + } + } + + range2 := formatRangeContext(first.J1, last.J2) + wf("--- %s ----%s", range2, diff.Eol) + for _, c := range g { + if c.Tag == 'r' || c.Tag == 'i' { + for _, cc := range g { + if cc.Tag == 'd' { + continue + } + for _, line := range diff.B[cc.J1:cc.J2] { + ws(prefix[cc.Tag] + line) + } + } + break + } + } + } + return diffErr +} + +// Like WriteContextDiff but returns the diff a string. 
+func GetContextDiffString(diff ContextDiff) (string, error) { + w := &bytes.Buffer{} + err := WriteContextDiff(w, diff) + return string(w.Bytes()), err +} + +// Split a string on "\n" while preserving them. The output can be used +// as input for UnifiedDiff and ContextDiff structures. +func SplitLines(s string) []string { + lines := strings.SplitAfter(s, "\n") + lines[len(lines)-1] += "\n" + return lines +} diff --git a/vendor/github.com/stretchr/testify/LICENSE b/vendor/github.com/stretchr/testify/LICENSE new file mode 100644 index 0000000000..4b0421cf9e --- /dev/null +++ b/vendor/github.com/stretchr/testify/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2012-2020 Mat Ryer, Tyler Bunnell and contributors. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/vendor/github.com/stretchr/testify/assert/assertion_compare.go b/vendor/github.com/stretchr/testify/assert/assertion_compare.go new file mode 100644 index 0000000000..b774da88d8 --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/assertion_compare.go @@ -0,0 +1,458 @@ +package assert + +import ( + "bytes" + "fmt" + "reflect" + "time" +) + +type CompareType int + +const ( + compareLess CompareType = iota - 1 + compareEqual + compareGreater +) + +var ( + intType = reflect.TypeOf(int(1)) + int8Type = reflect.TypeOf(int8(1)) + int16Type = reflect.TypeOf(int16(1)) + int32Type = reflect.TypeOf(int32(1)) + int64Type = reflect.TypeOf(int64(1)) + + uintType = reflect.TypeOf(uint(1)) + uint8Type = reflect.TypeOf(uint8(1)) + uint16Type = reflect.TypeOf(uint16(1)) + uint32Type = reflect.TypeOf(uint32(1)) + uint64Type = reflect.TypeOf(uint64(1)) + + float32Type = reflect.TypeOf(float32(1)) + float64Type = reflect.TypeOf(float64(1)) + + stringType = reflect.TypeOf("") + + timeType = reflect.TypeOf(time.Time{}) + bytesType = reflect.TypeOf([]byte{}) +) + +func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) { + obj1Value := reflect.ValueOf(obj1) + obj2Value := reflect.ValueOf(obj2) + + // throughout this switch we try and avoid calling .Convert() if possible, + // as this has a pretty big performance impact + switch kind { + case reflect.Int: + { + intobj1, ok := obj1.(int) + if !ok { + intobj1 = obj1Value.Convert(intType).Interface().(int) + } + intobj2, ok := obj2.(int) + if !ok { + intobj2 = obj2Value.Convert(intType).Interface().(int) + } + if intobj1 > intobj2 { + return compareGreater, true + } + if intobj1 == intobj2 { + return compareEqual, true + } + if intobj1 < intobj2 { + return compareLess, true + } + } + case reflect.Int8: + { + int8obj1, ok := obj1.(int8) + if !ok { + int8obj1 = obj1Value.Convert(int8Type).Interface().(int8) + } + int8obj2, ok := obj2.(int8) + if !ok { + int8obj2 = 
obj2Value.Convert(int8Type).Interface().(int8) + } + if int8obj1 > int8obj2 { + return compareGreater, true + } + if int8obj1 == int8obj2 { + return compareEqual, true + } + if int8obj1 < int8obj2 { + return compareLess, true + } + } + case reflect.Int16: + { + int16obj1, ok := obj1.(int16) + if !ok { + int16obj1 = obj1Value.Convert(int16Type).Interface().(int16) + } + int16obj2, ok := obj2.(int16) + if !ok { + int16obj2 = obj2Value.Convert(int16Type).Interface().(int16) + } + if int16obj1 > int16obj2 { + return compareGreater, true + } + if int16obj1 == int16obj2 { + return compareEqual, true + } + if int16obj1 < int16obj2 { + return compareLess, true + } + } + case reflect.Int32: + { + int32obj1, ok := obj1.(int32) + if !ok { + int32obj1 = obj1Value.Convert(int32Type).Interface().(int32) + } + int32obj2, ok := obj2.(int32) + if !ok { + int32obj2 = obj2Value.Convert(int32Type).Interface().(int32) + } + if int32obj1 > int32obj2 { + return compareGreater, true + } + if int32obj1 == int32obj2 { + return compareEqual, true + } + if int32obj1 < int32obj2 { + return compareLess, true + } + } + case reflect.Int64: + { + int64obj1, ok := obj1.(int64) + if !ok { + int64obj1 = obj1Value.Convert(int64Type).Interface().(int64) + } + int64obj2, ok := obj2.(int64) + if !ok { + int64obj2 = obj2Value.Convert(int64Type).Interface().(int64) + } + if int64obj1 > int64obj2 { + return compareGreater, true + } + if int64obj1 == int64obj2 { + return compareEqual, true + } + if int64obj1 < int64obj2 { + return compareLess, true + } + } + case reflect.Uint: + { + uintobj1, ok := obj1.(uint) + if !ok { + uintobj1 = obj1Value.Convert(uintType).Interface().(uint) + } + uintobj2, ok := obj2.(uint) + if !ok { + uintobj2 = obj2Value.Convert(uintType).Interface().(uint) + } + if uintobj1 > uintobj2 { + return compareGreater, true + } + if uintobj1 == uintobj2 { + return compareEqual, true + } + if uintobj1 < uintobj2 { + return compareLess, true + } + } + case reflect.Uint8: + { + uint8obj1, ok 
:= obj1.(uint8) + if !ok { + uint8obj1 = obj1Value.Convert(uint8Type).Interface().(uint8) + } + uint8obj2, ok := obj2.(uint8) + if !ok { + uint8obj2 = obj2Value.Convert(uint8Type).Interface().(uint8) + } + if uint8obj1 > uint8obj2 { + return compareGreater, true + } + if uint8obj1 == uint8obj2 { + return compareEqual, true + } + if uint8obj1 < uint8obj2 { + return compareLess, true + } + } + case reflect.Uint16: + { + uint16obj1, ok := obj1.(uint16) + if !ok { + uint16obj1 = obj1Value.Convert(uint16Type).Interface().(uint16) + } + uint16obj2, ok := obj2.(uint16) + if !ok { + uint16obj2 = obj2Value.Convert(uint16Type).Interface().(uint16) + } + if uint16obj1 > uint16obj2 { + return compareGreater, true + } + if uint16obj1 == uint16obj2 { + return compareEqual, true + } + if uint16obj1 < uint16obj2 { + return compareLess, true + } + } + case reflect.Uint32: + { + uint32obj1, ok := obj1.(uint32) + if !ok { + uint32obj1 = obj1Value.Convert(uint32Type).Interface().(uint32) + } + uint32obj2, ok := obj2.(uint32) + if !ok { + uint32obj2 = obj2Value.Convert(uint32Type).Interface().(uint32) + } + if uint32obj1 > uint32obj2 { + return compareGreater, true + } + if uint32obj1 == uint32obj2 { + return compareEqual, true + } + if uint32obj1 < uint32obj2 { + return compareLess, true + } + } + case reflect.Uint64: + { + uint64obj1, ok := obj1.(uint64) + if !ok { + uint64obj1 = obj1Value.Convert(uint64Type).Interface().(uint64) + } + uint64obj2, ok := obj2.(uint64) + if !ok { + uint64obj2 = obj2Value.Convert(uint64Type).Interface().(uint64) + } + if uint64obj1 > uint64obj2 { + return compareGreater, true + } + if uint64obj1 == uint64obj2 { + return compareEqual, true + } + if uint64obj1 < uint64obj2 { + return compareLess, true + } + } + case reflect.Float32: + { + float32obj1, ok := obj1.(float32) + if !ok { + float32obj1 = obj1Value.Convert(float32Type).Interface().(float32) + } + float32obj2, ok := obj2.(float32) + if !ok { + float32obj2 = 
obj2Value.Convert(float32Type).Interface().(float32) + } + if float32obj1 > float32obj2 { + return compareGreater, true + } + if float32obj1 == float32obj2 { + return compareEqual, true + } + if float32obj1 < float32obj2 { + return compareLess, true + } + } + case reflect.Float64: + { + float64obj1, ok := obj1.(float64) + if !ok { + float64obj1 = obj1Value.Convert(float64Type).Interface().(float64) + } + float64obj2, ok := obj2.(float64) + if !ok { + float64obj2 = obj2Value.Convert(float64Type).Interface().(float64) + } + if float64obj1 > float64obj2 { + return compareGreater, true + } + if float64obj1 == float64obj2 { + return compareEqual, true + } + if float64obj1 < float64obj2 { + return compareLess, true + } + } + case reflect.String: + { + stringobj1, ok := obj1.(string) + if !ok { + stringobj1 = obj1Value.Convert(stringType).Interface().(string) + } + stringobj2, ok := obj2.(string) + if !ok { + stringobj2 = obj2Value.Convert(stringType).Interface().(string) + } + if stringobj1 > stringobj2 { + return compareGreater, true + } + if stringobj1 == stringobj2 { + return compareEqual, true + } + if stringobj1 < stringobj2 { + return compareLess, true + } + } + // Check for known struct types we can check for compare results. + case reflect.Struct: + { + // All structs enter here. We're not interested in most types. + if !canConvert(obj1Value, timeType) { + break + } + + // time.Time can compared! + timeObj1, ok := obj1.(time.Time) + if !ok { + timeObj1 = obj1Value.Convert(timeType).Interface().(time.Time) + } + + timeObj2, ok := obj2.(time.Time) + if !ok { + timeObj2 = obj2Value.Convert(timeType).Interface().(time.Time) + } + + return compare(timeObj1.UnixNano(), timeObj2.UnixNano(), reflect.Int64) + } + case reflect.Slice: + { + // We only care about the []byte type. + if !canConvert(obj1Value, bytesType) { + break + } + + // []byte can be compared! 
+ bytesObj1, ok := obj1.([]byte) + if !ok { + bytesObj1 = obj1Value.Convert(bytesType).Interface().([]byte) + + } + bytesObj2, ok := obj2.([]byte) + if !ok { + bytesObj2 = obj2Value.Convert(bytesType).Interface().([]byte) + } + + return CompareType(bytes.Compare(bytesObj1, bytesObj2)), true + } + } + + return compareEqual, false +} + +// Greater asserts that the first element is greater than the second +// +// assert.Greater(t, 2, 1) +// assert.Greater(t, float64(2), float64(1)) +// assert.Greater(t, "b", "a") +func Greater(t TestingT, e1 interface{}, e2 interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return compareTwoValues(t, e1, e2, []CompareType{compareGreater}, "\"%v\" is not greater than \"%v\"", msgAndArgs...) +} + +// GreaterOrEqual asserts that the first element is greater than or equal to the second +// +// assert.GreaterOrEqual(t, 2, 1) +// assert.GreaterOrEqual(t, 2, 2) +// assert.GreaterOrEqual(t, "b", "a") +// assert.GreaterOrEqual(t, "b", "b") +func GreaterOrEqual(t TestingT, e1 interface{}, e2 interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return compareTwoValues(t, e1, e2, []CompareType{compareGreater, compareEqual}, "\"%v\" is not greater than or equal to \"%v\"", msgAndArgs...) +} + +// Less asserts that the first element is less than the second +// +// assert.Less(t, 1, 2) +// assert.Less(t, float64(1), float64(2)) +// assert.Less(t, "a", "b") +func Less(t TestingT, e1 interface{}, e2 interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return compareTwoValues(t, e1, e2, []CompareType{compareLess}, "\"%v\" is not less than \"%v\"", msgAndArgs...) 
+} + +// LessOrEqual asserts that the first element is less than or equal to the second +// +// assert.LessOrEqual(t, 1, 2) +// assert.LessOrEqual(t, 2, 2) +// assert.LessOrEqual(t, "a", "b") +// assert.LessOrEqual(t, "b", "b") +func LessOrEqual(t TestingT, e1 interface{}, e2 interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return compareTwoValues(t, e1, e2, []CompareType{compareLess, compareEqual}, "\"%v\" is not less than or equal to \"%v\"", msgAndArgs...) +} + +// Positive asserts that the specified element is positive +// +// assert.Positive(t, 1) +// assert.Positive(t, 1.23) +func Positive(t TestingT, e interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + zero := reflect.Zero(reflect.TypeOf(e)) + return compareTwoValues(t, e, zero.Interface(), []CompareType{compareGreater}, "\"%v\" is not positive", msgAndArgs...) +} + +// Negative asserts that the specified element is negative +// +// assert.Negative(t, -1) +// assert.Negative(t, -1.23) +func Negative(t TestingT, e interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + zero := reflect.Zero(reflect.TypeOf(e)) + return compareTwoValues(t, e, zero.Interface(), []CompareType{compareLess}, "\"%v\" is not negative", msgAndArgs...) +} + +func compareTwoValues(t TestingT, e1 interface{}, e2 interface{}, allowedComparesResults []CompareType, failMessage string, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + e1Kind := reflect.ValueOf(e1).Kind() + e2Kind := reflect.ValueOf(e2).Kind() + if e1Kind != e2Kind { + return Fail(t, "Elements should be the same type", msgAndArgs...) + } + + compareResult, isComparable := compare(e1, e2, e1Kind) + if !isComparable { + return Fail(t, fmt.Sprintf("Can not compare type \"%s\"", reflect.TypeOf(e1)), msgAndArgs...) 
+ } + + if !containsValue(allowedComparesResults, compareResult) { + return Fail(t, fmt.Sprintf(failMessage, e1, e2), msgAndArgs...) + } + + return true +} + +func containsValue(values []CompareType, value CompareType) bool { + for _, v := range values { + if v == value { + return true + } + } + + return false +} diff --git a/vendor/github.com/stretchr/testify/assert/assertion_compare_can_convert.go b/vendor/github.com/stretchr/testify/assert/assertion_compare_can_convert.go new file mode 100644 index 0000000000..da867903e2 --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/assertion_compare_can_convert.go @@ -0,0 +1,16 @@ +//go:build go1.17 +// +build go1.17 + +// TODO: once support for Go 1.16 is dropped, this file can be +// merged/removed with assertion_compare_go1.17_test.go and +// assertion_compare_legacy.go + +package assert + +import "reflect" + +// Wrapper around reflect.Value.CanConvert, for compatibility +// reasons. +func canConvert(value reflect.Value, to reflect.Type) bool { + return value.CanConvert(to) +} diff --git a/vendor/github.com/stretchr/testify/assert/assertion_compare_legacy.go b/vendor/github.com/stretchr/testify/assert/assertion_compare_legacy.go new file mode 100644 index 0000000000..1701af2a3c --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/assertion_compare_legacy.go @@ -0,0 +1,16 @@ +//go:build !go1.17 +// +build !go1.17 + +// TODO: once support for Go 1.16 is dropped, this file can be +// merged/removed with assertion_compare_go1.17_test.go and +// assertion_compare_can_convert.go + +package assert + +import "reflect" + +// Older versions of Go does not have the reflect.Value.CanConvert +// method. 
+func canConvert(value reflect.Value, to reflect.Type) bool { + return false +} diff --git a/vendor/github.com/stretchr/testify/assert/assertion_format.go b/vendor/github.com/stretchr/testify/assert/assertion_format.go new file mode 100644 index 0000000000..84dbd6c790 --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/assertion_format.go @@ -0,0 +1,805 @@ +/* +* CODE GENERATED AUTOMATICALLY WITH github.com/stretchr/testify/_codegen +* THIS FILE MUST NOT BE EDITED BY HAND + */ + +package assert + +import ( + http "net/http" + url "net/url" + time "time" +) + +// Conditionf uses a Comparison to assert a complex condition. +func Conditionf(t TestingT, comp Comparison, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Condition(t, comp, append([]interface{}{msg}, args...)...) +} + +// Containsf asserts that the specified string, list(array, slice...) or map contains the +// specified substring or element. +// +// assert.Containsf(t, "Hello World", "World", "error message %s", "formatted") +// assert.Containsf(t, ["Hello", "World"], "World", "error message %s", "formatted") +// assert.Containsf(t, {"Hello": "World"}, "Hello", "error message %s", "formatted") +func Containsf(t TestingT, s interface{}, contains interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Contains(t, s, contains, append([]interface{}{msg}, args...)...) +} + +// DirExistsf checks whether a directory exists in the given path. It also fails +// if the path is a file rather a directory or there is an error checking whether it exists. +func DirExistsf(t TestingT, path string, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return DirExists(t, path, append([]interface{}{msg}, args...)...) +} + +// ElementsMatchf asserts that the specified listA(array, slice...) is equal to specified +// listB(array, slice...) ignoring the order of the elements. 
If there are duplicate elements, +// the number of appearances of each of them in both lists should match. +// +// assert.ElementsMatchf(t, [1, 3, 2, 3], [1, 3, 3, 2], "error message %s", "formatted") +func ElementsMatchf(t TestingT, listA interface{}, listB interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return ElementsMatch(t, listA, listB, append([]interface{}{msg}, args...)...) +} + +// Emptyf asserts that the specified object is empty. I.e. nil, "", false, 0 or either +// a slice or a channel with len == 0. +// +// assert.Emptyf(t, obj, "error message %s", "formatted") +func Emptyf(t TestingT, object interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Empty(t, object, append([]interface{}{msg}, args...)...) +} + +// Equalf asserts that two objects are equal. +// +// assert.Equalf(t, 123, 123, "error message %s", "formatted") +// +// Pointer variable equality is determined based on the equality of the +// referenced values (as opposed to the memory addresses). Function equality +// cannot be determined and will always fail. +func Equalf(t TestingT, expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Equal(t, expected, actual, append([]interface{}{msg}, args...)...) +} + +// EqualErrorf asserts that a function returned an error (i.e. not `nil`) +// and that it is equal to the provided error. +// +// actualObj, err := SomeFunction() +// assert.EqualErrorf(t, err, expectedErrorString, "error message %s", "formatted") +func EqualErrorf(t TestingT, theError error, errString string, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return EqualError(t, theError, errString, append([]interface{}{msg}, args...)...) 
+} + +// EqualExportedValuesf asserts that the types of two objects are equal and their public +// fields are also equal. This is useful for comparing structs that have private fields +// that could potentially differ. +// +// type S struct { +// Exported int +// notExported int +// } +// assert.EqualExportedValuesf(t, S{1, 2}, S{1, 3}, "error message %s", "formatted") => true +// assert.EqualExportedValuesf(t, S{1, 2}, S{2, 3}, "error message %s", "formatted") => false +func EqualExportedValuesf(t TestingT, expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return EqualExportedValues(t, expected, actual, append([]interface{}{msg}, args...)...) +} + +// EqualValuesf asserts that two objects are equal or convertable to the same types +// and equal. +// +// assert.EqualValuesf(t, uint32(123), int32(123), "error message %s", "formatted") +func EqualValuesf(t TestingT, expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return EqualValues(t, expected, actual, append([]interface{}{msg}, args...)...) +} + +// Errorf asserts that a function returned an error (i.e. not `nil`). +// +// actualObj, err := SomeFunction() +// if assert.Errorf(t, err, "error message %s", "formatted") { +// assert.Equal(t, expectedErrorf, err) +// } +func Errorf(t TestingT, err error, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Error(t, err, append([]interface{}{msg}, args...)...) +} + +// ErrorAsf asserts that at least one of the errors in err's chain matches target, and if so, sets target to that error value. +// This is a wrapper for errors.As. +func ErrorAsf(t TestingT, err error, target interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return ErrorAs(t, err, target, append([]interface{}{msg}, args...)...) 
+} + +// ErrorContainsf asserts that a function returned an error (i.e. not `nil`) +// and that the error contains the specified substring. +// +// actualObj, err := SomeFunction() +// assert.ErrorContainsf(t, err, expectedErrorSubString, "error message %s", "formatted") +func ErrorContainsf(t TestingT, theError error, contains string, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return ErrorContains(t, theError, contains, append([]interface{}{msg}, args...)...) +} + +// ErrorIsf asserts that at least one of the errors in err's chain matches target. +// This is a wrapper for errors.Is. +func ErrorIsf(t TestingT, err error, target error, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return ErrorIs(t, err, target, append([]interface{}{msg}, args...)...) +} + +// Eventuallyf asserts that given condition will be met in waitFor time, +// periodically checking target function each tick. +// +// assert.Eventuallyf(t, func() bool { return true; }, time.Second, 10*time.Millisecond, "error message %s", "formatted") +func Eventuallyf(t TestingT, condition func() bool, waitFor time.Duration, tick time.Duration, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Eventually(t, condition, waitFor, tick, append([]interface{}{msg}, args...)...) +} + +// EventuallyWithTf asserts that given condition will be met in waitFor time, +// periodically checking target function each tick. In contrast to Eventually, +// it supplies a CollectT to the condition function, so that the condition +// function can use the CollectT to call other assertions. +// The condition is considered "met" if no errors are raised in a tick. +// The supplied CollectT collects all errors from one tick (if there are any). +// If the condition is not met before waitFor, the collected errors of +// the last tick are copied to t. 
+// +// externalValue := false +// go func() { +// time.Sleep(8*time.Second) +// externalValue = true +// }() +// assert.EventuallyWithTf(t, func(c *assert.CollectT, "error message %s", "formatted") { +// // add assertions as needed; any assertion failure will fail the current tick +// assert.True(c, externalValue, "expected 'externalValue' to be true") +// }, 1*time.Second, 10*time.Second, "external state has not changed to 'true'; still false") +func EventuallyWithTf(t TestingT, condition func(collect *CollectT), waitFor time.Duration, tick time.Duration, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return EventuallyWithT(t, condition, waitFor, tick, append([]interface{}{msg}, args...)...) +} + +// Exactlyf asserts that two objects are equal in value and type. +// +// assert.Exactlyf(t, int32(123), int64(123), "error message %s", "formatted") +func Exactlyf(t TestingT, expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Exactly(t, expected, actual, append([]interface{}{msg}, args...)...) +} + +// Failf reports a failure through +func Failf(t TestingT, failureMessage string, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Fail(t, failureMessage, append([]interface{}{msg}, args...)...) +} + +// FailNowf fails test +func FailNowf(t TestingT, failureMessage string, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return FailNow(t, failureMessage, append([]interface{}{msg}, args...)...) +} + +// Falsef asserts that the specified value is false. +// +// assert.Falsef(t, myBool, "error message %s", "formatted") +func Falsef(t TestingT, value bool, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return False(t, value, append([]interface{}{msg}, args...)...) 
+} + +// FileExistsf checks whether a file exists in the given path. It also fails if +// the path points to a directory or there is an error when trying to check the file. +func FileExistsf(t TestingT, path string, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return FileExists(t, path, append([]interface{}{msg}, args...)...) +} + +// Greaterf asserts that the first element is greater than the second +// +// assert.Greaterf(t, 2, 1, "error message %s", "formatted") +// assert.Greaterf(t, float64(2), float64(1), "error message %s", "formatted") +// assert.Greaterf(t, "b", "a", "error message %s", "formatted") +func Greaterf(t TestingT, e1 interface{}, e2 interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Greater(t, e1, e2, append([]interface{}{msg}, args...)...) +} + +// GreaterOrEqualf asserts that the first element is greater than or equal to the second +// +// assert.GreaterOrEqualf(t, 2, 1, "error message %s", "formatted") +// assert.GreaterOrEqualf(t, 2, 2, "error message %s", "formatted") +// assert.GreaterOrEqualf(t, "b", "a", "error message %s", "formatted") +// assert.GreaterOrEqualf(t, "b", "b", "error message %s", "formatted") +func GreaterOrEqualf(t TestingT, e1 interface{}, e2 interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return GreaterOrEqual(t, e1, e2, append([]interface{}{msg}, args...)...) +} + +// HTTPBodyContainsf asserts that a specified handler returns a +// body that contains a string. +// +// assert.HTTPBodyContainsf(t, myHandler, "GET", "www.google.com", nil, "I'm Feeling Lucky", "error message %s", "formatted") +// +// Returns whether the assertion was successful (true) or not (false). 
+func HTTPBodyContainsf(t TestingT, handler http.HandlerFunc, method string, url string, values url.Values, str interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return HTTPBodyContains(t, handler, method, url, values, str, append([]interface{}{msg}, args...)...) +} + +// HTTPBodyNotContainsf asserts that a specified handler returns a +// body that does not contain a string. +// +// assert.HTTPBodyNotContainsf(t, myHandler, "GET", "www.google.com", nil, "I'm Feeling Lucky", "error message %s", "formatted") +// +// Returns whether the assertion was successful (true) or not (false). +func HTTPBodyNotContainsf(t TestingT, handler http.HandlerFunc, method string, url string, values url.Values, str interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return HTTPBodyNotContains(t, handler, method, url, values, str, append([]interface{}{msg}, args...)...) +} + +// HTTPErrorf asserts that a specified handler returns an error status code. +// +// assert.HTTPErrorf(t, myHandler, "POST", "/a/b/c", url.Values{"a": []string{"b", "c"}} +// +// Returns whether the assertion was successful (true) or not (false). +func HTTPErrorf(t TestingT, handler http.HandlerFunc, method string, url string, values url.Values, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return HTTPError(t, handler, method, url, values, append([]interface{}{msg}, args...)...) +} + +// HTTPRedirectf asserts that a specified handler returns a redirect status code. +// +// assert.HTTPRedirectf(t, myHandler, "GET", "/a/b/c", url.Values{"a": []string{"b", "c"}} +// +// Returns whether the assertion was successful (true) or not (false). 
+func HTTPRedirectf(t TestingT, handler http.HandlerFunc, method string, url string, values url.Values, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return HTTPRedirect(t, handler, method, url, values, append([]interface{}{msg}, args...)...) +} + +// HTTPStatusCodef asserts that a specified handler returns a specified status code. +// +// assert.HTTPStatusCodef(t, myHandler, "GET", "/notImplemented", nil, 501, "error message %s", "formatted") +// +// Returns whether the assertion was successful (true) or not (false). +func HTTPStatusCodef(t TestingT, handler http.HandlerFunc, method string, url string, values url.Values, statuscode int, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return HTTPStatusCode(t, handler, method, url, values, statuscode, append([]interface{}{msg}, args...)...) +} + +// HTTPSuccessf asserts that a specified handler returns a success status code. +// +// assert.HTTPSuccessf(t, myHandler, "POST", "http://www.google.com", nil, "error message %s", "formatted") +// +// Returns whether the assertion was successful (true) or not (false). +func HTTPSuccessf(t TestingT, handler http.HandlerFunc, method string, url string, values url.Values, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return HTTPSuccess(t, handler, method, url, values, append([]interface{}{msg}, args...)...) +} + +// Implementsf asserts that an object is implemented by the specified interface. +// +// assert.Implementsf(t, (*MyInterface)(nil), new(MyObject), "error message %s", "formatted") +func Implementsf(t TestingT, interfaceObject interface{}, object interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Implements(t, interfaceObject, object, append([]interface{}{msg}, args...)...) +} + +// InDeltaf asserts that the two numerals are within delta of each other. 
+// +// assert.InDeltaf(t, math.Pi, 22/7.0, 0.01, "error message %s", "formatted") +func InDeltaf(t TestingT, expected interface{}, actual interface{}, delta float64, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return InDelta(t, expected, actual, delta, append([]interface{}{msg}, args...)...) +} + +// InDeltaMapValuesf is the same as InDelta, but it compares all values between two maps. Both maps must have exactly the same keys. +func InDeltaMapValuesf(t TestingT, expected interface{}, actual interface{}, delta float64, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return InDeltaMapValues(t, expected, actual, delta, append([]interface{}{msg}, args...)...) +} + +// InDeltaSlicef is the same as InDelta, except it compares two slices. +func InDeltaSlicef(t TestingT, expected interface{}, actual interface{}, delta float64, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return InDeltaSlice(t, expected, actual, delta, append([]interface{}{msg}, args...)...) +} + +// InEpsilonf asserts that expected and actual have a relative error less than epsilon +func InEpsilonf(t TestingT, expected interface{}, actual interface{}, epsilon float64, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return InEpsilon(t, expected, actual, epsilon, append([]interface{}{msg}, args...)...) +} + +// InEpsilonSlicef is the same as InEpsilon, except it compares each value from two slices. +func InEpsilonSlicef(t TestingT, expected interface{}, actual interface{}, epsilon float64, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return InEpsilonSlice(t, expected, actual, epsilon, append([]interface{}{msg}, args...)...) 
+} + +// IsDecreasingf asserts that the collection is decreasing +// +// assert.IsDecreasingf(t, []int{2, 1, 0}, "error message %s", "formatted") +// assert.IsDecreasingf(t, []float{2, 1}, "error message %s", "formatted") +// assert.IsDecreasingf(t, []string{"b", "a"}, "error message %s", "formatted") +func IsDecreasingf(t TestingT, object interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return IsDecreasing(t, object, append([]interface{}{msg}, args...)...) +} + +// IsIncreasingf asserts that the collection is increasing +// +// assert.IsIncreasingf(t, []int{1, 2, 3}, "error message %s", "formatted") +// assert.IsIncreasingf(t, []float{1, 2}, "error message %s", "formatted") +// assert.IsIncreasingf(t, []string{"a", "b"}, "error message %s", "formatted") +func IsIncreasingf(t TestingT, object interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return IsIncreasing(t, object, append([]interface{}{msg}, args...)...) +} + +// IsNonDecreasingf asserts that the collection is not decreasing +// +// assert.IsNonDecreasingf(t, []int{1, 1, 2}, "error message %s", "formatted") +// assert.IsNonDecreasingf(t, []float{1, 2}, "error message %s", "formatted") +// assert.IsNonDecreasingf(t, []string{"a", "b"}, "error message %s", "formatted") +func IsNonDecreasingf(t TestingT, object interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return IsNonDecreasing(t, object, append([]interface{}{msg}, args...)...) 
+} + +// IsNonIncreasingf asserts that the collection is not increasing +// +// assert.IsNonIncreasingf(t, []int{2, 1, 1}, "error message %s", "formatted") +// assert.IsNonIncreasingf(t, []float{2, 1}, "error message %s", "formatted") +// assert.IsNonIncreasingf(t, []string{"b", "a"}, "error message %s", "formatted") +func IsNonIncreasingf(t TestingT, object interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return IsNonIncreasing(t, object, append([]interface{}{msg}, args...)...) +} + +// IsTypef asserts that the specified objects are of the same type. +func IsTypef(t TestingT, expectedType interface{}, object interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return IsType(t, expectedType, object, append([]interface{}{msg}, args...)...) +} + +// JSONEqf asserts that two JSON strings are equivalent. +// +// assert.JSONEqf(t, `{"hello": "world", "foo": "bar"}`, `{"foo": "bar", "hello": "world"}`, "error message %s", "formatted") +func JSONEqf(t TestingT, expected string, actual string, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return JSONEq(t, expected, actual, append([]interface{}{msg}, args...)...) +} + +// Lenf asserts that the specified object has specific length. +// Lenf also fails if the object has a type that len() not accept. +// +// assert.Lenf(t, mySlice, 3, "error message %s", "formatted") +func Lenf(t TestingT, object interface{}, length int, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Len(t, object, length, append([]interface{}{msg}, args...)...) 
+} + +// Lessf asserts that the first element is less than the second +// +// assert.Lessf(t, 1, 2, "error message %s", "formatted") +// assert.Lessf(t, float64(1), float64(2), "error message %s", "formatted") +// assert.Lessf(t, "a", "b", "error message %s", "formatted") +func Lessf(t TestingT, e1 interface{}, e2 interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Less(t, e1, e2, append([]interface{}{msg}, args...)...) +} + +// LessOrEqualf asserts that the first element is less than or equal to the second +// +// assert.LessOrEqualf(t, 1, 2, "error message %s", "formatted") +// assert.LessOrEqualf(t, 2, 2, "error message %s", "formatted") +// assert.LessOrEqualf(t, "a", "b", "error message %s", "formatted") +// assert.LessOrEqualf(t, "b", "b", "error message %s", "formatted") +func LessOrEqualf(t TestingT, e1 interface{}, e2 interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return LessOrEqual(t, e1, e2, append([]interface{}{msg}, args...)...) +} + +// Negativef asserts that the specified element is negative +// +// assert.Negativef(t, -1, "error message %s", "formatted") +// assert.Negativef(t, -1.23, "error message %s", "formatted") +func Negativef(t TestingT, e interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Negative(t, e, append([]interface{}{msg}, args...)...) +} + +// Neverf asserts that the given condition doesn't satisfy in waitFor time, +// periodically checking the target function each tick. +// +// assert.Neverf(t, func() bool { return false; }, time.Second, 10*time.Millisecond, "error message %s", "formatted") +func Neverf(t TestingT, condition func() bool, waitFor time.Duration, tick time.Duration, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Never(t, condition, waitFor, tick, append([]interface{}{msg}, args...)...) 
+} + +// Nilf asserts that the specified object is nil. +// +// assert.Nilf(t, err, "error message %s", "formatted") +func Nilf(t TestingT, object interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Nil(t, object, append([]interface{}{msg}, args...)...) +} + +// NoDirExistsf checks whether a directory does not exist in the given path. +// It fails if the path points to an existing _directory_ only. +func NoDirExistsf(t TestingT, path string, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NoDirExists(t, path, append([]interface{}{msg}, args...)...) +} + +// NoErrorf asserts that a function returned no error (i.e. `nil`). +// +// actualObj, err := SomeFunction() +// if assert.NoErrorf(t, err, "error message %s", "formatted") { +// assert.Equal(t, expectedObj, actualObj) +// } +func NoErrorf(t TestingT, err error, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NoError(t, err, append([]interface{}{msg}, args...)...) +} + +// NoFileExistsf checks whether a file does not exist in a given path. It fails +// if the path points to an existing _file_ only. +func NoFileExistsf(t TestingT, path string, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NoFileExists(t, path, append([]interface{}{msg}, args...)...) +} + +// NotContainsf asserts that the specified string, list(array, slice...) or map does NOT contain the +// specified substring or element. 
+// +// assert.NotContainsf(t, "Hello World", "Earth", "error message %s", "formatted") +// assert.NotContainsf(t, ["Hello", "World"], "Earth", "error message %s", "formatted") +// assert.NotContainsf(t, {"Hello": "World"}, "Earth", "error message %s", "formatted") +func NotContainsf(t TestingT, s interface{}, contains interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NotContains(t, s, contains, append([]interface{}{msg}, args...)...) +} + +// NotEmptyf asserts that the specified object is NOT empty. I.e. not nil, "", false, 0 or either +// a slice or a channel with len == 0. +// +// if assert.NotEmptyf(t, obj, "error message %s", "formatted") { +// assert.Equal(t, "two", obj[1]) +// } +func NotEmptyf(t TestingT, object interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NotEmpty(t, object, append([]interface{}{msg}, args...)...) +} + +// NotEqualf asserts that the specified values are NOT equal. +// +// assert.NotEqualf(t, obj1, obj2, "error message %s", "formatted") +// +// Pointer variable equality is determined based on the equality of the +// referenced values (as opposed to the memory addresses). +func NotEqualf(t TestingT, expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NotEqual(t, expected, actual, append([]interface{}{msg}, args...)...) +} + +// NotEqualValuesf asserts that two objects are not equal even when converted to the same type +// +// assert.NotEqualValuesf(t, obj1, obj2, "error message %s", "formatted") +func NotEqualValuesf(t TestingT, expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NotEqualValues(t, expected, actual, append([]interface{}{msg}, args...)...) +} + +// NotErrorIsf asserts that at none of the errors in err's chain matches target. 
+// This is a wrapper for errors.Is. +func NotErrorIsf(t TestingT, err error, target error, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NotErrorIs(t, err, target, append([]interface{}{msg}, args...)...) +} + +// NotNilf asserts that the specified object is not nil. +// +// assert.NotNilf(t, err, "error message %s", "formatted") +func NotNilf(t TestingT, object interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NotNil(t, object, append([]interface{}{msg}, args...)...) +} + +// NotPanicsf asserts that the code inside the specified PanicTestFunc does NOT panic. +// +// assert.NotPanicsf(t, func(){ RemainCalm() }, "error message %s", "formatted") +func NotPanicsf(t TestingT, f PanicTestFunc, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NotPanics(t, f, append([]interface{}{msg}, args...)...) +} + +// NotRegexpf asserts that a specified regexp does not match a string. +// +// assert.NotRegexpf(t, regexp.MustCompile("starts"), "it's starting", "error message %s", "formatted") +// assert.NotRegexpf(t, "^start", "it's not starting", "error message %s", "formatted") +func NotRegexpf(t TestingT, rx interface{}, str interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NotRegexp(t, rx, str, append([]interface{}{msg}, args...)...) +} + +// NotSamef asserts that two pointers do not reference the same object. +// +// assert.NotSamef(t, ptr1, ptr2, "error message %s", "formatted") +// +// Both arguments must be pointer variables. Pointer variable sameness is +// determined based on the equality of both type and value. +func NotSamef(t TestingT, expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NotSame(t, expected, actual, append([]interface{}{msg}, args...)...) 
+} + +// NotSubsetf asserts that the specified list(array, slice...) contains not all +// elements given in the specified subset(array, slice...). +// +// assert.NotSubsetf(t, [1, 3, 4], [1, 2], "But [1, 3, 4] does not contain [1, 2]", "error message %s", "formatted") +func NotSubsetf(t TestingT, list interface{}, subset interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NotSubset(t, list, subset, append([]interface{}{msg}, args...)...) +} + +// NotZerof asserts that i is not the zero value for its type. +func NotZerof(t TestingT, i interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return NotZero(t, i, append([]interface{}{msg}, args...)...) +} + +// Panicsf asserts that the code inside the specified PanicTestFunc panics. +// +// assert.Panicsf(t, func(){ GoCrazy() }, "error message %s", "formatted") +func Panicsf(t TestingT, f PanicTestFunc, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Panics(t, f, append([]interface{}{msg}, args...)...) +} + +// PanicsWithErrorf asserts that the code inside the specified PanicTestFunc +// panics, and that the recovered panic value is an error that satisfies the +// EqualError comparison. +// +// assert.PanicsWithErrorf(t, "crazy error", func(){ GoCrazy() }, "error message %s", "formatted") +func PanicsWithErrorf(t TestingT, errString string, f PanicTestFunc, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return PanicsWithError(t, errString, f, append([]interface{}{msg}, args...)...) +} + +// PanicsWithValuef asserts that the code inside the specified PanicTestFunc panics, and that +// the recovered panic value equals the expected panic value. 
+// +// assert.PanicsWithValuef(t, "crazy error", func(){ GoCrazy() }, "error message %s", "formatted") +func PanicsWithValuef(t TestingT, expected interface{}, f PanicTestFunc, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return PanicsWithValue(t, expected, f, append([]interface{}{msg}, args...)...) +} + +// Positivef asserts that the specified element is positive +// +// assert.Positivef(t, 1, "error message %s", "formatted") +// assert.Positivef(t, 1.23, "error message %s", "formatted") +func Positivef(t TestingT, e interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Positive(t, e, append([]interface{}{msg}, args...)...) +} + +// Regexpf asserts that a specified regexp matches a string. +// +// assert.Regexpf(t, regexp.MustCompile("start"), "it's starting", "error message %s", "formatted") +// assert.Regexpf(t, "start...$", "it's not starting", "error message %s", "formatted") +func Regexpf(t TestingT, rx interface{}, str interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Regexp(t, rx, str, append([]interface{}{msg}, args...)...) +} + +// Samef asserts that two pointers reference the same object. +// +// assert.Samef(t, ptr1, ptr2, "error message %s", "formatted") +// +// Both arguments must be pointer variables. Pointer variable sameness is +// determined based on the equality of both type and value. +func Samef(t TestingT, expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Same(t, expected, actual, append([]interface{}{msg}, args...)...) +} + +// Subsetf asserts that the specified list(array, slice...) contains all +// elements given in the specified subset(array, slice...). 
+// +// assert.Subsetf(t, [1, 2, 3], [1, 2], "But [1, 2, 3] does contain [1, 2]", "error message %s", "formatted") +func Subsetf(t TestingT, list interface{}, subset interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Subset(t, list, subset, append([]interface{}{msg}, args...)...) +} + +// Truef asserts that the specified value is true. +// +// assert.Truef(t, myBool, "error message %s", "formatted") +func Truef(t TestingT, value bool, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return True(t, value, append([]interface{}{msg}, args...)...) +} + +// WithinDurationf asserts that the two times are within duration delta of each other. +// +// assert.WithinDurationf(t, time.Now(), time.Now(), 10*time.Second, "error message %s", "formatted") +func WithinDurationf(t TestingT, expected time.Time, actual time.Time, delta time.Duration, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return WithinDuration(t, expected, actual, delta, append([]interface{}{msg}, args...)...) +} + +// WithinRangef asserts that a time is within a time range (inclusive). +// +// assert.WithinRangef(t, time.Now(), time.Now().Add(-time.Second), time.Now().Add(time.Second), "error message %s", "formatted") +func WithinRangef(t TestingT, actual time.Time, start time.Time, end time.Time, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return WithinRange(t, actual, start, end, append([]interface{}{msg}, args...)...) +} + +// YAMLEqf asserts that two YAML strings are equivalent. +func YAMLEqf(t TestingT, expected string, actual string, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return YAMLEq(t, expected, actual, append([]interface{}{msg}, args...)...) +} + +// Zerof asserts that i is the zero value for its type. 
+func Zerof(t TestingT, i interface{}, msg string, args ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Zero(t, i, append([]interface{}{msg}, args...)...) +} diff --git a/vendor/github.com/stretchr/testify/assert/assertion_format.go.tmpl b/vendor/github.com/stretchr/testify/assert/assertion_format.go.tmpl new file mode 100644 index 0000000000..d2bb0b8177 --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/assertion_format.go.tmpl @@ -0,0 +1,5 @@ +{{.CommentFormat}} +func {{.DocInfo.Name}}f(t TestingT, {{.ParamsFormat}}) bool { + if h, ok := t.(tHelper); ok { h.Helper() } + return {{.DocInfo.Name}}(t, {{.ForwardedParamsFormat}}) +} diff --git a/vendor/github.com/stretchr/testify/assert/assertion_forward.go b/vendor/github.com/stretchr/testify/assert/assertion_forward.go new file mode 100644 index 0000000000..b1d94aec53 --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/assertion_forward.go @@ -0,0 +1,1598 @@ +/* +* CODE GENERATED AUTOMATICALLY WITH github.com/stretchr/testify/_codegen +* THIS FILE MUST NOT BE EDITED BY HAND + */ + +package assert + +import ( + http "net/http" + url "net/url" + time "time" +) + +// Condition uses a Comparison to assert a complex condition. +func (a *Assertions) Condition(comp Comparison, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Condition(a.t, comp, msgAndArgs...) +} + +// Conditionf uses a Comparison to assert a complex condition. +func (a *Assertions) Conditionf(comp Comparison, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Conditionf(a.t, comp, msg, args...) +} + +// Contains asserts that the specified string, list(array, slice...) or map contains the +// specified substring or element. 
+// +// a.Contains("Hello World", "World") +// a.Contains(["Hello", "World"], "World") +// a.Contains({"Hello": "World"}, "Hello") +func (a *Assertions) Contains(s interface{}, contains interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Contains(a.t, s, contains, msgAndArgs...) +} + +// Containsf asserts that the specified string, list(array, slice...) or map contains the +// specified substring or element. +// +// a.Containsf("Hello World", "World", "error message %s", "formatted") +// a.Containsf(["Hello", "World"], "World", "error message %s", "formatted") +// a.Containsf({"Hello": "World"}, "Hello", "error message %s", "formatted") +func (a *Assertions) Containsf(s interface{}, contains interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Containsf(a.t, s, contains, msg, args...) +} + +// DirExists checks whether a directory exists in the given path. It also fails +// if the path is a file rather a directory or there is an error checking whether it exists. +func (a *Assertions) DirExists(path string, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return DirExists(a.t, path, msgAndArgs...) +} + +// DirExistsf checks whether a directory exists in the given path. It also fails +// if the path is a file rather a directory or there is an error checking whether it exists. +func (a *Assertions) DirExistsf(path string, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return DirExistsf(a.t, path, msg, args...) +} + +// ElementsMatch asserts that the specified listA(array, slice...) is equal to specified +// listB(array, slice...) ignoring the order of the elements. If there are duplicate elements, +// the number of appearances of each of them in both lists should match. 
+// +// a.ElementsMatch([1, 3, 2, 3], [1, 3, 3, 2]) +func (a *Assertions) ElementsMatch(listA interface{}, listB interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return ElementsMatch(a.t, listA, listB, msgAndArgs...) +} + +// ElementsMatchf asserts that the specified listA(array, slice...) is equal to specified +// listB(array, slice...) ignoring the order of the elements. If there are duplicate elements, +// the number of appearances of each of them in both lists should match. +// +// a.ElementsMatchf([1, 3, 2, 3], [1, 3, 3, 2], "error message %s", "formatted") +func (a *Assertions) ElementsMatchf(listA interface{}, listB interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return ElementsMatchf(a.t, listA, listB, msg, args...) +} + +// Empty asserts that the specified object is empty. I.e. nil, "", false, 0 or either +// a slice or a channel with len == 0. +// +// a.Empty(obj) +func (a *Assertions) Empty(object interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Empty(a.t, object, msgAndArgs...) +} + +// Emptyf asserts that the specified object is empty. I.e. nil, "", false, 0 or either +// a slice or a channel with len == 0. +// +// a.Emptyf(obj, "error message %s", "formatted") +func (a *Assertions) Emptyf(object interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Emptyf(a.t, object, msg, args...) +} + +// Equal asserts that two objects are equal. +// +// a.Equal(123, 123) +// +// Pointer variable equality is determined based on the equality of the +// referenced values (as opposed to the memory addresses). Function equality +// cannot be determined and will always fail. 
+func (a *Assertions) Equal(expected interface{}, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Equal(a.t, expected, actual, msgAndArgs...) +} + +// EqualError asserts that a function returned an error (i.e. not `nil`) +// and that it is equal to the provided error. +// +// actualObj, err := SomeFunction() +// a.EqualError(err, expectedErrorString) +func (a *Assertions) EqualError(theError error, errString string, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return EqualError(a.t, theError, errString, msgAndArgs...) +} + +// EqualErrorf asserts that a function returned an error (i.e. not `nil`) +// and that it is equal to the provided error. +// +// actualObj, err := SomeFunction() +// a.EqualErrorf(err, expectedErrorString, "error message %s", "formatted") +func (a *Assertions) EqualErrorf(theError error, errString string, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return EqualErrorf(a.t, theError, errString, msg, args...) +} + +// EqualExportedValues asserts that the types of two objects are equal and their public +// fields are also equal. This is useful for comparing structs that have private fields +// that could potentially differ. +// +// type S struct { +// Exported int +// notExported int +// } +// a.EqualExportedValues(S{1, 2}, S{1, 3}) => true +// a.EqualExportedValues(S{1, 2}, S{2, 3}) => false +func (a *Assertions) EqualExportedValues(expected interface{}, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return EqualExportedValues(a.t, expected, actual, msgAndArgs...) +} + +// EqualExportedValuesf asserts that the types of two objects are equal and their public +// fields are also equal. This is useful for comparing structs that have private fields +// that could potentially differ. 
+// +// type S struct { +// Exported int +// notExported int +// } +// a.EqualExportedValuesf(S{1, 2}, S{1, 3}, "error message %s", "formatted") => true +// a.EqualExportedValuesf(S{1, 2}, S{2, 3}, "error message %s", "formatted") => false +func (a *Assertions) EqualExportedValuesf(expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return EqualExportedValuesf(a.t, expected, actual, msg, args...) +} + +// EqualValues asserts that two objects are equal or convertable to the same types +// and equal. +// +// a.EqualValues(uint32(123), int32(123)) +func (a *Assertions) EqualValues(expected interface{}, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return EqualValues(a.t, expected, actual, msgAndArgs...) +} + +// EqualValuesf asserts that two objects are equal or convertable to the same types +// and equal. +// +// a.EqualValuesf(uint32(123), int32(123), "error message %s", "formatted") +func (a *Assertions) EqualValuesf(expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return EqualValuesf(a.t, expected, actual, msg, args...) +} + +// Equalf asserts that two objects are equal. +// +// a.Equalf(123, 123, "error message %s", "formatted") +// +// Pointer variable equality is determined based on the equality of the +// referenced values (as opposed to the memory addresses). Function equality +// cannot be determined and will always fail. +func (a *Assertions) Equalf(expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Equalf(a.t, expected, actual, msg, args...) +} + +// Error asserts that a function returned an error (i.e. not `nil`). 
+// +// actualObj, err := SomeFunction() +// if a.Error(err) { +// assert.Equal(t, expectedError, err) +// } +func (a *Assertions) Error(err error, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Error(a.t, err, msgAndArgs...) +} + +// ErrorAs asserts that at least one of the errors in err's chain matches target, and if so, sets target to that error value. +// This is a wrapper for errors.As. +func (a *Assertions) ErrorAs(err error, target interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return ErrorAs(a.t, err, target, msgAndArgs...) +} + +// ErrorAsf asserts that at least one of the errors in err's chain matches target, and if so, sets target to that error value. +// This is a wrapper for errors.As. +func (a *Assertions) ErrorAsf(err error, target interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return ErrorAsf(a.t, err, target, msg, args...) +} + +// ErrorContains asserts that a function returned an error (i.e. not `nil`) +// and that the error contains the specified substring. +// +// actualObj, err := SomeFunction() +// a.ErrorContains(err, expectedErrorSubString) +func (a *Assertions) ErrorContains(theError error, contains string, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return ErrorContains(a.t, theError, contains, msgAndArgs...) +} + +// ErrorContainsf asserts that a function returned an error (i.e. not `nil`) +// and that the error contains the specified substring. +// +// actualObj, err := SomeFunction() +// a.ErrorContainsf(err, expectedErrorSubString, "error message %s", "formatted") +func (a *Assertions) ErrorContainsf(theError error, contains string, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return ErrorContainsf(a.t, theError, contains, msg, args...) 
+} + +// ErrorIs asserts that at least one of the errors in err's chain matches target. +// This is a wrapper for errors.Is. +func (a *Assertions) ErrorIs(err error, target error, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return ErrorIs(a.t, err, target, msgAndArgs...) +} + +// ErrorIsf asserts that at least one of the errors in err's chain matches target. +// This is a wrapper for errors.Is. +func (a *Assertions) ErrorIsf(err error, target error, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return ErrorIsf(a.t, err, target, msg, args...) +} + +// Errorf asserts that a function returned an error (i.e. not `nil`). +// +// actualObj, err := SomeFunction() +// if a.Errorf(err, "error message %s", "formatted") { +// assert.Equal(t, expectedErrorf, err) +// } +func (a *Assertions) Errorf(err error, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Errorf(a.t, err, msg, args...) +} + +// Eventually asserts that given condition will be met in waitFor time, +// periodically checking target function each tick. +// +// a.Eventually(func() bool { return true; }, time.Second, 10*time.Millisecond) +func (a *Assertions) Eventually(condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Eventually(a.t, condition, waitFor, tick, msgAndArgs...) +} + +// EventuallyWithT asserts that given condition will be met in waitFor time, +// periodically checking target function each tick. In contrast to Eventually, +// it supplies a CollectT to the condition function, so that the condition +// function can use the CollectT to call other assertions. +// The condition is considered "met" if no errors are raised in a tick. +// The supplied CollectT collects all errors from one tick (if there are any). 
+// If the condition is not met before waitFor, the collected errors of +// the last tick are copied to t. +// +// externalValue := false +// go func() { +// time.Sleep(8*time.Second) +// externalValue = true +// }() +// a.EventuallyWithT(func(c *assert.CollectT) { +// // add assertions as needed; any assertion failure will fail the current tick +// assert.True(c, externalValue, "expected 'externalValue' to be true") +// }, 1*time.Second, 10*time.Second, "external state has not changed to 'true'; still false") +func (a *Assertions) EventuallyWithT(condition func(collect *CollectT), waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return EventuallyWithT(a.t, condition, waitFor, tick, msgAndArgs...) +} + +// EventuallyWithTf asserts that given condition will be met in waitFor time, +// periodically checking target function each tick. In contrast to Eventually, +// it supplies a CollectT to the condition function, so that the condition +// function can use the CollectT to call other assertions. +// The condition is considered "met" if no errors are raised in a tick. +// The supplied CollectT collects all errors from one tick (if there are any). +// If the condition is not met before waitFor, the collected errors of +// the last tick are copied to t. 
+// +// externalValue := false +// go func() { +// time.Sleep(8*time.Second) +// externalValue = true +// }() +// a.EventuallyWithTf(func(c *assert.CollectT, "error message %s", "formatted") { +// // add assertions as needed; any assertion failure will fail the current tick +// assert.True(c, externalValue, "expected 'externalValue' to be true") +// }, 1*time.Second, 10*time.Second, "external state has not changed to 'true'; still false") +func (a *Assertions) EventuallyWithTf(condition func(collect *CollectT), waitFor time.Duration, tick time.Duration, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return EventuallyWithTf(a.t, condition, waitFor, tick, msg, args...) +} + +// Eventuallyf asserts that given condition will be met in waitFor time, +// periodically checking target function each tick. +// +// a.Eventuallyf(func() bool { return true; }, time.Second, 10*time.Millisecond, "error message %s", "formatted") +func (a *Assertions) Eventuallyf(condition func() bool, waitFor time.Duration, tick time.Duration, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Eventuallyf(a.t, condition, waitFor, tick, msg, args...) +} + +// Exactly asserts that two objects are equal in value and type. +// +// a.Exactly(int32(123), int64(123)) +func (a *Assertions) Exactly(expected interface{}, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Exactly(a.t, expected, actual, msgAndArgs...) +} + +// Exactlyf asserts that two objects are equal in value and type. +// +// a.Exactlyf(int32(123), int64(123), "error message %s", "formatted") +func (a *Assertions) Exactlyf(expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Exactlyf(a.t, expected, actual, msg, args...) 
+} + +// Fail reports a failure through +func (a *Assertions) Fail(failureMessage string, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Fail(a.t, failureMessage, msgAndArgs...) +} + +// FailNow fails test +func (a *Assertions) FailNow(failureMessage string, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return FailNow(a.t, failureMessage, msgAndArgs...) +} + +// FailNowf fails test +func (a *Assertions) FailNowf(failureMessage string, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return FailNowf(a.t, failureMessage, msg, args...) +} + +// Failf reports a failure through +func (a *Assertions) Failf(failureMessage string, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Failf(a.t, failureMessage, msg, args...) +} + +// False asserts that the specified value is false. +// +// a.False(myBool) +func (a *Assertions) False(value bool, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return False(a.t, value, msgAndArgs...) +} + +// Falsef asserts that the specified value is false. +// +// a.Falsef(myBool, "error message %s", "formatted") +func (a *Assertions) Falsef(value bool, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Falsef(a.t, value, msg, args...) +} + +// FileExists checks whether a file exists in the given path. It also fails if +// the path points to a directory or there is an error when trying to check the file. +func (a *Assertions) FileExists(path string, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return FileExists(a.t, path, msgAndArgs...) +} + +// FileExistsf checks whether a file exists in the given path. It also fails if +// the path points to a directory or there is an error when trying to check the file. 
+func (a *Assertions) FileExistsf(path string, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return FileExistsf(a.t, path, msg, args...) +} + +// Greater asserts that the first element is greater than the second +// +// a.Greater(2, 1) +// a.Greater(float64(2), float64(1)) +// a.Greater("b", "a") +func (a *Assertions) Greater(e1 interface{}, e2 interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Greater(a.t, e1, e2, msgAndArgs...) +} + +// GreaterOrEqual asserts that the first element is greater than or equal to the second +// +// a.GreaterOrEqual(2, 1) +// a.GreaterOrEqual(2, 2) +// a.GreaterOrEqual("b", "a") +// a.GreaterOrEqual("b", "b") +func (a *Assertions) GreaterOrEqual(e1 interface{}, e2 interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return GreaterOrEqual(a.t, e1, e2, msgAndArgs...) +} + +// GreaterOrEqualf asserts that the first element is greater than or equal to the second +// +// a.GreaterOrEqualf(2, 1, "error message %s", "formatted") +// a.GreaterOrEqualf(2, 2, "error message %s", "formatted") +// a.GreaterOrEqualf("b", "a", "error message %s", "formatted") +// a.GreaterOrEqualf("b", "b", "error message %s", "formatted") +func (a *Assertions) GreaterOrEqualf(e1 interface{}, e2 interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return GreaterOrEqualf(a.t, e1, e2, msg, args...) +} + +// Greaterf asserts that the first element is greater than the second +// +// a.Greaterf(2, 1, "error message %s", "formatted") +// a.Greaterf(float64(2), float64(1), "error message %s", "formatted") +// a.Greaterf("b", "a", "error message %s", "formatted") +func (a *Assertions) Greaterf(e1 interface{}, e2 interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Greaterf(a.t, e1, e2, msg, args...) 
+} + +// HTTPBodyContains asserts that a specified handler returns a +// body that contains a string. +// +// a.HTTPBodyContains(myHandler, "GET", "www.google.com", nil, "I'm Feeling Lucky") +// +// Returns whether the assertion was successful (true) or not (false). +func (a *Assertions) HTTPBodyContains(handler http.HandlerFunc, method string, url string, values url.Values, str interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return HTTPBodyContains(a.t, handler, method, url, values, str, msgAndArgs...) +} + +// HTTPBodyContainsf asserts that a specified handler returns a +// body that contains a string. +// +// a.HTTPBodyContainsf(myHandler, "GET", "www.google.com", nil, "I'm Feeling Lucky", "error message %s", "formatted") +// +// Returns whether the assertion was successful (true) or not (false). +func (a *Assertions) HTTPBodyContainsf(handler http.HandlerFunc, method string, url string, values url.Values, str interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return HTTPBodyContainsf(a.t, handler, method, url, values, str, msg, args...) +} + +// HTTPBodyNotContains asserts that a specified handler returns a +// body that does not contain a string. +// +// a.HTTPBodyNotContains(myHandler, "GET", "www.google.com", nil, "I'm Feeling Lucky") +// +// Returns whether the assertion was successful (true) or not (false). +func (a *Assertions) HTTPBodyNotContains(handler http.HandlerFunc, method string, url string, values url.Values, str interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return HTTPBodyNotContains(a.t, handler, method, url, values, str, msgAndArgs...) +} + +// HTTPBodyNotContainsf asserts that a specified handler returns a +// body that does not contain a string. 
+// +// a.HTTPBodyNotContainsf(myHandler, "GET", "www.google.com", nil, "I'm Feeling Lucky", "error message %s", "formatted") +// +// Returns whether the assertion was successful (true) or not (false). +func (a *Assertions) HTTPBodyNotContainsf(handler http.HandlerFunc, method string, url string, values url.Values, str interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return HTTPBodyNotContainsf(a.t, handler, method, url, values, str, msg, args...) +} + +// HTTPError asserts that a specified handler returns an error status code. +// +// a.HTTPError(myHandler, "POST", "/a/b/c", url.Values{"a": []string{"b", "c"}} +// +// Returns whether the assertion was successful (true) or not (false). +func (a *Assertions) HTTPError(handler http.HandlerFunc, method string, url string, values url.Values, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return HTTPError(a.t, handler, method, url, values, msgAndArgs...) +} + +// HTTPErrorf asserts that a specified handler returns an error status code. +// +// a.HTTPErrorf(myHandler, "POST", "/a/b/c", url.Values{"a": []string{"b", "c"}} +// +// Returns whether the assertion was successful (true) or not (false). +func (a *Assertions) HTTPErrorf(handler http.HandlerFunc, method string, url string, values url.Values, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return HTTPErrorf(a.t, handler, method, url, values, msg, args...) +} + +// HTTPRedirect asserts that a specified handler returns a redirect status code. +// +// a.HTTPRedirect(myHandler, "GET", "/a/b/c", url.Values{"a": []string{"b", "c"}} +// +// Returns whether the assertion was successful (true) or not (false). 
+func (a *Assertions) HTTPRedirect(handler http.HandlerFunc, method string, url string, values url.Values, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return HTTPRedirect(a.t, handler, method, url, values, msgAndArgs...) +} + +// HTTPRedirectf asserts that a specified handler returns a redirect status code. +// +// a.HTTPRedirectf(myHandler, "GET", "/a/b/c", url.Values{"a": []string{"b", "c"}} +// +// Returns whether the assertion was successful (true) or not (false). +func (a *Assertions) HTTPRedirectf(handler http.HandlerFunc, method string, url string, values url.Values, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return HTTPRedirectf(a.t, handler, method, url, values, msg, args...) +} + +// HTTPStatusCode asserts that a specified handler returns a specified status code. +// +// a.HTTPStatusCode(myHandler, "GET", "/notImplemented", nil, 501) +// +// Returns whether the assertion was successful (true) or not (false). +func (a *Assertions) HTTPStatusCode(handler http.HandlerFunc, method string, url string, values url.Values, statuscode int, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return HTTPStatusCode(a.t, handler, method, url, values, statuscode, msgAndArgs...) +} + +// HTTPStatusCodef asserts that a specified handler returns a specified status code. +// +// a.HTTPStatusCodef(myHandler, "GET", "/notImplemented", nil, 501, "error message %s", "formatted") +// +// Returns whether the assertion was successful (true) or not (false). +func (a *Assertions) HTTPStatusCodef(handler http.HandlerFunc, method string, url string, values url.Values, statuscode int, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return HTTPStatusCodef(a.t, handler, method, url, values, statuscode, msg, args...) +} + +// HTTPSuccess asserts that a specified handler returns a success status code. 
+// +// a.HTTPSuccess(myHandler, "POST", "http://www.google.com", nil) +// +// Returns whether the assertion was successful (true) or not (false). +func (a *Assertions) HTTPSuccess(handler http.HandlerFunc, method string, url string, values url.Values, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return HTTPSuccess(a.t, handler, method, url, values, msgAndArgs...) +} + +// HTTPSuccessf asserts that a specified handler returns a success status code. +// +// a.HTTPSuccessf(myHandler, "POST", "http://www.google.com", nil, "error message %s", "formatted") +// +// Returns whether the assertion was successful (true) or not (false). +func (a *Assertions) HTTPSuccessf(handler http.HandlerFunc, method string, url string, values url.Values, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return HTTPSuccessf(a.t, handler, method, url, values, msg, args...) +} + +// Implements asserts that an object is implemented by the specified interface. +// +// a.Implements((*MyInterface)(nil), new(MyObject)) +func (a *Assertions) Implements(interfaceObject interface{}, object interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Implements(a.t, interfaceObject, object, msgAndArgs...) +} + +// Implementsf asserts that an object is implemented by the specified interface. +// +// a.Implementsf((*MyInterface)(nil), new(MyObject), "error message %s", "formatted") +func (a *Assertions) Implementsf(interfaceObject interface{}, object interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Implementsf(a.t, interfaceObject, object, msg, args...) +} + +// InDelta asserts that the two numerals are within delta of each other. 
+// +// a.InDelta(math.Pi, 22/7.0, 0.01) +func (a *Assertions) InDelta(expected interface{}, actual interface{}, delta float64, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return InDelta(a.t, expected, actual, delta, msgAndArgs...) +} + +// InDeltaMapValues is the same as InDelta, but it compares all values between two maps. Both maps must have exactly the same keys. +func (a *Assertions) InDeltaMapValues(expected interface{}, actual interface{}, delta float64, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return InDeltaMapValues(a.t, expected, actual, delta, msgAndArgs...) +} + +// InDeltaMapValuesf is the same as InDelta, but it compares all values between two maps. Both maps must have exactly the same keys. +func (a *Assertions) InDeltaMapValuesf(expected interface{}, actual interface{}, delta float64, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return InDeltaMapValuesf(a.t, expected, actual, delta, msg, args...) +} + +// InDeltaSlice is the same as InDelta, except it compares two slices. +func (a *Assertions) InDeltaSlice(expected interface{}, actual interface{}, delta float64, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return InDeltaSlice(a.t, expected, actual, delta, msgAndArgs...) +} + +// InDeltaSlicef is the same as InDelta, except it compares two slices. +func (a *Assertions) InDeltaSlicef(expected interface{}, actual interface{}, delta float64, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return InDeltaSlicef(a.t, expected, actual, delta, msg, args...) +} + +// InDeltaf asserts that the two numerals are within delta of each other. 
+// +// a.InDeltaf(math.Pi, 22/7.0, 0.01, "error message %s", "formatted") +func (a *Assertions) InDeltaf(expected interface{}, actual interface{}, delta float64, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return InDeltaf(a.t, expected, actual, delta, msg, args...) +} + +// InEpsilon asserts that expected and actual have a relative error less than epsilon +func (a *Assertions) InEpsilon(expected interface{}, actual interface{}, epsilon float64, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return InEpsilon(a.t, expected, actual, epsilon, msgAndArgs...) +} + +// InEpsilonSlice is the same as InEpsilon, except it compares each value from two slices. +func (a *Assertions) InEpsilonSlice(expected interface{}, actual interface{}, epsilon float64, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return InEpsilonSlice(a.t, expected, actual, epsilon, msgAndArgs...) +} + +// InEpsilonSlicef is the same as InEpsilon, except it compares each value from two slices. +func (a *Assertions) InEpsilonSlicef(expected interface{}, actual interface{}, epsilon float64, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return InEpsilonSlicef(a.t, expected, actual, epsilon, msg, args...) +} + +// InEpsilonf asserts that expected and actual have a relative error less than epsilon +func (a *Assertions) InEpsilonf(expected interface{}, actual interface{}, epsilon float64, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return InEpsilonf(a.t, expected, actual, epsilon, msg, args...) 
+} + +// IsDecreasing asserts that the collection is decreasing +// +// a.IsDecreasing([]int{2, 1, 0}) +// a.IsDecreasing([]float{2, 1}) +// a.IsDecreasing([]string{"b", "a"}) +func (a *Assertions) IsDecreasing(object interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return IsDecreasing(a.t, object, msgAndArgs...) +} + +// IsDecreasingf asserts that the collection is decreasing +// +// a.IsDecreasingf([]int{2, 1, 0}, "error message %s", "formatted") +// a.IsDecreasingf([]float{2, 1}, "error message %s", "formatted") +// a.IsDecreasingf([]string{"b", "a"}, "error message %s", "formatted") +func (a *Assertions) IsDecreasingf(object interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return IsDecreasingf(a.t, object, msg, args...) +} + +// IsIncreasing asserts that the collection is increasing +// +// a.IsIncreasing([]int{1, 2, 3}) +// a.IsIncreasing([]float{1, 2}) +// a.IsIncreasing([]string{"a", "b"}) +func (a *Assertions) IsIncreasing(object interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return IsIncreasing(a.t, object, msgAndArgs...) +} + +// IsIncreasingf asserts that the collection is increasing +// +// a.IsIncreasingf([]int{1, 2, 3}, "error message %s", "formatted") +// a.IsIncreasingf([]float{1, 2}, "error message %s", "formatted") +// a.IsIncreasingf([]string{"a", "b"}, "error message %s", "formatted") +func (a *Assertions) IsIncreasingf(object interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return IsIncreasingf(a.t, object, msg, args...) 
+} + +// IsNonDecreasing asserts that the collection is not decreasing +// +// a.IsNonDecreasing([]int{1, 1, 2}) +// a.IsNonDecreasing([]float{1, 2}) +// a.IsNonDecreasing([]string{"a", "b"}) +func (a *Assertions) IsNonDecreasing(object interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return IsNonDecreasing(a.t, object, msgAndArgs...) +} + +// IsNonDecreasingf asserts that the collection is not decreasing +// +// a.IsNonDecreasingf([]int{1, 1, 2}, "error message %s", "formatted") +// a.IsNonDecreasingf([]float{1, 2}, "error message %s", "formatted") +// a.IsNonDecreasingf([]string{"a", "b"}, "error message %s", "formatted") +func (a *Assertions) IsNonDecreasingf(object interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return IsNonDecreasingf(a.t, object, msg, args...) +} + +// IsNonIncreasing asserts that the collection is not increasing +// +// a.IsNonIncreasing([]int{2, 1, 1}) +// a.IsNonIncreasing([]float{2, 1}) +// a.IsNonIncreasing([]string{"b", "a"}) +func (a *Assertions) IsNonIncreasing(object interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return IsNonIncreasing(a.t, object, msgAndArgs...) +} + +// IsNonIncreasingf asserts that the collection is not increasing +// +// a.IsNonIncreasingf([]int{2, 1, 1}, "error message %s", "formatted") +// a.IsNonIncreasingf([]float{2, 1}, "error message %s", "formatted") +// a.IsNonIncreasingf([]string{"b", "a"}, "error message %s", "formatted") +func (a *Assertions) IsNonIncreasingf(object interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return IsNonIncreasingf(a.t, object, msg, args...) +} + +// IsType asserts that the specified objects are of the same type. 
+func (a *Assertions) IsType(expectedType interface{}, object interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return IsType(a.t, expectedType, object, msgAndArgs...) +} + +// IsTypef asserts that the specified objects are of the same type. +func (a *Assertions) IsTypef(expectedType interface{}, object interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return IsTypef(a.t, expectedType, object, msg, args...) +} + +// JSONEq asserts that two JSON strings are equivalent. +// +// a.JSONEq(`{"hello": "world", "foo": "bar"}`, `{"foo": "bar", "hello": "world"}`) +func (a *Assertions) JSONEq(expected string, actual string, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return JSONEq(a.t, expected, actual, msgAndArgs...) +} + +// JSONEqf asserts that two JSON strings are equivalent. +// +// a.JSONEqf(`{"hello": "world", "foo": "bar"}`, `{"foo": "bar", "hello": "world"}`, "error message %s", "formatted") +func (a *Assertions) JSONEqf(expected string, actual string, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return JSONEqf(a.t, expected, actual, msg, args...) +} + +// Len asserts that the specified object has specific length. +// Len also fails if the object has a type that len() not accept. +// +// a.Len(mySlice, 3) +func (a *Assertions) Len(object interface{}, length int, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Len(a.t, object, length, msgAndArgs...) +} + +// Lenf asserts that the specified object has specific length. +// Lenf also fails if the object has a type that len() not accept. 
+// +// a.Lenf(mySlice, 3, "error message %s", "formatted") +func (a *Assertions) Lenf(object interface{}, length int, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Lenf(a.t, object, length, msg, args...) +} + +// Less asserts that the first element is less than the second +// +// a.Less(1, 2) +// a.Less(float64(1), float64(2)) +// a.Less("a", "b") +func (a *Assertions) Less(e1 interface{}, e2 interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Less(a.t, e1, e2, msgAndArgs...) +} + +// LessOrEqual asserts that the first element is less than or equal to the second +// +// a.LessOrEqual(1, 2) +// a.LessOrEqual(2, 2) +// a.LessOrEqual("a", "b") +// a.LessOrEqual("b", "b") +func (a *Assertions) LessOrEqual(e1 interface{}, e2 interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return LessOrEqual(a.t, e1, e2, msgAndArgs...) +} + +// LessOrEqualf asserts that the first element is less than or equal to the second +// +// a.LessOrEqualf(1, 2, "error message %s", "formatted") +// a.LessOrEqualf(2, 2, "error message %s", "formatted") +// a.LessOrEqualf("a", "b", "error message %s", "formatted") +// a.LessOrEqualf("b", "b", "error message %s", "formatted") +func (a *Assertions) LessOrEqualf(e1 interface{}, e2 interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return LessOrEqualf(a.t, e1, e2, msg, args...) +} + +// Lessf asserts that the first element is less than the second +// +// a.Lessf(1, 2, "error message %s", "formatted") +// a.Lessf(float64(1), float64(2), "error message %s", "formatted") +// a.Lessf("a", "b", "error message %s", "formatted") +func (a *Assertions) Lessf(e1 interface{}, e2 interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Lessf(a.t, e1, e2, msg, args...) 
+} + +// Negative asserts that the specified element is negative +// +// a.Negative(-1) +// a.Negative(-1.23) +func (a *Assertions) Negative(e interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Negative(a.t, e, msgAndArgs...) +} + +// Negativef asserts that the specified element is negative +// +// a.Negativef(-1, "error message %s", "formatted") +// a.Negativef(-1.23, "error message %s", "formatted") +func (a *Assertions) Negativef(e interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Negativef(a.t, e, msg, args...) +} + +// Never asserts that the given condition doesn't satisfy in waitFor time, +// periodically checking the target function each tick. +// +// a.Never(func() bool { return false; }, time.Second, 10*time.Millisecond) +func (a *Assertions) Never(condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Never(a.t, condition, waitFor, tick, msgAndArgs...) +} + +// Neverf asserts that the given condition doesn't satisfy in waitFor time, +// periodically checking the target function each tick. +// +// a.Neverf(func() bool { return false; }, time.Second, 10*time.Millisecond, "error message %s", "formatted") +func (a *Assertions) Neverf(condition func() bool, waitFor time.Duration, tick time.Duration, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Neverf(a.t, condition, waitFor, tick, msg, args...) +} + +// Nil asserts that the specified object is nil. +// +// a.Nil(err) +func (a *Assertions) Nil(object interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Nil(a.t, object, msgAndArgs...) +} + +// Nilf asserts that the specified object is nil. 
+// +// a.Nilf(err, "error message %s", "formatted") +func (a *Assertions) Nilf(object interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Nilf(a.t, object, msg, args...) +} + +// NoDirExists checks whether a directory does not exist in the given path. +// It fails if the path points to an existing _directory_ only. +func (a *Assertions) NoDirExists(path string, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NoDirExists(a.t, path, msgAndArgs...) +} + +// NoDirExistsf checks whether a directory does not exist in the given path. +// It fails if the path points to an existing _directory_ only. +func (a *Assertions) NoDirExistsf(path string, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NoDirExistsf(a.t, path, msg, args...) +} + +// NoError asserts that a function returned no error (i.e. `nil`). +// +// actualObj, err := SomeFunction() +// if a.NoError(err) { +// assert.Equal(t, expectedObj, actualObj) +// } +func (a *Assertions) NoError(err error, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NoError(a.t, err, msgAndArgs...) +} + +// NoErrorf asserts that a function returned no error (i.e. `nil`). +// +// actualObj, err := SomeFunction() +// if a.NoErrorf(err, "error message %s", "formatted") { +// assert.Equal(t, expectedObj, actualObj) +// } +func (a *Assertions) NoErrorf(err error, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NoErrorf(a.t, err, msg, args...) +} + +// NoFileExists checks whether a file does not exist in a given path. It fails +// if the path points to an existing _file_ only. +func (a *Assertions) NoFileExists(path string, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NoFileExists(a.t, path, msgAndArgs...) 
+} + +// NoFileExistsf checks whether a file does not exist in a given path. It fails +// if the path points to an existing _file_ only. +func (a *Assertions) NoFileExistsf(path string, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NoFileExistsf(a.t, path, msg, args...) +} + +// NotContains asserts that the specified string, list(array, slice...) or map does NOT contain the +// specified substring or element. +// +// a.NotContains("Hello World", "Earth") +// a.NotContains(["Hello", "World"], "Earth") +// a.NotContains({"Hello": "World"}, "Earth") +func (a *Assertions) NotContains(s interface{}, contains interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotContains(a.t, s, contains, msgAndArgs...) +} + +// NotContainsf asserts that the specified string, list(array, slice...) or map does NOT contain the +// specified substring or element. +// +// a.NotContainsf("Hello World", "Earth", "error message %s", "formatted") +// a.NotContainsf(["Hello", "World"], "Earth", "error message %s", "formatted") +// a.NotContainsf({"Hello": "World"}, "Earth", "error message %s", "formatted") +func (a *Assertions) NotContainsf(s interface{}, contains interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotContainsf(a.t, s, contains, msg, args...) +} + +// NotEmpty asserts that the specified object is NOT empty. I.e. not nil, "", false, 0 or either +// a slice or a channel with len == 0. +// +// if a.NotEmpty(obj) { +// assert.Equal(t, "two", obj[1]) +// } +func (a *Assertions) NotEmpty(object interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotEmpty(a.t, object, msgAndArgs...) +} + +// NotEmptyf asserts that the specified object is NOT empty. I.e. not nil, "", false, 0 or either +// a slice or a channel with len == 0. 
+// +// if a.NotEmptyf(obj, "error message %s", "formatted") { +// assert.Equal(t, "two", obj[1]) +// } +func (a *Assertions) NotEmptyf(object interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotEmptyf(a.t, object, msg, args...) +} + +// NotEqual asserts that the specified values are NOT equal. +// +// a.NotEqual(obj1, obj2) +// +// Pointer variable equality is determined based on the equality of the +// referenced values (as opposed to the memory addresses). +func (a *Assertions) NotEqual(expected interface{}, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotEqual(a.t, expected, actual, msgAndArgs...) +} + +// NotEqualValues asserts that two objects are not equal even when converted to the same type +// +// a.NotEqualValues(obj1, obj2) +func (a *Assertions) NotEqualValues(expected interface{}, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotEqualValues(a.t, expected, actual, msgAndArgs...) +} + +// NotEqualValuesf asserts that two objects are not equal even when converted to the same type +// +// a.NotEqualValuesf(obj1, obj2, "error message %s", "formatted") +func (a *Assertions) NotEqualValuesf(expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotEqualValuesf(a.t, expected, actual, msg, args...) +} + +// NotEqualf asserts that the specified values are NOT equal. +// +// a.NotEqualf(obj1, obj2, "error message %s", "formatted") +// +// Pointer variable equality is determined based on the equality of the +// referenced values (as opposed to the memory addresses). 
+func (a *Assertions) NotEqualf(expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotEqualf(a.t, expected, actual, msg, args...) +} + +// NotErrorIs asserts that at none of the errors in err's chain matches target. +// This is a wrapper for errors.Is. +func (a *Assertions) NotErrorIs(err error, target error, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotErrorIs(a.t, err, target, msgAndArgs...) +} + +// NotErrorIsf asserts that at none of the errors in err's chain matches target. +// This is a wrapper for errors.Is. +func (a *Assertions) NotErrorIsf(err error, target error, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotErrorIsf(a.t, err, target, msg, args...) +} + +// NotNil asserts that the specified object is not nil. +// +// a.NotNil(err) +func (a *Assertions) NotNil(object interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotNil(a.t, object, msgAndArgs...) +} + +// NotNilf asserts that the specified object is not nil. +// +// a.NotNilf(err, "error message %s", "formatted") +func (a *Assertions) NotNilf(object interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotNilf(a.t, object, msg, args...) +} + +// NotPanics asserts that the code inside the specified PanicTestFunc does NOT panic. +// +// a.NotPanics(func(){ RemainCalm() }) +func (a *Assertions) NotPanics(f PanicTestFunc, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotPanics(a.t, f, msgAndArgs...) +} + +// NotPanicsf asserts that the code inside the specified PanicTestFunc does NOT panic. 
+// +// a.NotPanicsf(func(){ RemainCalm() }, "error message %s", "formatted") +func (a *Assertions) NotPanicsf(f PanicTestFunc, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotPanicsf(a.t, f, msg, args...) +} + +// NotRegexp asserts that a specified regexp does not match a string. +// +// a.NotRegexp(regexp.MustCompile("starts"), "it's starting") +// a.NotRegexp("^start", "it's not starting") +func (a *Assertions) NotRegexp(rx interface{}, str interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotRegexp(a.t, rx, str, msgAndArgs...) +} + +// NotRegexpf asserts that a specified regexp does not match a string. +// +// a.NotRegexpf(regexp.MustCompile("starts"), "it's starting", "error message %s", "formatted") +// a.NotRegexpf("^start", "it's not starting", "error message %s", "formatted") +func (a *Assertions) NotRegexpf(rx interface{}, str interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotRegexpf(a.t, rx, str, msg, args...) +} + +// NotSame asserts that two pointers do not reference the same object. +// +// a.NotSame(ptr1, ptr2) +// +// Both arguments must be pointer variables. Pointer variable sameness is +// determined based on the equality of both type and value. +func (a *Assertions) NotSame(expected interface{}, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotSame(a.t, expected, actual, msgAndArgs...) +} + +// NotSamef asserts that two pointers do not reference the same object. +// +// a.NotSamef(ptr1, ptr2, "error message %s", "formatted") +// +// Both arguments must be pointer variables. Pointer variable sameness is +// determined based on the equality of both type and value. 
+func (a *Assertions) NotSamef(expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotSamef(a.t, expected, actual, msg, args...) +} + +// NotSubset asserts that the specified list(array, slice...) contains not all +// elements given in the specified subset(array, slice...). +// +// a.NotSubset([1, 3, 4], [1, 2], "But [1, 3, 4] does not contain [1, 2]") +func (a *Assertions) NotSubset(list interface{}, subset interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotSubset(a.t, list, subset, msgAndArgs...) +} + +// NotSubsetf asserts that the specified list(array, slice...) contains not all +// elements given in the specified subset(array, slice...). +// +// a.NotSubsetf([1, 3, 4], [1, 2], "But [1, 3, 4] does not contain [1, 2]", "error message %s", "formatted") +func (a *Assertions) NotSubsetf(list interface{}, subset interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotSubsetf(a.t, list, subset, msg, args...) +} + +// NotZero asserts that i is not the zero value for its type. +func (a *Assertions) NotZero(i interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotZero(a.t, i, msgAndArgs...) +} + +// NotZerof asserts that i is not the zero value for its type. +func (a *Assertions) NotZerof(i interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return NotZerof(a.t, i, msg, args...) +} + +// Panics asserts that the code inside the specified PanicTestFunc panics. +// +// a.Panics(func(){ GoCrazy() }) +func (a *Assertions) Panics(f PanicTestFunc, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Panics(a.t, f, msgAndArgs...) 
+} + +// PanicsWithError asserts that the code inside the specified PanicTestFunc +// panics, and that the recovered panic value is an error that satisfies the +// EqualError comparison. +// +// a.PanicsWithError("crazy error", func(){ GoCrazy() }) +func (a *Assertions) PanicsWithError(errString string, f PanicTestFunc, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return PanicsWithError(a.t, errString, f, msgAndArgs...) +} + +// PanicsWithErrorf asserts that the code inside the specified PanicTestFunc +// panics, and that the recovered panic value is an error that satisfies the +// EqualError comparison. +// +// a.PanicsWithErrorf("crazy error", func(){ GoCrazy() }, "error message %s", "formatted") +func (a *Assertions) PanicsWithErrorf(errString string, f PanicTestFunc, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return PanicsWithErrorf(a.t, errString, f, msg, args...) +} + +// PanicsWithValue asserts that the code inside the specified PanicTestFunc panics, and that +// the recovered panic value equals the expected panic value. +// +// a.PanicsWithValue("crazy error", func(){ GoCrazy() }) +func (a *Assertions) PanicsWithValue(expected interface{}, f PanicTestFunc, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return PanicsWithValue(a.t, expected, f, msgAndArgs...) +} + +// PanicsWithValuef asserts that the code inside the specified PanicTestFunc panics, and that +// the recovered panic value equals the expected panic value. +// +// a.PanicsWithValuef("crazy error", func(){ GoCrazy() }, "error message %s", "formatted") +func (a *Assertions) PanicsWithValuef(expected interface{}, f PanicTestFunc, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return PanicsWithValuef(a.t, expected, f, msg, args...) +} + +// Panicsf asserts that the code inside the specified PanicTestFunc panics. 
+// +// a.Panicsf(func(){ GoCrazy() }, "error message %s", "formatted") +func (a *Assertions) Panicsf(f PanicTestFunc, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Panicsf(a.t, f, msg, args...) +} + +// Positive asserts that the specified element is positive +// +// a.Positive(1) +// a.Positive(1.23) +func (a *Assertions) Positive(e interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Positive(a.t, e, msgAndArgs...) +} + +// Positivef asserts that the specified element is positive +// +// a.Positivef(1, "error message %s", "formatted") +// a.Positivef(1.23, "error message %s", "formatted") +func (a *Assertions) Positivef(e interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Positivef(a.t, e, msg, args...) +} + +// Regexp asserts that a specified regexp matches a string. +// +// a.Regexp(regexp.MustCompile("start"), "it's starting") +// a.Regexp("start...$", "it's not starting") +func (a *Assertions) Regexp(rx interface{}, str interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Regexp(a.t, rx, str, msgAndArgs...) +} + +// Regexpf asserts that a specified regexp matches a string. +// +// a.Regexpf(regexp.MustCompile("start"), "it's starting", "error message %s", "formatted") +// a.Regexpf("start...$", "it's not starting", "error message %s", "formatted") +func (a *Assertions) Regexpf(rx interface{}, str interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Regexpf(a.t, rx, str, msg, args...) +} + +// Same asserts that two pointers reference the same object. +// +// a.Same(ptr1, ptr2) +// +// Both arguments must be pointer variables. Pointer variable sameness is +// determined based on the equality of both type and value. 
+func (a *Assertions) Same(expected interface{}, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Same(a.t, expected, actual, msgAndArgs...) +} + +// Samef asserts that two pointers reference the same object. +// +// a.Samef(ptr1, ptr2, "error message %s", "formatted") +// +// Both arguments must be pointer variables. Pointer variable sameness is +// determined based on the equality of both type and value. +func (a *Assertions) Samef(expected interface{}, actual interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Samef(a.t, expected, actual, msg, args...) +} + +// Subset asserts that the specified list(array, slice...) contains all +// elements given in the specified subset(array, slice...). +// +// a.Subset([1, 2, 3], [1, 2], "But [1, 2, 3] does contain [1, 2]") +func (a *Assertions) Subset(list interface{}, subset interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Subset(a.t, list, subset, msgAndArgs...) +} + +// Subsetf asserts that the specified list(array, slice...) contains all +// elements given in the specified subset(array, slice...). +// +// a.Subsetf([1, 2, 3], [1, 2], "But [1, 2, 3] does contain [1, 2]", "error message %s", "formatted") +func (a *Assertions) Subsetf(list interface{}, subset interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Subsetf(a.t, list, subset, msg, args...) +} + +// True asserts that the specified value is true. +// +// a.True(myBool) +func (a *Assertions) True(value bool, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return True(a.t, value, msgAndArgs...) +} + +// Truef asserts that the specified value is true. 
+// +// a.Truef(myBool, "error message %s", "formatted") +func (a *Assertions) Truef(value bool, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Truef(a.t, value, msg, args...) +} + +// WithinDuration asserts that the two times are within duration delta of each other. +// +// a.WithinDuration(time.Now(), time.Now(), 10*time.Second) +func (a *Assertions) WithinDuration(expected time.Time, actual time.Time, delta time.Duration, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return WithinDuration(a.t, expected, actual, delta, msgAndArgs...) +} + +// WithinDurationf asserts that the two times are within duration delta of each other. +// +// a.WithinDurationf(time.Now(), time.Now(), 10*time.Second, "error message %s", "formatted") +func (a *Assertions) WithinDurationf(expected time.Time, actual time.Time, delta time.Duration, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return WithinDurationf(a.t, expected, actual, delta, msg, args...) +} + +// WithinRange asserts that a time is within a time range (inclusive). +// +// a.WithinRange(time.Now(), time.Now().Add(-time.Second), time.Now().Add(time.Second)) +func (a *Assertions) WithinRange(actual time.Time, start time.Time, end time.Time, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return WithinRange(a.t, actual, start, end, msgAndArgs...) +} + +// WithinRangef asserts that a time is within a time range (inclusive). +// +// a.WithinRangef(time.Now(), time.Now().Add(-time.Second), time.Now().Add(time.Second), "error message %s", "formatted") +func (a *Assertions) WithinRangef(actual time.Time, start time.Time, end time.Time, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return WithinRangef(a.t, actual, start, end, msg, args...) +} + +// YAMLEq asserts that two YAML strings are equivalent. 
+func (a *Assertions) YAMLEq(expected string, actual string, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return YAMLEq(a.t, expected, actual, msgAndArgs...) +} + +// YAMLEqf asserts that two YAML strings are equivalent. +func (a *Assertions) YAMLEqf(expected string, actual string, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return YAMLEqf(a.t, expected, actual, msg, args...) +} + +// Zero asserts that i is the zero value for its type. +func (a *Assertions) Zero(i interface{}, msgAndArgs ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Zero(a.t, i, msgAndArgs...) +} + +// Zerof asserts that i is the zero value for its type. +func (a *Assertions) Zerof(i interface{}, msg string, args ...interface{}) bool { + if h, ok := a.t.(tHelper); ok { + h.Helper() + } + return Zerof(a.t, i, msg, args...) +} diff --git a/vendor/github.com/stretchr/testify/assert/assertion_forward.go.tmpl b/vendor/github.com/stretchr/testify/assert/assertion_forward.go.tmpl new file mode 100644 index 0000000000..188bb9e174 --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/assertion_forward.go.tmpl @@ -0,0 +1,5 @@ +{{.CommentWithoutT "a"}} +func (a *Assertions) {{.DocInfo.Name}}({{.Params}}) bool { + if h, ok := a.t.(tHelper); ok { h.Helper() } + return {{.DocInfo.Name}}(a.t, {{.ForwardedParams}}) +} diff --git a/vendor/github.com/stretchr/testify/assert/assertion_order.go b/vendor/github.com/stretchr/testify/assert/assertion_order.go new file mode 100644 index 0000000000..00df62a059 --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/assertion_order.go @@ -0,0 +1,81 @@ +package assert + +import ( + "fmt" + "reflect" +) + +// isOrdered checks that collection contains orderable elements. 
+func isOrdered(t TestingT, object interface{}, allowedComparesResults []CompareType, failMessage string, msgAndArgs ...interface{}) bool { + objKind := reflect.TypeOf(object).Kind() + if objKind != reflect.Slice && objKind != reflect.Array { + return false + } + + objValue := reflect.ValueOf(object) + objLen := objValue.Len() + + if objLen <= 1 { + return true + } + + value := objValue.Index(0) + valueInterface := value.Interface() + firstValueKind := value.Kind() + + for i := 1; i < objLen; i++ { + prevValue := value + prevValueInterface := valueInterface + + value = objValue.Index(i) + valueInterface = value.Interface() + + compareResult, isComparable := compare(prevValueInterface, valueInterface, firstValueKind) + + if !isComparable { + return Fail(t, fmt.Sprintf("Can not compare type \"%s\" and \"%s\"", reflect.TypeOf(value), reflect.TypeOf(prevValue)), msgAndArgs...) + } + + if !containsValue(allowedComparesResults, compareResult) { + return Fail(t, fmt.Sprintf(failMessage, prevValue, value), msgAndArgs...) + } + } + + return true +} + +// IsIncreasing asserts that the collection is increasing +// +// assert.IsIncreasing(t, []int{1, 2, 3}) +// assert.IsIncreasing(t, []float{1, 2}) +// assert.IsIncreasing(t, []string{"a", "b"}) +func IsIncreasing(t TestingT, object interface{}, msgAndArgs ...interface{}) bool { + return isOrdered(t, object, []CompareType{compareLess}, "\"%v\" is not less than \"%v\"", msgAndArgs...) +} + +// IsNonIncreasing asserts that the collection is not increasing +// +// assert.IsNonIncreasing(t, []int{2, 1, 1}) +// assert.IsNonIncreasing(t, []float{2, 1}) +// assert.IsNonIncreasing(t, []string{"b", "a"}) +func IsNonIncreasing(t TestingT, object interface{}, msgAndArgs ...interface{}) bool { + return isOrdered(t, object, []CompareType{compareEqual, compareGreater}, "\"%v\" is not greater than or equal to \"%v\"", msgAndArgs...) 
+} + +// IsDecreasing asserts that the collection is decreasing +// +// assert.IsDecreasing(t, []int{2, 1, 0}) +// assert.IsDecreasing(t, []float{2, 1}) +// assert.IsDecreasing(t, []string{"b", "a"}) +func IsDecreasing(t TestingT, object interface{}, msgAndArgs ...interface{}) bool { + return isOrdered(t, object, []CompareType{compareGreater}, "\"%v\" is not greater than \"%v\"", msgAndArgs...) +} + +// IsNonDecreasing asserts that the collection is not decreasing +// +// assert.IsNonDecreasing(t, []int{1, 1, 2}) +// assert.IsNonDecreasing(t, []float{1, 2}) +// assert.IsNonDecreasing(t, []string{"a", "b"}) +func IsNonDecreasing(t TestingT, object interface{}, msgAndArgs ...interface{}) bool { + return isOrdered(t, object, []CompareType{compareLess, compareEqual}, "\"%v\" is not less than or equal to \"%v\"", msgAndArgs...) +} diff --git a/vendor/github.com/stretchr/testify/assert/assertions.go b/vendor/github.com/stretchr/testify/assert/assertions.go new file mode 100644 index 0000000000..a55d1bba92 --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/assertions.go @@ -0,0 +1,2054 @@ +package assert + +import ( + "bufio" + "bytes" + "encoding/json" + "errors" + "fmt" + "math" + "os" + "reflect" + "regexp" + "runtime" + "runtime/debug" + "strings" + "time" + "unicode" + "unicode/utf8" + + "github.com/davecgh/go-spew/spew" + "github.com/pmezard/go-difflib/difflib" + yaml "gopkg.in/yaml.v3" +) + +//go:generate sh -c "cd ../_codegen && go build && cd - && ../_codegen/_codegen -output-package=assert -template=assertion_format.go.tmpl" + +// TestingT is an interface wrapper around *testing.T +type TestingT interface { + Errorf(format string, args ...interface{}) +} + +// ComparisonAssertionFunc is a common function prototype when comparing two values. Can be useful +// for table driven tests. 
+type ComparisonAssertionFunc func(TestingT, interface{}, interface{}, ...interface{}) bool + +// ValueAssertionFunc is a common function prototype when validating a single value. Can be useful +// for table driven tests. +type ValueAssertionFunc func(TestingT, interface{}, ...interface{}) bool + +// BoolAssertionFunc is a common function prototype when validating a bool value. Can be useful +// for table driven tests. +type BoolAssertionFunc func(TestingT, bool, ...interface{}) bool + +// ErrorAssertionFunc is a common function prototype when validating an error value. Can be useful +// for table driven tests. +type ErrorAssertionFunc func(TestingT, error, ...interface{}) bool + +// Comparison is a custom function that returns true on success and false on failure +type Comparison func() (success bool) + +/* + Helper functions +*/ + +// ObjectsAreEqual determines if two objects are considered equal. +// +// This function does no assertion of any kind. +func ObjectsAreEqual(expected, actual interface{}) bool { + if expected == nil || actual == nil { + return expected == actual + } + + exp, ok := expected.([]byte) + if !ok { + return reflect.DeepEqual(expected, actual) + } + + act, ok := actual.([]byte) + if !ok { + return false + } + if exp == nil || act == nil { + return exp == nil && act == nil + } + return bytes.Equal(exp, act) +} + +// copyExportedFields iterates downward through nested data structures and creates a copy +// that only contains the exported struct fields. 
+func copyExportedFields(expected interface{}) interface{} { + if isNil(expected) { + return expected + } + + expectedType := reflect.TypeOf(expected) + expectedKind := expectedType.Kind() + expectedValue := reflect.ValueOf(expected) + + switch expectedKind { + case reflect.Struct: + result := reflect.New(expectedType).Elem() + for i := 0; i < expectedType.NumField(); i++ { + field := expectedType.Field(i) + isExported := field.IsExported() + if isExported { + fieldValue := expectedValue.Field(i) + if isNil(fieldValue) || isNil(fieldValue.Interface()) { + continue + } + newValue := copyExportedFields(fieldValue.Interface()) + result.Field(i).Set(reflect.ValueOf(newValue)) + } + } + return result.Interface() + + case reflect.Ptr: + result := reflect.New(expectedType.Elem()) + unexportedRemoved := copyExportedFields(expectedValue.Elem().Interface()) + result.Elem().Set(reflect.ValueOf(unexportedRemoved)) + return result.Interface() + + case reflect.Array, reflect.Slice: + result := reflect.MakeSlice(expectedType, expectedValue.Len(), expectedValue.Len()) + for i := 0; i < expectedValue.Len(); i++ { + index := expectedValue.Index(i) + if isNil(index) { + continue + } + unexportedRemoved := copyExportedFields(index.Interface()) + result.Index(i).Set(reflect.ValueOf(unexportedRemoved)) + } + return result.Interface() + + case reflect.Map: + result := reflect.MakeMap(expectedType) + for _, k := range expectedValue.MapKeys() { + index := expectedValue.MapIndex(k) + unexportedRemoved := copyExportedFields(index.Interface()) + result.SetMapIndex(k, reflect.ValueOf(unexportedRemoved)) + } + return result.Interface() + + default: + return expected + } +} + +// ObjectsExportedFieldsAreEqual determines if the exported (public) fields of two objects are +// considered equal. This comparison of only exported fields is applied recursively to nested data +// structures. +// +// This function does no assertion of any kind. 
+func ObjectsExportedFieldsAreEqual(expected, actual interface{}) bool { + expectedCleaned := copyExportedFields(expected) + actualCleaned := copyExportedFields(actual) + return ObjectsAreEqualValues(expectedCleaned, actualCleaned) +} + +// ObjectsAreEqualValues gets whether two objects are equal, or if their +// values are equal. +func ObjectsAreEqualValues(expected, actual interface{}) bool { + if ObjectsAreEqual(expected, actual) { + return true + } + + actualType := reflect.TypeOf(actual) + if actualType == nil { + return false + } + expectedValue := reflect.ValueOf(expected) + if expectedValue.IsValid() && expectedValue.Type().ConvertibleTo(actualType) { + // Attempt comparison after type conversion + return reflect.DeepEqual(expectedValue.Convert(actualType).Interface(), actual) + } + + return false +} + +/* CallerInfo is necessary because the assert functions use the testing object +internally, causing it to print the file:line of the assert method, rather than where +the problem actually occurred in calling code.*/ + +// CallerInfo returns an array of strings containing the file and line number +// of each stack frame leading from the current test to the assert call that +// failed. +func CallerInfo() []string { + + var pc uintptr + var ok bool + var file string + var line int + var name string + + callers := []string{} + for i := 0; ; i++ { + pc, file, line, ok = runtime.Caller(i) + if !ok { + // The breaks below failed to terminate the loop, and we ran off the + // end of the call stack. + break + } + + // This is a huge edge case, but it will panic if this is the case, see #180 + if file == "" { + break + } + + f := runtime.FuncForPC(pc) + if f == nil { + break + } + name = f.Name() + + // testing.tRunner is the standard library function that calls + // tests. 
Subtests are called directly by tRunner, without going through + // the Test/Benchmark/Example function that contains the t.Run calls, so + // with subtests we should break when we hit tRunner, without adding it + // to the list of callers. + if name == "testing.tRunner" { + break + } + + parts := strings.Split(file, "/") + if len(parts) > 1 { + filename := parts[len(parts)-1] + dir := parts[len(parts)-2] + if (dir != "assert" && dir != "mock" && dir != "require") || filename == "mock_test.go" { + callers = append(callers, fmt.Sprintf("%s:%d", file, line)) + } + } + + // Drop the package + segments := strings.Split(name, ".") + name = segments[len(segments)-1] + if isTest(name, "Test") || + isTest(name, "Benchmark") || + isTest(name, "Example") { + break + } + } + + return callers +} + +// Stolen from the `go test` tool. +// isTest tells whether name looks like a test (or benchmark, according to prefix). +// It is a Test (say) if there is a character after Test that is not a lower-case letter. +// We don't want TesticularCancer. +func isTest(name, prefix string) bool { + if !strings.HasPrefix(name, prefix) { + return false + } + if len(name) == len(prefix) { // "Test" is ok + return true + } + r, _ := utf8.DecodeRuneInString(name[len(prefix):]) + return !unicode.IsLower(r) +} + +func messageFromMsgAndArgs(msgAndArgs ...interface{}) string { + if len(msgAndArgs) == 0 || msgAndArgs == nil { + return "" + } + if len(msgAndArgs) == 1 { + msg := msgAndArgs[0] + if msgAsStr, ok := msg.(string); ok { + return msgAsStr + } + return fmt.Sprintf("%+v", msg) + } + if len(msgAndArgs) > 1 { + return fmt.Sprintf(msgAndArgs[0].(string), msgAndArgs[1:]...) + } + return "" +} + +// Aligns the provided message so that all lines after the first line start at the same location as the first line. +// Assumes that the first line starts at the correct location (after carriage return, tab, label, spacer and tab). 
+// The longestLabelLen parameter specifies the length of the longest label in the output (required becaues this is the +// basis on which the alignment occurs). +func indentMessageLines(message string, longestLabelLen int) string { + outBuf := new(bytes.Buffer) + + for i, scanner := 0, bufio.NewScanner(strings.NewReader(message)); scanner.Scan(); i++ { + // no need to align first line because it starts at the correct location (after the label) + if i != 0 { + // append alignLen+1 spaces to align with "{{longestLabel}}:" before adding tab + outBuf.WriteString("\n\t" + strings.Repeat(" ", longestLabelLen+1) + "\t") + } + outBuf.WriteString(scanner.Text()) + } + + return outBuf.String() +} + +type failNower interface { + FailNow() +} + +// FailNow fails test +func FailNow(t TestingT, failureMessage string, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + Fail(t, failureMessage, msgAndArgs...) + + // We cannot extend TestingT with FailNow() and + // maintain backwards compatibility, so we fallback + // to panicking when FailNow is not available in + // TestingT. + // See issue #263 + + if t, ok := t.(failNower); ok { + t.FailNow() + } else { + panic("test failed and t is missing `FailNow()`") + } + return false +} + +// Fail reports a failure through +func Fail(t TestingT, failureMessage string, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + content := []labeledContent{ + {"Error Trace", strings.Join(CallerInfo(), "\n\t\t\t")}, + {"Error", failureMessage}, + } + + // Add test name if the Go version supports it + if n, ok := t.(interface { + Name() string + }); ok { + content = append(content, labeledContent{"Test", n.Name()}) + } + + message := messageFromMsgAndArgs(msgAndArgs...) 
+ if len(message) > 0 { + content = append(content, labeledContent{"Messages", message}) + } + + t.Errorf("\n%s", ""+labeledOutput(content...)) + + return false +} + +type labeledContent struct { + label string + content string +} + +// labeledOutput returns a string consisting of the provided labeledContent. Each labeled output is appended in the following manner: +// +// \t{{label}}:{{align_spaces}}\t{{content}}\n +// +// The initial carriage return is required to undo/erase any padding added by testing.T.Errorf. The "\t{{label}}:" is for the label. +// If a label is shorter than the longest label provided, padding spaces are added to make all the labels match in length. Once this +// alignment is achieved, "\t{{content}}\n" is added for the output. +// +// If the content of the labeledOutput contains line breaks, the subsequent lines are aligned so that they start at the same location as the first line. +func labeledOutput(content ...labeledContent) string { + longestLabel := 0 + for _, v := range content { + if len(v.label) > longestLabel { + longestLabel = len(v.label) + } + } + var output string + for _, v := range content { + output += "\t" + v.label + ":" + strings.Repeat(" ", longestLabel-len(v.label)) + "\t" + indentMessageLines(v.content, longestLabel) + "\n" + } + return output +} + +// Implements asserts that an object is implemented by the specified interface. +// +// assert.Implements(t, (*MyInterface)(nil), new(MyObject)) +func Implements(t TestingT, interfaceObject interface{}, object interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + interfaceType := reflect.TypeOf(interfaceObject).Elem() + + if object == nil { + return Fail(t, fmt.Sprintf("Cannot check if nil implements %v", interfaceType), msgAndArgs...) + } + if !reflect.TypeOf(object).Implements(interfaceType) { + return Fail(t, fmt.Sprintf("%T must implement %v", object, interfaceType), msgAndArgs...) 
+ } + + return true +} + +// IsType asserts that the specified objects are of the same type. +func IsType(t TestingT, expectedType interface{}, object interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + if !ObjectsAreEqual(reflect.TypeOf(object), reflect.TypeOf(expectedType)) { + return Fail(t, fmt.Sprintf("Object expected to be of type %v, but was %v", reflect.TypeOf(expectedType), reflect.TypeOf(object)), msgAndArgs...) + } + + return true +} + +// Equal asserts that two objects are equal. +// +// assert.Equal(t, 123, 123) +// +// Pointer variable equality is determined based on the equality of the +// referenced values (as opposed to the memory addresses). Function equality +// cannot be determined and will always fail. +func Equal(t TestingT, expected, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if err := validateEqualArgs(expected, actual); err != nil { + return Fail(t, fmt.Sprintf("Invalid operation: %#v == %#v (%s)", + expected, actual, err), msgAndArgs...) + } + + if !ObjectsAreEqual(expected, actual) { + diff := diff(expected, actual) + expected, actual = formatUnequalValues(expected, actual) + return Fail(t, fmt.Sprintf("Not equal: \n"+ + "expected: %s\n"+ + "actual : %s%s", expected, actual, diff), msgAndArgs...) + } + + return true + +} + +// validateEqualArgs checks whether provided arguments can be safely used in the +// Equal/NotEqual functions. +func validateEqualArgs(expected, actual interface{}) error { + if expected == nil && actual == nil { + return nil + } + + if isFunction(expected) || isFunction(actual) { + return errors.New("cannot take func type as argument") + } + return nil +} + +// Same asserts that two pointers reference the same object. +// +// assert.Same(t, ptr1, ptr2) +// +// Both arguments must be pointer variables. Pointer variable sameness is +// determined based on the equality of both type and value. 
+func Same(t TestingT, expected, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + if !samePointers(expected, actual) { + return Fail(t, fmt.Sprintf("Not same: \n"+ + "expected: %p %#v\n"+ + "actual : %p %#v", expected, expected, actual, actual), msgAndArgs...) + } + + return true +} + +// NotSame asserts that two pointers do not reference the same object. +// +// assert.NotSame(t, ptr1, ptr2) +// +// Both arguments must be pointer variables. Pointer variable sameness is +// determined based on the equality of both type and value. +func NotSame(t TestingT, expected, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + if samePointers(expected, actual) { + return Fail(t, fmt.Sprintf( + "Expected and actual point to the same object: %p %#v", + expected, expected), msgAndArgs...) + } + return true +} + +// samePointers compares two generic interface objects and returns whether +// they point to the same object +func samePointers(first, second interface{}) bool { + firstPtr, secondPtr := reflect.ValueOf(first), reflect.ValueOf(second) + if firstPtr.Kind() != reflect.Ptr || secondPtr.Kind() != reflect.Ptr { + return false + } + + firstType, secondType := reflect.TypeOf(first), reflect.TypeOf(second) + if firstType != secondType { + return false + } + + // compare pointer addresses + return first == second +} + +// formatUnequalValues takes two values of arbitrary types and returns string +// representations appropriate to be presented to the user. +// +// If the values are not of like type, the returned strings will be prefixed +// with the type name, and the value will be enclosed in parenthesis similar +// to a type conversion in the Go grammar. 
+func formatUnequalValues(expected, actual interface{}) (e string, a string) { + if reflect.TypeOf(expected) != reflect.TypeOf(actual) { + return fmt.Sprintf("%T(%s)", expected, truncatingFormat(expected)), + fmt.Sprintf("%T(%s)", actual, truncatingFormat(actual)) + } + switch expected.(type) { + case time.Duration: + return fmt.Sprintf("%v", expected), fmt.Sprintf("%v", actual) + } + return truncatingFormat(expected), truncatingFormat(actual) +} + +// truncatingFormat formats the data and truncates it if it's too long. +// +// This helps keep formatted error messages lines from exceeding the +// bufio.MaxScanTokenSize max line length that the go testing framework imposes. +func truncatingFormat(data interface{}) string { + value := fmt.Sprintf("%#v", data) + max := bufio.MaxScanTokenSize - 100 // Give us some space the type info too if needed. + if len(value) > max { + value = value[0:max] + "<... truncated>" + } + return value +} + +// EqualValues asserts that two objects are equal or convertable to the same types +// and equal. +// +// assert.EqualValues(t, uint32(123), int32(123)) +func EqualValues(t TestingT, expected, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + if !ObjectsAreEqualValues(expected, actual) { + diff := diff(expected, actual) + expected, actual = formatUnequalValues(expected, actual) + return Fail(t, fmt.Sprintf("Not equal: \n"+ + "expected: %s\n"+ + "actual : %s%s", expected, actual, diff), msgAndArgs...) + } + + return true + +} + +// EqualExportedValues asserts that the types of two objects are equal and their public +// fields are also equal. This is useful for comparing structs that have private fields +// that could potentially differ. 
+// +// type S struct { +// Exported int +// notExported int +// } +// assert.EqualExportedValues(t, S{1, 2}, S{1, 3}) => true +// assert.EqualExportedValues(t, S{1, 2}, S{2, 3}) => false +func EqualExportedValues(t TestingT, expected, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + aType := reflect.TypeOf(expected) + bType := reflect.TypeOf(actual) + + if aType != bType { + return Fail(t, fmt.Sprintf("Types expected to match exactly\n\t%v != %v", aType, bType), msgAndArgs...) + } + + if aType.Kind() != reflect.Struct { + return Fail(t, fmt.Sprintf("Types expected to both be struct \n\t%v != %v", aType.Kind(), reflect.Struct), msgAndArgs...) + } + + if bType.Kind() != reflect.Struct { + return Fail(t, fmt.Sprintf("Types expected to both be struct \n\t%v != %v", bType.Kind(), reflect.Struct), msgAndArgs...) + } + + expected = copyExportedFields(expected) + actual = copyExportedFields(actual) + + if !ObjectsAreEqualValues(expected, actual) { + diff := diff(expected, actual) + expected, actual = formatUnequalValues(expected, actual) + return Fail(t, fmt.Sprintf("Not equal (comparing only exported fields): \n"+ + "expected: %s\n"+ + "actual : %s%s", expected, actual, diff), msgAndArgs...) + } + + return true +} + +// Exactly asserts that two objects are equal in value and type. +// +// assert.Exactly(t, int32(123), int64(123)) +func Exactly(t TestingT, expected, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + aType := reflect.TypeOf(expected) + bType := reflect.TypeOf(actual) + + if aType != bType { + return Fail(t, fmt.Sprintf("Types expected to match exactly\n\t%v != %v", aType, bType), msgAndArgs...) + } + + return Equal(t, expected, actual, msgAndArgs...) + +} + +// NotNil asserts that the specified object is not nil. 
+// +// assert.NotNil(t, err) +func NotNil(t TestingT, object interface{}, msgAndArgs ...interface{}) bool { + if !isNil(object) { + return true + } + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Fail(t, "Expected value not to be nil.", msgAndArgs...) +} + +// containsKind checks if a specified kind in the slice of kinds. +func containsKind(kinds []reflect.Kind, kind reflect.Kind) bool { + for i := 0; i < len(kinds); i++ { + if kind == kinds[i] { + return true + } + } + + return false +} + +// isNil checks if a specified object is nil or not, without Failing. +func isNil(object interface{}) bool { + if object == nil { + return true + } + + value := reflect.ValueOf(object) + kind := value.Kind() + isNilableKind := containsKind( + []reflect.Kind{ + reflect.Chan, reflect.Func, + reflect.Interface, reflect.Map, + reflect.Ptr, reflect.Slice, reflect.UnsafePointer}, + kind) + + if isNilableKind && value.IsNil() { + return true + } + + return false +} + +// Nil asserts that the specified object is nil. +// +// assert.Nil(t, err) +func Nil(t TestingT, object interface{}, msgAndArgs ...interface{}) bool { + if isNil(object) { + return true + } + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Fail(t, fmt.Sprintf("Expected nil, but got: %#v", object), msgAndArgs...) +} + +// isEmpty gets whether the specified object is considered empty or not. 
+func isEmpty(object interface{}) bool { + + // get nil case out of the way + if object == nil { + return true + } + + objValue := reflect.ValueOf(object) + + switch objValue.Kind() { + // collection types are empty when they have no element + case reflect.Chan, reflect.Map, reflect.Slice: + return objValue.Len() == 0 + // pointers are empty if nil or if the value they point to is empty + case reflect.Ptr: + if objValue.IsNil() { + return true + } + deref := objValue.Elem().Interface() + return isEmpty(deref) + // for all other types, compare against the zero value + // array types are empty when they match their zero-initialized state + default: + zero := reflect.Zero(objValue.Type()) + return reflect.DeepEqual(object, zero.Interface()) + } +} + +// Empty asserts that the specified object is empty. I.e. nil, "", false, 0 or either +// a slice or a channel with len == 0. +// +// assert.Empty(t, obj) +func Empty(t TestingT, object interface{}, msgAndArgs ...interface{}) bool { + pass := isEmpty(object) + if !pass { + if h, ok := t.(tHelper); ok { + h.Helper() + } + Fail(t, fmt.Sprintf("Should be empty, but was %v", object), msgAndArgs...) + } + + return pass + +} + +// NotEmpty asserts that the specified object is NOT empty. I.e. not nil, "", false, 0 or either +// a slice or a channel with len == 0. +// +// if assert.NotEmpty(t, obj) { +// assert.Equal(t, "two", obj[1]) +// } +func NotEmpty(t TestingT, object interface{}, msgAndArgs ...interface{}) bool { + pass := !isEmpty(object) + if !pass { + if h, ok := t.(tHelper); ok { + h.Helper() + } + Fail(t, fmt.Sprintf("Should NOT be empty, but was %v", object), msgAndArgs...) + } + + return pass + +} + +// getLen try to get length of object. +// return (false, 0) if impossible. +func getLen(x interface{}) (ok bool, length int) { + v := reflect.ValueOf(x) + defer func() { + if e := recover(); e != nil { + ok = false + } + }() + return true, v.Len() +} + +// Len asserts that the specified object has specific length. 
+// Len also fails if the object has a type that len() not accept. +// +// assert.Len(t, mySlice, 3) +func Len(t TestingT, object interface{}, length int, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + ok, l := getLen(object) + if !ok { + return Fail(t, fmt.Sprintf("\"%s\" could not be applied builtin len()", object), msgAndArgs...) + } + + if l != length { + return Fail(t, fmt.Sprintf("\"%s\" should have %d item(s), but has %d", object, length, l), msgAndArgs...) + } + return true +} + +// True asserts that the specified value is true. +// +// assert.True(t, myBool) +func True(t TestingT, value bool, msgAndArgs ...interface{}) bool { + if !value { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Fail(t, "Should be true", msgAndArgs...) + } + + return true + +} + +// False asserts that the specified value is false. +// +// assert.False(t, myBool) +func False(t TestingT, value bool, msgAndArgs ...interface{}) bool { + if value { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Fail(t, "Should be false", msgAndArgs...) + } + + return true + +} + +// NotEqual asserts that the specified values are NOT equal. +// +// assert.NotEqual(t, obj1, obj2) +// +// Pointer variable equality is determined based on the equality of the +// referenced values (as opposed to the memory addresses). +func NotEqual(t TestingT, expected, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if err := validateEqualArgs(expected, actual); err != nil { + return Fail(t, fmt.Sprintf("Invalid operation: %#v != %#v (%s)", + expected, actual, err), msgAndArgs...) + } + + if ObjectsAreEqual(expected, actual) { + return Fail(t, fmt.Sprintf("Should not be: %#v\n", actual), msgAndArgs...) 
+ } + + return true + +} + +// NotEqualValues asserts that two objects are not equal even when converted to the same type +// +// assert.NotEqualValues(t, obj1, obj2) +func NotEqualValues(t TestingT, expected, actual interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + if ObjectsAreEqualValues(expected, actual) { + return Fail(t, fmt.Sprintf("Should not be: %#v\n", actual), msgAndArgs...) + } + + return true +} + +// containsElement try loop over the list check if the list includes the element. +// return (false, false) if impossible. +// return (true, false) if element was not found. +// return (true, true) if element was found. +func containsElement(list interface{}, element interface{}) (ok, found bool) { + + listValue := reflect.ValueOf(list) + listType := reflect.TypeOf(list) + if listType == nil { + return false, false + } + listKind := listType.Kind() + defer func() { + if e := recover(); e != nil { + ok = false + found = false + } + }() + + if listKind == reflect.String { + elementValue := reflect.ValueOf(element) + return true, strings.Contains(listValue.String(), elementValue.String()) + } + + if listKind == reflect.Map { + mapKeys := listValue.MapKeys() + for i := 0; i < len(mapKeys); i++ { + if ObjectsAreEqual(mapKeys[i].Interface(), element) { + return true, true + } + } + return true, false + } + + for i := 0; i < listValue.Len(); i++ { + if ObjectsAreEqual(listValue.Index(i).Interface(), element) { + return true, true + } + } + return true, false + +} + +// Contains asserts that the specified string, list(array, slice...) or map contains the +// specified substring or element. 
+// +// assert.Contains(t, "Hello World", "World") +// assert.Contains(t, ["Hello", "World"], "World") +// assert.Contains(t, {"Hello": "World"}, "Hello") +func Contains(t TestingT, s, contains interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + ok, found := containsElement(s, contains) + if !ok { + return Fail(t, fmt.Sprintf("%#v could not be applied builtin len()", s), msgAndArgs...) + } + if !found { + return Fail(t, fmt.Sprintf("%#v does not contain %#v", s, contains), msgAndArgs...) + } + + return true + +} + +// NotContains asserts that the specified string, list(array, slice...) or map does NOT contain the +// specified substring or element. +// +// assert.NotContains(t, "Hello World", "Earth") +// assert.NotContains(t, ["Hello", "World"], "Earth") +// assert.NotContains(t, {"Hello": "World"}, "Earth") +func NotContains(t TestingT, s, contains interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + ok, found := containsElement(s, contains) + if !ok { + return Fail(t, fmt.Sprintf("%#v could not be applied builtin len()", s), msgAndArgs...) + } + if found { + return Fail(t, fmt.Sprintf("%#v should not contain %#v", s, contains), msgAndArgs...) + } + + return true + +} + +// Subset asserts that the specified list(array, slice...) contains all +// elements given in the specified subset(array, slice...). +// +// assert.Subset(t, [1, 2, 3], [1, 2], "But [1, 2, 3] does contain [1, 2]") +func Subset(t TestingT, list, subset interface{}, msgAndArgs ...interface{}) (ok bool) { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if subset == nil { + return true // we consider nil to be equal to the nil set + } + + listKind := reflect.TypeOf(list).Kind() + if listKind != reflect.Array && listKind != reflect.Slice && listKind != reflect.Map { + return Fail(t, fmt.Sprintf("%q has an unsupported type %s", list, listKind), msgAndArgs...) 
+ } + + subsetKind := reflect.TypeOf(subset).Kind() + if subsetKind != reflect.Array && subsetKind != reflect.Slice && listKind != reflect.Map { + return Fail(t, fmt.Sprintf("%q has an unsupported type %s", subset, subsetKind), msgAndArgs...) + } + + if subsetKind == reflect.Map && listKind == reflect.Map { + subsetMap := reflect.ValueOf(subset) + actualMap := reflect.ValueOf(list) + + for _, k := range subsetMap.MapKeys() { + ev := subsetMap.MapIndex(k) + av := actualMap.MapIndex(k) + + if !av.IsValid() { + return Fail(t, fmt.Sprintf("%#v does not contain %#v", list, subset), msgAndArgs...) + } + if !ObjectsAreEqual(ev.Interface(), av.Interface()) { + return Fail(t, fmt.Sprintf("%#v does not contain %#v", list, subset), msgAndArgs...) + } + } + + return true + } + + subsetList := reflect.ValueOf(subset) + for i := 0; i < subsetList.Len(); i++ { + element := subsetList.Index(i).Interface() + ok, found := containsElement(list, element) + if !ok { + return Fail(t, fmt.Sprintf("%#v could not be applied builtin len()", list), msgAndArgs...) + } + if !found { + return Fail(t, fmt.Sprintf("%#v does not contain %#v", list, element), msgAndArgs...) + } + } + + return true +} + +// NotSubset asserts that the specified list(array, slice...) contains not all +// elements given in the specified subset(array, slice...). +// +// assert.NotSubset(t, [1, 3, 4], [1, 2], "But [1, 3, 4] does not contain [1, 2]") +func NotSubset(t TestingT, list, subset interface{}, msgAndArgs ...interface{}) (ok bool) { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if subset == nil { + return Fail(t, "nil is the empty set which is a subset of every set", msgAndArgs...) + } + + listKind := reflect.TypeOf(list).Kind() + if listKind != reflect.Array && listKind != reflect.Slice && listKind != reflect.Map { + return Fail(t, fmt.Sprintf("%q has an unsupported type %s", list, listKind), msgAndArgs...) 
+ } + + subsetKind := reflect.TypeOf(subset).Kind() + if subsetKind != reflect.Array && subsetKind != reflect.Slice && listKind != reflect.Map { + return Fail(t, fmt.Sprintf("%q has an unsupported type %s", subset, subsetKind), msgAndArgs...) + } + + if subsetKind == reflect.Map && listKind == reflect.Map { + subsetMap := reflect.ValueOf(subset) + actualMap := reflect.ValueOf(list) + + for _, k := range subsetMap.MapKeys() { + ev := subsetMap.MapIndex(k) + av := actualMap.MapIndex(k) + + if !av.IsValid() { + return true + } + if !ObjectsAreEqual(ev.Interface(), av.Interface()) { + return true + } + } + + return Fail(t, fmt.Sprintf("%q is a subset of %q", subset, list), msgAndArgs...) + } + + subsetList := reflect.ValueOf(subset) + for i := 0; i < subsetList.Len(); i++ { + element := subsetList.Index(i).Interface() + ok, found := containsElement(list, element) + if !ok { + return Fail(t, fmt.Sprintf("\"%s\" could not be applied builtin len()", list), msgAndArgs...) + } + if !found { + return true + } + } + + return Fail(t, fmt.Sprintf("%q is a subset of %q", subset, list), msgAndArgs...) +} + +// ElementsMatch asserts that the specified listA(array, slice...) is equal to specified +// listB(array, slice...) ignoring the order of the elements. If there are duplicate elements, +// the number of appearances of each of them in both lists should match. +// +// assert.ElementsMatch(t, [1, 3, 2, 3], [1, 3, 3, 2]) +func ElementsMatch(t TestingT, listA, listB interface{}, msgAndArgs ...interface{}) (ok bool) { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if isEmpty(listA) && isEmpty(listB) { + return true + } + + if !isList(t, listA, msgAndArgs...) || !isList(t, listB, msgAndArgs...) { + return false + } + + extraA, extraB := diffLists(listA, listB) + + if len(extraA) == 0 && len(extraB) == 0 { + return true + } + + return Fail(t, formatListDiff(listA, listB, extraA, extraB), msgAndArgs...) +} + +// isList checks that the provided value is array or slice. 
+func isList(t TestingT, list interface{}, msgAndArgs ...interface{}) (ok bool) { + kind := reflect.TypeOf(list).Kind() + if kind != reflect.Array && kind != reflect.Slice { + return Fail(t, fmt.Sprintf("%q has an unsupported type %s, expecting array or slice", list, kind), + msgAndArgs...) + } + return true +} + +// diffLists diffs two arrays/slices and returns slices of elements that are only in A and only in B. +// If some element is present multiple times, each instance is counted separately (e.g. if something is 2x in A and +// 5x in B, it will be 0x in extraA and 3x in extraB). The order of items in both lists is ignored. +func diffLists(listA, listB interface{}) (extraA, extraB []interface{}) { + aValue := reflect.ValueOf(listA) + bValue := reflect.ValueOf(listB) + + aLen := aValue.Len() + bLen := bValue.Len() + + // Mark indexes in bValue that we already used + visited := make([]bool, bLen) + for i := 0; i < aLen; i++ { + element := aValue.Index(i).Interface() + found := false + for j := 0; j < bLen; j++ { + if visited[j] { + continue + } + if ObjectsAreEqual(bValue.Index(j).Interface(), element) { + visited[j] = true + found = true + break + } + } + if !found { + extraA = append(extraA, element) + } + } + + for j := 0; j < bLen; j++ { + if visited[j] { + continue + } + extraB = append(extraB, bValue.Index(j).Interface()) + } + + return +} + +func formatListDiff(listA, listB interface{}, extraA, extraB []interface{}) string { + var msg bytes.Buffer + + msg.WriteString("elements differ") + if len(extraA) > 0 { + msg.WriteString("\n\nextra elements in list A:\n") + msg.WriteString(spewConfig.Sdump(extraA)) + } + if len(extraB) > 0 { + msg.WriteString("\n\nextra elements in list B:\n") + msg.WriteString(spewConfig.Sdump(extraB)) + } + msg.WriteString("\n\nlistA:\n") + msg.WriteString(spewConfig.Sdump(listA)) + msg.WriteString("\n\nlistB:\n") + msg.WriteString(spewConfig.Sdump(listB)) + + return msg.String() +} + +// Condition uses a Comparison to assert a 
complex condition. +func Condition(t TestingT, comp Comparison, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + result := comp() + if !result { + Fail(t, "Condition failed!", msgAndArgs...) + } + return result +} + +// PanicTestFunc defines a func that should be passed to the assert.Panics and assert.NotPanics +// methods, and represents a simple func that takes no arguments, and returns nothing. +type PanicTestFunc func() + +// didPanic returns true if the function passed to it panics. Otherwise, it returns false. +func didPanic(f PanicTestFunc) (didPanic bool, message interface{}, stack string) { + didPanic = true + + defer func() { + message = recover() + if didPanic { + stack = string(debug.Stack()) + } + }() + + // call the target function + f() + didPanic = false + + return +} + +// Panics asserts that the code inside the specified PanicTestFunc panics. +// +// assert.Panics(t, func(){ GoCrazy() }) +func Panics(t TestingT, f PanicTestFunc, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + if funcDidPanic, panicValue, _ := didPanic(f); !funcDidPanic { + return Fail(t, fmt.Sprintf("func %#v should panic\n\tPanic value:\t%#v", f, panicValue), msgAndArgs...) + } + + return true +} + +// PanicsWithValue asserts that the code inside the specified PanicTestFunc panics, and that +// the recovered panic value equals the expected panic value. +// +// assert.PanicsWithValue(t, "crazy error", func(){ GoCrazy() }) +func PanicsWithValue(t TestingT, expected interface{}, f PanicTestFunc, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + funcDidPanic, panicValue, panickedStack := didPanic(f) + if !funcDidPanic { + return Fail(t, fmt.Sprintf("func %#v should panic\n\tPanic value:\t%#v", f, panicValue), msgAndArgs...) 
+ } + if panicValue != expected { + return Fail(t, fmt.Sprintf("func %#v should panic with value:\t%#v\n\tPanic value:\t%#v\n\tPanic stack:\t%s", f, expected, panicValue, panickedStack), msgAndArgs...) + } + + return true +} + +// PanicsWithError asserts that the code inside the specified PanicTestFunc +// panics, and that the recovered panic value is an error that satisfies the +// EqualError comparison. +// +// assert.PanicsWithError(t, "crazy error", func(){ GoCrazy() }) +func PanicsWithError(t TestingT, errString string, f PanicTestFunc, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + funcDidPanic, panicValue, panickedStack := didPanic(f) + if !funcDidPanic { + return Fail(t, fmt.Sprintf("func %#v should panic\n\tPanic value:\t%#v", f, panicValue), msgAndArgs...) + } + panicErr, ok := panicValue.(error) + if !ok || panicErr.Error() != errString { + return Fail(t, fmt.Sprintf("func %#v should panic with error message:\t%#v\n\tPanic value:\t%#v\n\tPanic stack:\t%s", f, errString, panicValue, panickedStack), msgAndArgs...) + } + + return true +} + +// NotPanics asserts that the code inside the specified PanicTestFunc does NOT panic. +// +// assert.NotPanics(t, func(){ RemainCalm() }) +func NotPanics(t TestingT, f PanicTestFunc, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + if funcDidPanic, panicValue, panickedStack := didPanic(f); funcDidPanic { + return Fail(t, fmt.Sprintf("func %#v should not panic\n\tPanic value:\t%v\n\tPanic stack:\t%s", f, panicValue, panickedStack), msgAndArgs...) + } + + return true +} + +// WithinDuration asserts that the two times are within duration delta of each other. 
+// +// assert.WithinDuration(t, time.Now(), time.Now(), 10*time.Second) +func WithinDuration(t TestingT, expected, actual time.Time, delta time.Duration, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + dt := expected.Sub(actual) + if dt < -delta || dt > delta { + return Fail(t, fmt.Sprintf("Max difference between %v and %v allowed is %v, but difference was %v", expected, actual, delta, dt), msgAndArgs...) + } + + return true +} + +// WithinRange asserts that a time is within a time range (inclusive). +// +// assert.WithinRange(t, time.Now(), time.Now().Add(-time.Second), time.Now().Add(time.Second)) +func WithinRange(t TestingT, actual, start, end time.Time, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + if end.Before(start) { + return Fail(t, "Start should be before end", msgAndArgs...) + } + + if actual.Before(start) { + return Fail(t, fmt.Sprintf("Time %v expected to be in time range %v to %v, but is before the range", actual, start, end), msgAndArgs...) + } else if actual.After(end) { + return Fail(t, fmt.Sprintf("Time %v expected to be in time range %v to %v, but is after the range", actual, start, end), msgAndArgs...) + } + + return true +} + +func toFloat(x interface{}) (float64, bool) { + var xf float64 + xok := true + + switch xn := x.(type) { + case uint: + xf = float64(xn) + case uint8: + xf = float64(xn) + case uint16: + xf = float64(xn) + case uint32: + xf = float64(xn) + case uint64: + xf = float64(xn) + case int: + xf = float64(xn) + case int8: + xf = float64(xn) + case int16: + xf = float64(xn) + case int32: + xf = float64(xn) + case int64: + xf = float64(xn) + case float32: + xf = float64(xn) + case float64: + xf = xn + case time.Duration: + xf = float64(xn) + default: + xok = false + } + + return xf, xok +} + +// InDelta asserts that the two numerals are within delta of each other. 
+// +// assert.InDelta(t, math.Pi, 22/7.0, 0.01) +func InDelta(t TestingT, expected, actual interface{}, delta float64, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + af, aok := toFloat(expected) + bf, bok := toFloat(actual) + + if !aok || !bok { + return Fail(t, "Parameters must be numerical", msgAndArgs...) + } + + if math.IsNaN(af) && math.IsNaN(bf) { + return true + } + + if math.IsNaN(af) { + return Fail(t, "Expected must not be NaN", msgAndArgs...) + } + + if math.IsNaN(bf) { + return Fail(t, fmt.Sprintf("Expected %v with delta %v, but was NaN", expected, delta), msgAndArgs...) + } + + dt := af - bf + if dt < -delta || dt > delta { + return Fail(t, fmt.Sprintf("Max difference between %v and %v allowed is %v, but difference was %v", expected, actual, delta, dt), msgAndArgs...) + } + + return true +} + +// InDeltaSlice is the same as InDelta, except it compares two slices. +func InDeltaSlice(t TestingT, expected, actual interface{}, delta float64, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if expected == nil || actual == nil || + reflect.TypeOf(actual).Kind() != reflect.Slice || + reflect.TypeOf(expected).Kind() != reflect.Slice { + return Fail(t, "Parameters must be slice", msgAndArgs...) + } + + actualSlice := reflect.ValueOf(actual) + expectedSlice := reflect.ValueOf(expected) + + for i := 0; i < actualSlice.Len(); i++ { + result := InDelta(t, actualSlice.Index(i).Interface(), expectedSlice.Index(i).Interface(), delta, msgAndArgs...) + if !result { + return result + } + } + + return true +} + +// InDeltaMapValues is the same as InDelta, but it compares all values between two maps. Both maps must have exactly the same keys. 
+func InDeltaMapValues(t TestingT, expected, actual interface{}, delta float64, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if expected == nil || actual == nil || + reflect.TypeOf(actual).Kind() != reflect.Map || + reflect.TypeOf(expected).Kind() != reflect.Map { + return Fail(t, "Arguments must be maps", msgAndArgs...) + } + + expectedMap := reflect.ValueOf(expected) + actualMap := reflect.ValueOf(actual) + + if expectedMap.Len() != actualMap.Len() { + return Fail(t, "Arguments must have the same number of keys", msgAndArgs...) + } + + for _, k := range expectedMap.MapKeys() { + ev := expectedMap.MapIndex(k) + av := actualMap.MapIndex(k) + + if !ev.IsValid() { + return Fail(t, fmt.Sprintf("missing key %q in expected map", k), msgAndArgs...) + } + + if !av.IsValid() { + return Fail(t, fmt.Sprintf("missing key %q in actual map", k), msgAndArgs...) + } + + if !InDelta( + t, + ev.Interface(), + av.Interface(), + delta, + msgAndArgs..., + ) { + return false + } + } + + return true +} + +func calcRelativeError(expected, actual interface{}) (float64, error) { + af, aok := toFloat(expected) + bf, bok := toFloat(actual) + if !aok || !bok { + return 0, fmt.Errorf("Parameters must be numerical") + } + if math.IsNaN(af) && math.IsNaN(bf) { + return 0, nil + } + if math.IsNaN(af) { + return 0, errors.New("expected value must not be NaN") + } + if af == 0 { + return 0, fmt.Errorf("expected value must have a value other than zero to calculate the relative error") + } + if math.IsNaN(bf) { + return 0, errors.New("actual value must not be NaN") + } + + return math.Abs(af-bf) / math.Abs(af), nil +} + +// InEpsilon asserts that expected and actual have a relative error less than epsilon +func InEpsilon(t TestingT, expected, actual interface{}, epsilon float64, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if math.IsNaN(epsilon) { + return Fail(t, "epsilon must not be NaN") + } + actualEpsilon, err := 
calcRelativeError(expected, actual) + if err != nil { + return Fail(t, err.Error(), msgAndArgs...) + } + if actualEpsilon > epsilon { + return Fail(t, fmt.Sprintf("Relative error is too high: %#v (expected)\n"+ + " < %#v (actual)", epsilon, actualEpsilon), msgAndArgs...) + } + + return true +} + +// InEpsilonSlice is the same as InEpsilon, except it compares each value from two slices. +func InEpsilonSlice(t TestingT, expected, actual interface{}, epsilon float64, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if expected == nil || actual == nil || + reflect.TypeOf(actual).Kind() != reflect.Slice || + reflect.TypeOf(expected).Kind() != reflect.Slice { + return Fail(t, "Parameters must be slice", msgAndArgs...) + } + + actualSlice := reflect.ValueOf(actual) + expectedSlice := reflect.ValueOf(expected) + + for i := 0; i < actualSlice.Len(); i++ { + result := InEpsilon(t, actualSlice.Index(i).Interface(), expectedSlice.Index(i).Interface(), epsilon) + if !result { + return result + } + } + + return true +} + +/* + Errors +*/ + +// NoError asserts that a function returned no error (i.e. `nil`). +// +// actualObj, err := SomeFunction() +// if assert.NoError(t, err) { +// assert.Equal(t, expectedObj, actualObj) +// } +func NoError(t TestingT, err error, msgAndArgs ...interface{}) bool { + if err != nil { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Fail(t, fmt.Sprintf("Received unexpected error:\n%+v", err), msgAndArgs...) + } + + return true +} + +// Error asserts that a function returned an error (i.e. not `nil`). +// +// actualObj, err := SomeFunction() +// if assert.Error(t, err) { +// assert.Equal(t, expectedError, err) +// } +func Error(t TestingT, err error, msgAndArgs ...interface{}) bool { + if err == nil { + if h, ok := t.(tHelper); ok { + h.Helper() + } + return Fail(t, "An error is expected but got nil.", msgAndArgs...) + } + + return true +} + +// EqualError asserts that a function returned an error (i.e. 
not `nil`) +// and that it is equal to the provided error. +// +// actualObj, err := SomeFunction() +// assert.EqualError(t, err, expectedErrorString) +func EqualError(t TestingT, theError error, errString string, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if !Error(t, theError, msgAndArgs...) { + return false + } + expected := errString + actual := theError.Error() + // don't need to use deep equals here, we know they are both strings + if expected != actual { + return Fail(t, fmt.Sprintf("Error message not equal:\n"+ + "expected: %q\n"+ + "actual : %q", expected, actual), msgAndArgs...) + } + return true +} + +// ErrorContains asserts that a function returned an error (i.e. not `nil`) +// and that the error contains the specified substring. +// +// actualObj, err := SomeFunction() +// assert.ErrorContains(t, err, expectedErrorSubString) +func ErrorContains(t TestingT, theError error, contains string, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if !Error(t, theError, msgAndArgs...) { + return false + } + + actual := theError.Error() + if !strings.Contains(actual, contains) { + return Fail(t, fmt.Sprintf("Error %#v does not contain %#v", actual, contains), msgAndArgs...) + } + + return true +} + +// matchRegexp return true if a specified regexp matches a string. +func matchRegexp(rx interface{}, str interface{}) bool { + + var r *regexp.Regexp + if rr, ok := rx.(*regexp.Regexp); ok { + r = rr + } else { + r = regexp.MustCompile(fmt.Sprint(rx)) + } + + return (r.FindStringIndex(fmt.Sprint(str)) != nil) + +} + +// Regexp asserts that a specified regexp matches a string. 
+// +// assert.Regexp(t, regexp.MustCompile("start"), "it's starting") +// assert.Regexp(t, "start...$", "it's not starting") +func Regexp(t TestingT, rx interface{}, str interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + match := matchRegexp(rx, str) + + if !match { + Fail(t, fmt.Sprintf("Expect \"%v\" to match \"%v\"", str, rx), msgAndArgs...) + } + + return match +} + +// NotRegexp asserts that a specified regexp does not match a string. +// +// assert.NotRegexp(t, regexp.MustCompile("starts"), "it's starting") +// assert.NotRegexp(t, "^start", "it's not starting") +func NotRegexp(t TestingT, rx interface{}, str interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + match := matchRegexp(rx, str) + + if match { + Fail(t, fmt.Sprintf("Expect \"%v\" to NOT match \"%v\"", str, rx), msgAndArgs...) + } + + return !match + +} + +// Zero asserts that i is the zero value for its type. +func Zero(t TestingT, i interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if i != nil && !reflect.DeepEqual(i, reflect.Zero(reflect.TypeOf(i)).Interface()) { + return Fail(t, fmt.Sprintf("Should be zero, but was %v", i), msgAndArgs...) + } + return true +} + +// NotZero asserts that i is not the zero value for its type. +func NotZero(t TestingT, i interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if i == nil || reflect.DeepEqual(i, reflect.Zero(reflect.TypeOf(i)).Interface()) { + return Fail(t, fmt.Sprintf("Should not be zero, but was %v", i), msgAndArgs...) + } + return true +} + +// FileExists checks whether a file exists in the given path. It also fails if +// the path points to a directory or there is an error when trying to check the file. 
+func FileExists(t TestingT, path string, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + info, err := os.Lstat(path) + if err != nil { + if os.IsNotExist(err) { + return Fail(t, fmt.Sprintf("unable to find file %q", path), msgAndArgs...) + } + return Fail(t, fmt.Sprintf("error when running os.Lstat(%q): %s", path, err), msgAndArgs...) + } + if info.IsDir() { + return Fail(t, fmt.Sprintf("%q is a directory", path), msgAndArgs...) + } + return true +} + +// NoFileExists checks whether a file does not exist in a given path. It fails +// if the path points to an existing _file_ only. +func NoFileExists(t TestingT, path string, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + info, err := os.Lstat(path) + if err != nil { + return true + } + if info.IsDir() { + return true + } + return Fail(t, fmt.Sprintf("file %q exists", path), msgAndArgs...) +} + +// DirExists checks whether a directory exists in the given path. It also fails +// if the path is a file rather a directory or there is an error checking whether it exists. +func DirExists(t TestingT, path string, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + info, err := os.Lstat(path) + if err != nil { + if os.IsNotExist(err) { + return Fail(t, fmt.Sprintf("unable to find file %q", path), msgAndArgs...) + } + return Fail(t, fmt.Sprintf("error when running os.Lstat(%q): %s", path, err), msgAndArgs...) + } + if !info.IsDir() { + return Fail(t, fmt.Sprintf("%q is a file", path), msgAndArgs...) + } + return true +} + +// NoDirExists checks whether a directory does not exist in the given path. +// It fails if the path points to an existing _directory_ only. 
+func NoDirExists(t TestingT, path string, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + info, err := os.Lstat(path) + if err != nil { + if os.IsNotExist(err) { + return true + } + return true + } + if !info.IsDir() { + return true + } + return Fail(t, fmt.Sprintf("directory %q exists", path), msgAndArgs...) +} + +// JSONEq asserts that two JSON strings are equivalent. +// +// assert.JSONEq(t, `{"hello": "world", "foo": "bar"}`, `{"foo": "bar", "hello": "world"}`) +func JSONEq(t TestingT, expected string, actual string, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + var expectedJSONAsInterface, actualJSONAsInterface interface{} + + if err := json.Unmarshal([]byte(expected), &expectedJSONAsInterface); err != nil { + return Fail(t, fmt.Sprintf("Expected value ('%s') is not valid json.\nJSON parsing error: '%s'", expected, err.Error()), msgAndArgs...) + } + + if err := json.Unmarshal([]byte(actual), &actualJSONAsInterface); err != nil { + return Fail(t, fmt.Sprintf("Input ('%s') needs to be valid json.\nJSON parsing error: '%s'", actual, err.Error()), msgAndArgs...) + } + + return Equal(t, expectedJSONAsInterface, actualJSONAsInterface, msgAndArgs...) +} + +// YAMLEq asserts that two YAML strings are equivalent. +func YAMLEq(t TestingT, expected string, actual string, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + var expectedYAMLAsInterface, actualYAMLAsInterface interface{} + + if err := yaml.Unmarshal([]byte(expected), &expectedYAMLAsInterface); err != nil { + return Fail(t, fmt.Sprintf("Expected value ('%s') is not valid yaml.\nYAML parsing error: '%s'", expected, err.Error()), msgAndArgs...) + } + + if err := yaml.Unmarshal([]byte(actual), &actualYAMLAsInterface); err != nil { + return Fail(t, fmt.Sprintf("Input ('%s') needs to be valid yaml.\nYAML error: '%s'", actual, err.Error()), msgAndArgs...) 
+ } + + return Equal(t, expectedYAMLAsInterface, actualYAMLAsInterface, msgAndArgs...) +} + +func typeAndKind(v interface{}) (reflect.Type, reflect.Kind) { + t := reflect.TypeOf(v) + k := t.Kind() + + if k == reflect.Ptr { + t = t.Elem() + k = t.Kind() + } + return t, k +} + +// diff returns a diff of both values as long as both are of the same type and +// are a struct, map, slice, array or string. Otherwise it returns an empty string. +func diff(expected interface{}, actual interface{}) string { + if expected == nil || actual == nil { + return "" + } + + et, ek := typeAndKind(expected) + at, _ := typeAndKind(actual) + + if et != at { + return "" + } + + if ek != reflect.Struct && ek != reflect.Map && ek != reflect.Slice && ek != reflect.Array && ek != reflect.String { + return "" + } + + var e, a string + + switch et { + case reflect.TypeOf(""): + e = reflect.ValueOf(expected).String() + a = reflect.ValueOf(actual).String() + case reflect.TypeOf(time.Time{}): + e = spewConfigStringerEnabled.Sdump(expected) + a = spewConfigStringerEnabled.Sdump(actual) + default: + e = spewConfig.Sdump(expected) + a = spewConfig.Sdump(actual) + } + + diff, _ := difflib.GetUnifiedDiffString(difflib.UnifiedDiff{ + A: difflib.SplitLines(e), + B: difflib.SplitLines(a), + FromFile: "Expected", + FromDate: "", + ToFile: "Actual", + ToDate: "", + Context: 1, + }) + + return "\n\nDiff:\n" + diff +} + +func isFunction(arg interface{}) bool { + if arg == nil { + return false + } + return reflect.TypeOf(arg).Kind() == reflect.Func +} + +var spewConfig = spew.ConfigState{ + Indent: " ", + DisablePointerAddresses: true, + DisableCapacities: true, + SortKeys: true, + DisableMethods: true, + MaxDepth: 10, +} + +var spewConfigStringerEnabled = spew.ConfigState{ + Indent: " ", + DisablePointerAddresses: true, + DisableCapacities: true, + SortKeys: true, + MaxDepth: 10, +} + +type tHelper interface { + Helper() +} + +// Eventually asserts that given condition will be met in waitFor time, +// 
periodically checking target function each tick. +// +// assert.Eventually(t, func() bool { return true; }, time.Second, 10*time.Millisecond) +func Eventually(t TestingT, condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + ch := make(chan bool, 1) + + timer := time.NewTimer(waitFor) + defer timer.Stop() + + ticker := time.NewTicker(tick) + defer ticker.Stop() + + for tick := ticker.C; ; { + select { + case <-timer.C: + return Fail(t, "Condition never satisfied", msgAndArgs...) + case <-tick: + tick = nil + go func() { ch <- condition() }() + case v := <-ch: + if v { + return true + } + tick = ticker.C + } + } +} + +// CollectT implements the TestingT interface and collects all errors. +type CollectT struct { + errors []error +} + +// Errorf collects the error. +func (c *CollectT) Errorf(format string, args ...interface{}) { + c.errors = append(c.errors, fmt.Errorf(format, args...)) +} + +// FailNow panics. +func (c *CollectT) FailNow() { + panic("Assertion failed") +} + +// Reset clears the collected errors. +func (c *CollectT) Reset() { + c.errors = nil +} + +// Copy copies the collected errors to the supplied t. +func (c *CollectT) Copy(t TestingT) { + if tt, ok := t.(tHelper); ok { + tt.Helper() + } + for _, err := range c.errors { + t.Errorf("%v", err) + } +} + +// EventuallyWithT asserts that given condition will be met in waitFor time, +// periodically checking target function each tick. In contrast to Eventually, +// it supplies a CollectT to the condition function, so that the condition +// function can use the CollectT to call other assertions. +// The condition is considered "met" if no errors are raised in a tick. +// The supplied CollectT collects all errors from one tick (if there are any). +// If the condition is not met before waitFor, the collected errors of +// the last tick are copied to t. 
+// +// externalValue := false +// go func() { +// time.Sleep(8*time.Second) +// externalValue = true +// }() +// assert.EventuallyWithT(t, func(c *assert.CollectT) { +// // add assertions as needed; any assertion failure will fail the current tick +// assert.True(c, externalValue, "expected 'externalValue' to be true") +// }, 1*time.Second, 10*time.Second, "external state has not changed to 'true'; still false") +func EventuallyWithT(t TestingT, condition func(collect *CollectT), waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + collect := new(CollectT) + ch := make(chan bool, 1) + + timer := time.NewTimer(waitFor) + defer timer.Stop() + + ticker := time.NewTicker(tick) + defer ticker.Stop() + + for tick := ticker.C; ; { + select { + case <-timer.C: + collect.Copy(t) + return Fail(t, "Condition never satisfied", msgAndArgs...) + case <-tick: + tick = nil + collect.Reset() + go func() { + condition(collect) + ch <- len(collect.errors) == 0 + }() + case v := <-ch: + if v { + return true + } + tick = ticker.C + } + } +} + +// Never asserts that the given condition doesn't satisfy in waitFor time, +// periodically checking the target function each tick. +// +// assert.Never(t, func() bool { return false; }, time.Second, 10*time.Millisecond) +func Never(t TestingT, condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + + ch := make(chan bool, 1) + + timer := time.NewTimer(waitFor) + defer timer.Stop() + + ticker := time.NewTicker(tick) + defer ticker.Stop() + + for tick := ticker.C; ; { + select { + case <-timer.C: + return true + case <-tick: + tick = nil + go func() { ch <- condition() }() + case v := <-ch: + if v { + return Fail(t, "Condition satisfied", msgAndArgs...) + } + tick = ticker.C + } + } +} + +// ErrorIs asserts that at least one of the errors in err's chain matches target. 
+// This is a wrapper for errors.Is. +func ErrorIs(t TestingT, err, target error, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if errors.Is(err, target) { + return true + } + + var expectedText string + if target != nil { + expectedText = target.Error() + } + + chain := buildErrorChainString(err) + + return Fail(t, fmt.Sprintf("Target error should be in err chain:\n"+ + "expected: %q\n"+ + "in chain: %s", expectedText, chain, + ), msgAndArgs...) +} + +// NotErrorIs asserts that at none of the errors in err's chain matches target. +// This is a wrapper for errors.Is. +func NotErrorIs(t TestingT, err, target error, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if !errors.Is(err, target) { + return true + } + + var expectedText string + if target != nil { + expectedText = target.Error() + } + + chain := buildErrorChainString(err) + + return Fail(t, fmt.Sprintf("Target error should not be in err chain:\n"+ + "found: %q\n"+ + "in chain: %s", expectedText, chain, + ), msgAndArgs...) +} + +// ErrorAs asserts that at least one of the errors in err's chain matches target, and if so, sets target to that error value. +// This is a wrapper for errors.As. +func ErrorAs(t TestingT, err error, target interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + if errors.As(err, target) { + return true + } + + chain := buildErrorChainString(err) + + return Fail(t, fmt.Sprintf("Should be in error chain:\n"+ + "expected: %q\n"+ + "in chain: %s", target, chain, + ), msgAndArgs...) 
+} + +func buildErrorChainString(err error) string { + if err == nil { + return "" + } + + e := errors.Unwrap(err) + chain := fmt.Sprintf("%q", err.Error()) + for e != nil { + chain += fmt.Sprintf("\n\t%q", e.Error()) + e = errors.Unwrap(e) + } + return chain +} diff --git a/vendor/github.com/stretchr/testify/assert/doc.go b/vendor/github.com/stretchr/testify/assert/doc.go new file mode 100644 index 0000000000..4953981d38 --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/doc.go @@ -0,0 +1,46 @@ +// Package assert provides a set of comprehensive testing tools for use with the normal Go testing system. +// +// # Example Usage +// +// The following is a complete example using assert in a standard test function: +// +// import ( +// "testing" +// "github.com/stretchr/testify/assert" +// ) +// +// func TestSomething(t *testing.T) { +// +// var a string = "Hello" +// var b string = "Hello" +// +// assert.Equal(t, a, b, "The two words should be the same.") +// +// } +// +// if you assert many times, use the format below: +// +// import ( +// "testing" +// "github.com/stretchr/testify/assert" +// ) +// +// func TestSomething(t *testing.T) { +// assert := assert.New(t) +// +// var a string = "Hello" +// var b string = "Hello" +// +// assert.Equal(a, b, "The two words should be the same.") +// } +// +// # Assertions +// +// Assertions allow you to easily write test code, and are global funcs in the `assert` package. +// All assertion functions take, as the first argument, the `*testing.T` object provided by the +// testing framework. This allows the assertion funcs to write the failings and other details to +// the correct place. +// +// Every assertion function also takes an optional string message as the final argument, +// allowing custom error messages to be appended to the message the assertion method outputs. 
+package assert diff --git a/vendor/github.com/stretchr/testify/assert/errors.go b/vendor/github.com/stretchr/testify/assert/errors.go new file mode 100644 index 0000000000..ac9dc9d1d6 --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/errors.go @@ -0,0 +1,10 @@ +package assert + +import ( + "errors" +) + +// AnError is an error instance useful for testing. If the code does not care +// about error specifics, and only needs to return the error for example, this +// error should be used to make the test code more readable. +var AnError = errors.New("assert.AnError general error for testing") diff --git a/vendor/github.com/stretchr/testify/assert/forward_assertions.go b/vendor/github.com/stretchr/testify/assert/forward_assertions.go new file mode 100644 index 0000000000..df189d2348 --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/forward_assertions.go @@ -0,0 +1,16 @@ +package assert + +// Assertions provides assertion methods around the +// TestingT interface. +type Assertions struct { + t TestingT +} + +// New makes a new Assertions object for the specified TestingT. +func New(t TestingT) *Assertions { + return &Assertions{ + t: t, + } +} + +//go:generate sh -c "cd ../_codegen && go build && cd - && ../_codegen/_codegen -output-package=assert -template=assertion_forward.go.tmpl -include-format-funcs" diff --git a/vendor/github.com/stretchr/testify/assert/http_assertions.go b/vendor/github.com/stretchr/testify/assert/http_assertions.go new file mode 100644 index 0000000000..d8038c28a7 --- /dev/null +++ b/vendor/github.com/stretchr/testify/assert/http_assertions.go @@ -0,0 +1,162 @@ +package assert + +import ( + "fmt" + "net/http" + "net/http/httptest" + "net/url" + "strings" +) + +// httpCode is a helper that returns HTTP code of the response. It returns -1 and +// an error if building a new request fails. 
+func httpCode(handler http.HandlerFunc, method, url string, values url.Values) (int, error) { + w := httptest.NewRecorder() + req, err := http.NewRequest(method, url, nil) + if err != nil { + return -1, err + } + req.URL.RawQuery = values.Encode() + handler(w, req) + return w.Code, nil +} + +// HTTPSuccess asserts that a specified handler returns a success status code. +// +// assert.HTTPSuccess(t, myHandler, "POST", "http://www.google.com", nil) +// +// Returns whether the assertion was successful (true) or not (false). +func HTTPSuccess(t TestingT, handler http.HandlerFunc, method, url string, values url.Values, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + code, err := httpCode(handler, method, url, values) + if err != nil { + Fail(t, fmt.Sprintf("Failed to build test request, got error: %s", err)) + } + + isSuccessCode := code >= http.StatusOK && code <= http.StatusPartialContent + if !isSuccessCode { + Fail(t, fmt.Sprintf("Expected HTTP success status code for %q but received %d", url+"?"+values.Encode(), code)) + } + + return isSuccessCode +} + +// HTTPRedirect asserts that a specified handler returns a redirect status code. +// +// assert.HTTPRedirect(t, myHandler, "GET", "/a/b/c", url.Values{"a": []string{"b", "c"}} +// +// Returns whether the assertion was successful (true) or not (false). 
+func HTTPRedirect(t TestingT, handler http.HandlerFunc, method, url string, values url.Values, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + code, err := httpCode(handler, method, url, values) + if err != nil { + Fail(t, fmt.Sprintf("Failed to build test request, got error: %s", err)) + } + + isRedirectCode := code >= http.StatusMultipleChoices && code <= http.StatusTemporaryRedirect + if !isRedirectCode { + Fail(t, fmt.Sprintf("Expected HTTP redirect status code for %q but received %d", url+"?"+values.Encode(), code)) + } + + return isRedirectCode +} + +// HTTPError asserts that a specified handler returns an error status code. +// +// assert.HTTPError(t, myHandler, "POST", "/a/b/c", url.Values{"a": []string{"b", "c"}} +// +// Returns whether the assertion was successful (true) or not (false). +func HTTPError(t TestingT, handler http.HandlerFunc, method, url string, values url.Values, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + code, err := httpCode(handler, method, url, values) + if err != nil { + Fail(t, fmt.Sprintf("Failed to build test request, got error: %s", err)) + } + + isErrorCode := code >= http.StatusBadRequest + if !isErrorCode { + Fail(t, fmt.Sprintf("Expected HTTP error status code for %q but received %d", url+"?"+values.Encode(), code)) + } + + return isErrorCode +} + +// HTTPStatusCode asserts that a specified handler returns a specified status code. +// +// assert.HTTPStatusCode(t, myHandler, "GET", "/notImplemented", nil, 501) +// +// Returns whether the assertion was successful (true) or not (false). 
+func HTTPStatusCode(t TestingT, handler http.HandlerFunc, method, url string, values url.Values, statuscode int, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + code, err := httpCode(handler, method, url, values) + if err != nil { + Fail(t, fmt.Sprintf("Failed to build test request, got error: %s", err)) + } + + successful := code == statuscode + if !successful { + Fail(t, fmt.Sprintf("Expected HTTP status code %d for %q but received %d", statuscode, url+"?"+values.Encode(), code)) + } + + return successful +} + +// HTTPBody is a helper that returns HTTP body of the response. It returns +// empty string if building a new request fails. +func HTTPBody(handler http.HandlerFunc, method, url string, values url.Values) string { + w := httptest.NewRecorder() + req, err := http.NewRequest(method, url+"?"+values.Encode(), nil) + if err != nil { + return "" + } + handler(w, req) + return w.Body.String() +} + +// HTTPBodyContains asserts that a specified handler returns a +// body that contains a string. +// +// assert.HTTPBodyContains(t, myHandler, "GET", "www.google.com", nil, "I'm Feeling Lucky") +// +// Returns whether the assertion was successful (true) or not (false). +func HTTPBodyContains(t TestingT, handler http.HandlerFunc, method, url string, values url.Values, str interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + body := HTTPBody(handler, method, url, values) + + contains := strings.Contains(body, fmt.Sprint(str)) + if !contains { + Fail(t, fmt.Sprintf("Expected response body for \"%s\" to contain \"%s\" but found \"%s\"", url+"?"+values.Encode(), str, body)) + } + + return contains +} + +// HTTPBodyNotContains asserts that a specified handler returns a +// body that does not contain a string. +// +// assert.HTTPBodyNotContains(t, myHandler, "GET", "www.google.com", nil, "I'm Feeling Lucky") +// +// Returns whether the assertion was successful (true) or not (false). 
+func HTTPBodyNotContains(t TestingT, handler http.HandlerFunc, method, url string, values url.Values, str interface{}, msgAndArgs ...interface{}) bool { + if h, ok := t.(tHelper); ok { + h.Helper() + } + body := HTTPBody(handler, method, url, values) + + contains := strings.Contains(body, fmt.Sprint(str)) + if contains { + Fail(t, fmt.Sprintf("Expected response body for \"%s\" to NOT contain \"%s\" but found \"%s\"", url+"?"+values.Encode(), str, body)) + } + + return !contains +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 717873c366..cbe311bb66 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,3 +1,9 @@ +# github.com/Masterminds/semver v1.5.0 +## explicit +github.com/Masterminds/semver +# github.com/NVIDIA/go-dcgm v0.0.0-20240118201113-3385e277e49f +## explicit; go 1.21 +github.com/NVIDIA/go-dcgm/pkg/dcgm # github.com/NVIDIA/go-nvml v0.12.0-1 ## explicit; go 1.15 github.com/NVIDIA/go-nvml/pkg/dl @@ -11,6 +17,9 @@ github.com/aquasecurity/libbpfgo # github.com/beorn7/perks v1.0.1 ## explicit; go 1.11 github.com/beorn7/perks/quantile +# github.com/bits-and-blooms/bitset v1.13.0 +## explicit; go 1.16 +github.com/bits-and-blooms/bitset # github.com/cespare/xxhash/v2 v2.2.0 ## explicit; go 1.11 github.com/cespare/xxhash/v2 @@ -248,6 +257,9 @@ github.com/opencontainers/runtime-spec/specs-go # github.com/pkg/errors v0.9.1 ## explicit github.com/pkg/errors +# github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 +## explicit +github.com/pmezard/go-difflib/difflib # github.com/prometheus/client_golang v1.17.0 ## explicit; go 1.19 github.com/prometheus/client_golang/api @@ -283,6 +295,9 @@ github.com/sirupsen/logrus # github.com/spf13/pflag v1.0.5 ## explicit; go 1.12 github.com/spf13/pflag +# github.com/stretchr/testify v1.8.4 +## explicit; go 1.20 +github.com/stretchr/testify/assert # golang.org/x/exp v0.0.0-20231006140011-7918f672742d ## explicit; go 1.20 golang.org/x/exp/constraints