From b9600c5cd356bfd9aebd44f22cc2e0c2443421f6 Mon Sep 17 00:00:00 2001 From: peiniliu Date: Mon, 15 Aug 2022 10:24:51 +0200 Subject: [PATCH] support config gpu memory factor Signed-off-by: peiniliu --- doc/config.md | 13 ++++++++++++- main.go | 6 ++++++ pkg/apis/config.go | 1 + pkg/apis/flags.go | 6 ++++-- pkg/plugin/nvidia/server.go | 4 ++-- pkg/plugin/nvidia/utils.go | 4 ++-- volcano-device-plugin-GKE.yml | 1 + volcano-device-plugin.yml | 1 + 8 files changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/config.md b/doc/config.md index fc49d1dc1..708b69188 100644 --- a/doc/config.md +++ b/doc/config.md @@ -7,11 +7,13 @@ The volcano device plugin has a number of options that can be configured. These | Flag | Envvar | Default Value | |--------------------------|-------------------------|-----------------| | `--gpu-strategy` | `$GPU_STRATEGY` | `"share"` | +| `--gpu-memory-factor` | `$GPU_MEMORY_FACTOR` | `1` | | `--config-file` | `$CONFIG_FILE` | `""` | when starting volcano-device-plugin.yml, users can specify these parameters by adding args to the container 'volcano-device-plugin'. For example: - args: ["--gpu-strategy=number"] will let device plugin using the gpu-number strategy + - args: ["--gpu-strategy=share","--gpu-memory-factor=10"] will let device plugin using the gpu-share strategy, and memory factor is 10MB ### As a configuration file ``` @@ -21,7 +23,7 @@ flags: ``` ### Configuration Option Details -**`GPU_STRATEGY`**: +**`GPU_STRATEGY`(string)**: the desired strategy for exposing GPU devices `[number | share ] (default 'share')` @@ -30,6 +32,15 @@ flags: on GPU devices in numbers or sharing mode. More information on what these strategies are and how to use it in Volcano can be found in Volcano scheduler. 
+**`GPU_MEMORY_FACTOR`(uint)**: + the desired memory factor for exposing GPU shared memory virtual devices + + `(default 1)` + + The `GPU_MEMORY_FACTOR` option configures the size of each GPU shared memory + virtual device exposed by the daemonset. By default each block is 1MB, + but users with large GPU memory can specify a larger factor such as 10 (10MB) or 100 (100MB). + **`CONFIG_FILE`**: point the plugin at a configuration file instead of relying on command line flags or environment variables diff --git a/main.go b/main.go index 4105be84d..8ffb495d7 100644 --- a/main.go +++ b/main.go @@ -80,6 +80,12 @@ func main() { Usage: "the default strategy is using shared GPU devices while using 'number' meaning using GPUs individually. [number| share]", EnvVars: []string{"GPU_STRATEGY"}, }, + &cli.UintFlag{ + Name: "gpu-memory-factor", + Value: 1, + Usage: "the default gpu memory block size is 1MB", + EnvVars: []string{"GPU_MEMORY_FACTOR"}, + }, &cli.StringFlag{ Name: "config-file", Usage: "the path to a config file as an alternative to command line options or environment variables", diff --git a/pkg/apis/config.go b/pkg/apis/config.go index da3772949..36bcb9f21 100644 --- a/pkg/apis/config.go +++ b/pkg/apis/config.go @@ -44,6 +44,7 @@ func NewConfig(c *cli.Context, flags []cli.Flag) (*Config, error) { } log.Println(c.String("gpu-strategy")) + log.Println(c.Uint("gpu-memory-factor")) configFile := c.String("config-file") if configFile != "" { diff --git a/pkg/apis/flags.go b/pkg/apis/flags.go index 3eab9067c..1e371d47f 100644 --- a/pkg/apis/flags.go +++ b/pkg/apis/flags.go @@ -27,11 +27,13 @@ type Flags struct { // CommandLineFlags holds the list of command line flags used to configure the device plugin and GFD. 
type CommandLineFlags struct { - GPUStrategy string `json:"GPUStrategy" yaml:"GPUStrategy"` + GPUStrategy string `json:"GPUStrategy" yaml:"GPUStrategy"` + GPUMemoryFactor uint `json:"GPUMemoryFactor" yaml:"GPUMemoryFactor"` } func NewCommandLineFlags(c *cli.Context) *CommandLineFlags { return &CommandLineFlags{ - GPUStrategy: c.String("gpu-strategy"), + GPUStrategy: c.String("gpu-strategy"), + GPUMemoryFactor: c.Uint("gpu-memory-factor"), } } diff --git a/pkg/plugin/nvidia/server.go b/pkg/plugin/nvidia/server.go index d59d3d62f..fe08b7352 100644 --- a/pkg/plugin/nvidia/server.go +++ b/pkg/plugin/nvidia/server.go @@ -96,7 +96,7 @@ func (m *NvidiaDevicePlugin) initialize() { m.health = make(chan *Device) m.stop = make(chan struct{}) - m.virtualDevices, m.devicesByIndex = GetDevices() + m.virtualDevices, m.devicesByIndex = GetDevices(m.config.Flags.GPUMemoryFactor) } func (m *NvidiaDevicePlugin) cleanup() { @@ -389,7 +389,7 @@ Allocate: response := pluginapi.ContainerAllocateResponse{ Envs: map[string]string{ VisibleDevice: strings.Trim(strings.Replace(fmt.Sprint(ids), " ", ",", -1), "[]"), - AllocatedGPUResource: fmt.Sprintf("%d", reqGPU), + AllocatedGPUResource: fmt.Sprintf("%d", reqGPU*int(m.config.Flags.GPUMemoryFactor)), TotalGPUMemory: fmt.Sprintf("%d", gpuMemory), }, } diff --git a/pkg/plugin/nvidia/utils.go b/pkg/plugin/nvidia/utils.go index 425a23658..2eafe8114 100644 --- a/pkg/plugin/nvidia/utils.go +++ b/pkg/plugin/nvidia/utils.go @@ -64,7 +64,7 @@ func GetGPUMemory() uint { } // GetDevices returns virtual devices and all physical devices by index. 
-func GetDevices() ([]*pluginapi.Device, map[uint]string) { +func GetDevices(gpuMemoryFactor uint) ([]*pluginapi.Device, map[uint]string) { n, err := nvml.GetDeviceCount() check(err) @@ -81,7 +81,7 @@ func GetDevices() ([]*pluginapi.Device, map[uint]string) { if GetGPUMemory() == uint(0) { SetGPUMemory(uint(*d.Memory)) } - for j := uint(0); j < GetGPUMemory(); j++ { + for j := uint(0); j < GetGPUMemory()/gpuMemoryFactor; j++ { fakeID := GenerateVirtualDeviceID(id, j) virtualDevs = append(virtualDevs, &pluginapi.Device{ ID: fakeID, diff --git a/volcano-device-plugin-GKE.yml b/volcano-device-plugin-GKE.yml index 53b4808a2..07940673e 100644 --- a/volcano-device-plugin-GKE.yml +++ b/volcano-device-plugin-GKE.yml @@ -92,6 +92,7 @@ spec: - image: volcanosh/volcano-device-plugin:latest name: volcano-device-plugin #args: ["--gpu-strategy=number"] + #args: ["--gpu-strategy=share", "--gpu-memory-factor=1"] env: - name: NODE_NAME valueFrom: diff --git a/volcano-device-plugin.yml b/volcano-device-plugin.yml index 7777f86b8..29b586f32 100644 --- a/volcano-device-plugin.yml +++ b/volcano-device-plugin.yml @@ -84,6 +84,7 @@ spec: containers: - image: volcanosh/volcano-device-plugin:latest #args: ["--gpu-strategy=number"] + #args: ["--gpu-strategy=share", "--gpu-memory-factor=1"] name: volcano-device-plugin env: - name: NODE_NAME