diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index d45804f..f4de3d8 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -17,7 +17,7 @@ Please delete options that are not relevant. # Checklist: - [ ] My code follows the style guidelines of this project. See [contributing-guidelines.md](./../contributing-guidelines.md) -- [ ] Existing workload examples run after my changes (if applicable) +- [ ] Existing workload examples run to completion after my changes (if applicable) - [ ] I have performed a self-review of my code - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have made corresponding changes to the documentation diff --git a/.vscode/launch.json b/.vscode/launch.json index f687d18..5e68728 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -6,8 +6,8 @@ "type": "go", "request": "launch", "mode": "debug", - "program": "${workspaceFolder}", - "args": ["submit", "-p", "workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode", "--ray", "-g", "4", "--dry-run"], + "program": "${workspaceFolder}/cmd/cli/main.go", + "args": ["submit", "-p", "${workspaceFolder}/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode", "--ray", "-g", "4", "--dry-run", "--storage=100Gi,longhorn"], "env": { "GO111MODULE": "on" }, @@ -18,8 +18,8 @@ "type": "go", "request": "launch", "mode": "debug", - "program": "${workspaceFolder}", - "args": ["serve", "-p", "workloads/inference/LLMs/online-inference/vllm-online-single-multinode", "--ray", "--replicas", "1", "--gpus-per-replica", "4", "--dry-run"], + "program": "${workspaceFolder}/cmd/cli/main.go", + "args": ["serve", "-p", "${workspaceFolder}/workloads/inference/LLMs/online-inference/vllm-online-single-multinode", "--ray", "--replicas", "1", "--gpus-per-replica", "4", "--dry-run"], "env": { "GO111MODULE": "on" }, @@ -30,8 +30,8 @@ "type": "go", "request": "launch", "mode": "debug", - "program": "${workspaceFolder}", - "args": ["submit", "-p", "workloads/training/LLMs/bert/hf-accelerate-bert", "-g", "4", "--dry-run"], + "program": "${workspaceFolder}/cmd/cli/main.go", + "args": ["submit", "-p", "${workspaceFolder}/workloads/training/LLMs/bert/hf-accelerate-bert", "-g", "4", "--dry-run"], "env": { "GO111MODULE": "on" }, @@ -42,7 +42,7 @@ "type": "go", "request": "launch", "mode": "debug", - "program": "${workspaceFolder}", + "program": "${workspaceFolder}/cmd/cli/main.go", "args": ["submit", "-i", "ghcr.io/silogen/rocm-ray:v0.4", "-g", "4"], "env": { "GO111MODULE": "on" @@ -54,8 +54,8 @@ "type": "go", "request": "launch", "mode": "debug", - "program": "${workspaceFolder}", - "args": ["submit", "-p", "workloads/training/LLMs/lora-supervised-finetuning/ds-zero3-single-multinode", "--ray", "-g", "4"], + "program": "${workspaceFolder}/cmd/cli/main.go", + "args": ["submit", "-p", "${workspaceFolder}/workloads/training/LLMs/lora-supervised-finetuning/ds-zero3-single-multinode", "--ray", "-g", "4"], "env": { "GO111MODULE": "on" }, @@ -66,7 +66,7 @@ "type": "go", "request": "launch", "mode": "debug", - "program": "${workspaceFolder}", + "program": "${workspaceFolder}/cmd/cli/main.go", "args": ["monitor", "deployment/avsuni-gpu-monitoring", "-n", "av-test"], "env": { "GO111MODULE": "on" diff --git a/README.md b/README.md index 8071cbc..5030d3a 100644 --- a/README.md +++ b/README.md @@ -243,6 +243,22 @@ You can access this in the template via {{ .Custom.parent.child }} ``` +### Storage + +You can use the Kaiwo 
CLI to instruct a workload to use storage from a given storage class. If you do not provide these values to the CLI, the following defaults are used:
+
+* The storage class name is read from the specified namespace's label `kaiwo-cli/default-storage-class-name`
+* The storage amount is read from the specified namespace's label `kaiwo-cli/default-storage-quantity`
+
+If these labels do not exist, the CLI exits with an error. If you are using the cluster-admins examples from this repository, you can modify the namespace at [cluster-admins/kueue/cluster-queue.yaml](cluster-admins/kueue/cluster-queue.yaml) and add these labels. If you want to skip adding storage, you must explicitly add the `--no-storage` flag.
+
+To specify storage, you can use the flags:
+
+* `--storage=2Gi` to specify the amount of storage and use the default storage class name from the namespace labels
+* `--storage=2Gi,mystorageclass` to specify both the amount of storage and the storage class name
+
+Note that the storage created is ephemeral and meant for caching: it is removed when the underlying pods are removed. However, because the ephemeral storage is provisioned via a storage class, the space requested is available and reserved for all pods before the workload starts.
+
 ## Interacting with workloads
 
 While Kaiwo's primary purpose is to deploy workloads, it can also be used as a light tool to discover and interact with running workloads.
diff --git a/cluster-admins/kueue/cluster-queue.yaml b/cluster-admins/kueue/cluster-queue.yaml
index 768e137..a6cc4ac 100644
--- a/cluster-admins/kueue/cluster-queue.yaml
+++ b/cluster-admins/kueue/cluster-queue.yaml
@@ -5,7 +5,7 @@ metadata:
 spec:
   namespaceSelector: {} # match all.
   resourceGroups:
-  - coveredResources: ["cpu", "memory", "amd.com/gpu", "ephemeral-storage"]
+  - coveredResources: ["cpu", "memory", "amd.com/gpu"]
     flavors:
     - name: base-gpu-flavour
       resources:
@@ -15,5 +15,3 @@ spec:
         nominalQuota: 1800Gi
       - name: "amd.com/gpu"
         nominalQuota: 16
-      - name: "ephemeral-storage"
-        nominalQuota: 2000Gi
diff --git a/pkg/cli/apply/run.go b/pkg/cli/apply/run.go
index 2c6524e..8724e41 100644
--- a/pkg/cli/apply/run.go
+++ b/pkg/cli/apply/run.go
@@ -22,6 +22,11 @@ import (
 	"strconv"
 	"strings"
 
+	"k8s.io/apimachinery/pkg/api/errors"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+
 	"github.com/sirupsen/logrus"
 	corev1 "k8s.io/api/core/v1"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -84,15 +89,50 @@ func RunApply(workload workloads.Workload, workloadMeta any) error {
 	}
 
 	// Prepare scheduling flags
-	dynamicClient, err := k8s.GetClient()
+	clients, err := k8s.GetKubernetesClients()
 	if err != nil {
-		return fmt.Errorf("error fetching Kubernetes client: %w", err)
+		return fmt.Errorf("error getting k8s clients: %w", err)
 	}
 	ctx := context.TODO()
 
-	schedulingFlags := GetSchedulingFlags()
+	schedulingFlags, err := GetSchedulingFlags()
+	if err != nil {
+		return fmt.Errorf("error getting scheduling flags: %w", err)
+	}
+
+	if schedulingFlags.Storage != nil {
+		if schedulingFlags.Storage.StorageClassName == "" || schedulingFlags.Storage.Quantity == "" {
+			logrus.Info("Storage class name and/or quantity not provided, checking namespace labels for defaults")
+
+			defaultStorageFlags, err := findDefaultStorageFlags(ctx, *clients.Clientset, metaFlags.Namespace)
+			if err != nil {
+				return fmt.Errorf("error checking for storage defaults: %w", err)
+			}
+
+			if schedulingFlags.Storage.StorageClassName == "" {
+				if 
defaultStorageFlags.StorageClassName == "" { + return fmt.Errorf("storage requested, but no storage class name provided and no default exists in the namespace '%s' label '%s'", metaFlags.Namespace, workloads.KaiwoDefaultStorageClassNameLabel) + } + schedulingFlags.Storage.StorageClassName = defaultStorageFlags.StorageClassName + } + if schedulingFlags.Storage.Quantity == "" { + if defaultStorageFlags.Quantity == "" { + return fmt.Errorf("storage requested, but no quantity provided and no default exists in the namespace '%s' label '%s'", metaFlags.Namespace, workloads.KaiwoDefaultStorageQuantityLabel) + } + schedulingFlags.Storage.Quantity = defaultStorageFlags.Quantity + } + } + + storageClassExists, err := doesStorageClassExist(ctx, *clients.Clientset, schedulingFlags.Storage.StorageClassName) + if err != nil { + return fmt.Errorf("error checking if storage class exists: %w", err) + } + if !storageClassExists { + return fmt.Errorf("storage class '%s' does not exist", schedulingFlags.Storage.StorageClassName) + } + } - if err := fillSchedulingFlags(ctx, dynamicClient, &schedulingFlags, execFlags.ResourceFlavorGpuNodeLabelKey, metaFlags.EnvVars); err != nil { + if err := fillSchedulingFlags(ctx, clients.Client, schedulingFlags, execFlags.ResourceFlavorGpuNodeLabelKey, metaFlags.EnvVars); err != nil { return fmt.Errorf("error filling scheduling flags: %w", err) } logrus.Debugf("Successfully loaded scheduling info from Kubernetes") @@ -106,18 +146,59 @@ func RunApply(workload workloads.Workload, workloadMeta any) error { WorkloadMeta: workloadMeta, Workload: workloadConfig, Meta: metaFlags, - Scheduling: schedulingFlags, + Scheduling: *schedulingFlags, Custom: customConfig, } // Apply the workload - if err := workloads.ApplyWorkload(ctx, dynamicClient, workload, execFlags, templateContext); err != nil { + if err := workloads.ApplyWorkload(ctx, clients.Client, workload, execFlags, templateContext); err != nil { return fmt.Errorf("error applying workload: %w", err) } return nil } +func findDefaultStorageFlags(ctx context.Context, clientset kubernetes.Clientset, namespace string) (*workloads.StorageSchedulingFlags, error) { + namespaceObject, err := clientset.CoreV1().Namespaces().Get(ctx, namespace, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + logrus.Warnf("Namespace does not exist, cannot check for storage defaults. 
Either ensure that the namespace exists and has default values, specify the storage class and amount explicitly, or specify --no-storage to skip adding storage.") + return nil, fmt.Errorf("failed to find default storage class or quantity for namespace that does not exist: %s", namespace) + } + return nil, fmt.Errorf("error getting namespace: %w", err) + } + + flags := &workloads.StorageSchedulingFlags{} + + defaultStorageClassName, ok := namespaceObject.Labels[workloads.KaiwoDefaultStorageClassNameLabel] + if ok { + logrus.Debugf("Default storage class discovered: %s", defaultStorageClassName) + flags.StorageClassName = defaultStorageClassName + } else { + logrus.Debugf("Default storage class not found") + } + defaultStorageQuantity, ok := namespaceObject.Labels[workloads.KaiwoDefaultStorageQuantityLabel] + if ok { + logrus.Debugf("Default storage quantity discovered: %s", defaultStorageQuantity) + flags.Quantity = defaultStorageQuantity + } else { + logrus.Debugf("Default storage quantity not found") + } + + return flags, nil +} + +func doesStorageClassExist(ctx context.Context, clientset kubernetes.Clientset, storageClassName string) (bool, error) { + _, err := clientset.StorageV1().StorageClasses().Get(ctx, storageClassName, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + return false, nil + } + return false, fmt.Errorf("error getting storage class %s: %w", storageClassName, err) + } + return true, nil +} + // loadCustomConfig loads custom configuration data from a file func loadCustomConfig(path string) (any, error) { logrus.Debugln("Loading custom config") @@ -196,11 +277,11 @@ func fillSchedulingFlags( if schedulingFlags.RequestedReplicas > 0 && schedulingFlags.RequestedGPUsPerReplica > 0 { if schedulingFlags.RequestedGPUsPerReplica > schedulingFlags.GPUsAvailablePerNode { - return fmt.Errorf("You requested %d GPUs per replica, but there are only %d GPUs available per node", + return fmt.Errorf("you requested %d GPUs per replica, but there are only %d GPUs available per node", schedulingFlags.RequestedGPUsPerReplica, schedulingFlags.GPUsAvailablePerNode) } if schedulingFlags.TotalRequestedGPUs > 0 { - return fmt.Errorf("Cannot set requested gpus with --gpus when --replicas and --gpus-per-replica are set") + return fmt.Errorf("cannot set requested gpus with --gpus when --replicas and --gpus-per-replica are set") } schedulingFlags.CalculatedNumReplicas = schedulingFlags.RequestedReplicas schedulingFlags.CalculatedGPUsPerReplica = schedulingFlags.RequestedGPUsPerReplica diff --git a/pkg/cli/apply/serve.go b/pkg/cli/apply/serve.go index d36bcb6..91ad40e 100644 --- a/pkg/cli/apply/serve.go +++ b/pkg/cli/apply/serve.go @@ -33,6 +33,9 @@ func BuildServeCmd() *cobra.Command { Use: "serve", Short: "Serve a deployment process", PersistentPreRunE: func(cmd *cobra.Command, args []string) error { + if err := cmd.Parent().PersistentPreRunE(cmd, args); err != nil { + return err + } return PreRunLoadConfig(cmd, args) }, RunE: func(cmd *cobra.Command, args []string) error { diff --git a/pkg/cli/apply/submit.go b/pkg/cli/apply/submit.go index 41fe365..c3b39ef 100644 --- a/pkg/cli/apply/submit.go +++ b/pkg/cli/apply/submit.go @@ -35,6 +35,9 @@ func BuildSubmitCmd() *cobra.Command { Use: "submit", Short: "Submit a job", PersistentPreRunE: func(cmd *cobra.Command, args []string) error { + if err := cmd.Parent().PersistentPreRunE(cmd, args); err != nil { + return err + } return PreRunLoadConfig(cmd, args) }, RunE: func(cmd *cobra.Command, args []string) error { diff --git 
a/pkg/cli/apply/utils.go b/pkg/cli/apply/utils.go index a533631..2311538 100644 --- a/pkg/cli/apply/utils.go +++ b/pkg/cli/apply/utils.go @@ -18,6 +18,9 @@ import ( "fmt" "os" "path/filepath" + "strings" + + "k8s.io/apimachinery/pkg/api/resource" "github.com/sirupsen/logrus" "github.com/spf13/cobra" @@ -67,7 +70,7 @@ func GetExecFlags() workloads.ExecFlags { const ( defaultNamespace = "kaiwo" - defaultImage = "ghcr.io/silogen/rocm-ray:v0.5" + defaultImage = "ghcr.io/silogen/rocm-ray:v0.6" ) var ( @@ -103,6 +106,8 @@ var ( gpus int replicas int gpusPerReplica int + storage string + noStorage bool ) // AddSchedulingFlags adds flags related to (Kueue) scheduling @@ -110,15 +115,73 @@ func AddSchedulingFlags(cmd *cobra.Command) { cmd.Flags().IntVarP(&gpus, "gpus", "g", 0, "Number of GPUs requested for the workload") cmd.Flags().IntVarP(&replicas, "replicas", "", 0, "Number of replicas requested for the workload") cmd.Flags().IntVarP(&gpusPerReplica, "gpus-per-replica", "", 0, "Number of GPUs requested per replica") + cmd.Flags().StringVarP( + &storage, + "storage", + "", + "default", + "Storage requested for the workload, use: --storage=storageQuantity,storageClassName, --storage=storageQuantity to use the default storage class, or --storage=default (the default) to use defaults for both storage class and amount. "+ + fmt.Sprintf("The default storage class and amount can be configured in the namespace's labels (keys %s and %s). ", workloads.KaiwoDefaultStorageClassNameLabel, workloads.KaiwoDefaultStorageQuantityLabel)+ + "If you do not want to include storage, you must pass --no-storage explicitly.", + ) + cmd.Flags().BoolVarP(&noStorage, "no-storage", "", false, "Don't use storage for the workload") } // GetSchedulingFlags initializes the scheduling flags with the number of GPUs requested -func GetSchedulingFlags() workloads.SchedulingFlags { - return workloads.SchedulingFlags{ +func GetSchedulingFlags() (*workloads.SchedulingFlags, error) { + flags := &workloads.SchedulingFlags{ TotalRequestedGPUs: gpus, RequestedReplicas: replicas, RequestedGPUsPerReplica: gpusPerReplica, } + + if storage != "default" && noStorage { + return nil, fmt.Errorf("you must specify --storage or --no-storage, not both") + } + + if noStorage { + logrus.Info("No storage requested for workload") + return flags, nil + } + + if storage == "" { + return nil, fmt.Errorf("you must specify --storage or --no-storage") + } + + requestedStorage := "" + storageClassName := "" + + if storage != "default" { + split := strings.Split(storage, ",") + + if len(split) > 2 { + return nil, fmt.Errorf("invalid storage specifier %s", storage) + } + if len(split) > 1 { + storageClassName = split[1] + logrus.Infof("Requested storage class name %s", storageClassName) + } else { + logrus.Info("You did not pass a storage class name, the default storage class will be used if it exists") + } + if len(split) > 0 { + requestedStorage = split[0] + + if _, err := resource.ParseQuantity(requestedStorage); err != nil { + return nil, fmt.Errorf("invalid storage quantity %s", requestedStorage) + } + + logrus.Infof("Requested storage %s", requestedStorage) + } else { + logrus.Infof("You did not pass a storage quantity, the default amount (%s) will be used", requestedStorage) + } + } + + flags.Storage = &workloads.StorageSchedulingFlags{ + Quantity: requestedStorage, + StorageClassName: storageClassName, + } + + return flags, nil } type Config struct { @@ -192,7 +255,7 @@ func ApplyConfigToFlags(cmd *cobra.Command, config *Config) { 
setFlag("gpus-per-replica", fmt.Sprintf("%d", config.RequestedGPUsPerReplica)) } -func PreRunLoadConfig(cmd *cobra.Command, args []string) error { +func PreRunLoadConfig(cmd *cobra.Command, _ []string) error { if path == "" { return nil } diff --git a/pkg/tui/list/pod/select.go b/pkg/tui/list/pod/select.go index f62a33e..e944719 100644 --- a/pkg/tui/list/pod/select.go +++ b/pkg/tui/list/pod/select.go @@ -148,7 +148,7 @@ func runSelectAndDoAction(_ context.Context, _ k8s.KubernetesClients, state *tui var ( viewLogsAction runAction = "View logs" - monitorAction runAction = "Monitor" + monitorAction runAction = "Monitor GPUs" commandAction runAction = "Run command" ) diff --git a/pkg/tui/list/workload/delete.go b/pkg/tui/list/workload/delete.go index 8229172..677bf63 100644 --- a/pkg/tui/list/workload/delete.go +++ b/pkg/tui/list/workload/delete.go @@ -32,7 +32,9 @@ import ( func runDeleteWorkload(ctx context.Context, clients k8s.KubernetesClients, state *tuicomponents.RunState) (tuicomponents.StepResult, tuicomponents.RunStep[tuicomponents.RunState], error) { confirmDelete := false - resourceDescription := fmt.Sprintf("Confirm that you want to delete the %s workload '%s' in namespace %s", state.WorkloadType, state.WorkloadReference.GetName(), state.Namespace) + resourceDescription := fmt.Sprintf("Confirm that you want to delete the %s workload '%s' in namespace %s. "+ + "This will also remove any linked resources, such as automatically created PVCs and ConfigMaps", + state.WorkloadType, state.WorkloadReference.GetName(), state.Namespace) f := huh.NewForm(huh.NewGroup(huh.NewConfirm().Title(resourceDescription).Value(&confirmDelete))) diff --git a/pkg/workloads/apply.go b/pkg/workloads/apply.go index 7dda876..30260e9 100644 --- a/pkg/workloads/apply.go +++ b/pkg/workloads/apply.go @@ -49,8 +49,12 @@ func ApplyWorkload( ) error { var resources []runtime.Object + var namespaceResource *corev1.Namespace + var configMapResource *corev1.ConfigMap + var err error + if execFlags.CreateNamespace { - namespaceResource, err := generateNamespaceManifestIfNotExists(ctx, k8sClient, templateContext.Meta.Namespace) + namespaceResource, err = generateNamespaceManifestIfNotExists(ctx, k8sClient, templateContext.Meta.Namespace) if err != nil { return fmt.Errorf("failed to generate namespace resource: %w", err) } @@ -60,7 +64,7 @@ func ApplyWorkload( } if execFlags.Path != "" { - configMapResource, err := generateConfigMapManifest(execFlags.Path, workload, templateContext.Meta) + configMapResource, err = generateConfigMapManifest(execFlags.Path, workload, templateContext.Meta) if err != nil { return fmt.Errorf("failed to generate configmap resource: %w", err) } @@ -75,14 +79,17 @@ func ApplyWorkload( return fmt.Errorf("failed to get workload template: %w", err) } - templateResources, err := generateManifests(k8sClient, workloadTemplate, templateContext, workload) + workloadResource, err := generateWorkloadManifest(workloadTemplate, templateContext, workload) if err != nil { - return fmt.Errorf("Check workload type. Failed to generate manifests: %w", err) + return fmt.Errorf("check workload type, failed to generate manifests: %w", err) } - if len(templateResources) == 0 { - return fmt.Errorf("failed to generate manifests: no resources found") + + additionalWorkloadManifests, err := workload.GenerateAdditionalResourceManifests(k8sClient, templateContext) + if err != nil { + return fmt.Errorf("failed to generate additional resource manifests: %w", err) } - resources = append(resources, templateResources...) 
+ resources = append(resources, workloadResource) + resources = append(resources, additionalWorkloadManifests...) s, err := k8s.GetScheme() if err != nil { @@ -91,11 +98,42 @@ func ApplyWorkload( if execFlags.DryRun { printResources(&s, resources) - } else { - if err := applyResources(resources, ctx, k8sClient); err != nil { - return fmt.Errorf("failed to apply resources: %w", err) + return nil + } + + if err := applyResources(resources, ctx, k8sClient); err != nil { + return fmt.Errorf("failed to apply resources: %w", err) + } + + scheme, err := k8s.GetScheme() + if err != nil { + return fmt.Errorf("failed to get k8s scheme: %w", err) + } + + if configMapResource != nil { + logrus.Debug("Config map is set, linking it to the workload") + + owner := workloadResource.DeepCopyObject().(client.Object) + err := k8sClient.Get(ctx, client.ObjectKey{Name: owner.GetName(), Namespace: owner.GetNamespace()}, owner) + if err != nil { + return fmt.Errorf("failed to fetch owner resource %s/%s: %w", owner.GetNamespace(), owner.GetName(), err) + } + + // Ensure the UID is available + if owner.GetUID() == "" { + return fmt.Errorf("owner resource %s/%s has no valid UID", owner.GetNamespace(), owner.GetName()) + } + workloadResource = owner + } + + // Attach config map and PVC to the workload, if they are defined + if configMapResource != nil { + logrus.Debug("Updating the config map's owner reference") + if err := updateOwnerReference(ctx, k8sClient, configMapResource, workloadResource, &scheme); err != nil { + return fmt.Errorf("failed to update owner reference of config map: %w", err) } } + return nil } @@ -173,8 +211,8 @@ func generateConfigMapManifest(path string, workload Workload, metaConfig MetaFl return nil, nil } -// generateManifests prepares a list of Kubernetes manifests to apply -func generateManifests(k8sClient client.Client, workloadTemplate []byte, templateContext WorkloadTemplateConfig, workload Workload) ([]runtime.Object, error) { +// generateWorkloadManifest prepares the main workload manifest +func generateWorkloadManifest(workloadTemplate []byte, templateContext WorkloadTemplateConfig, workload Workload) (client.Object, error) { parsedTemplate, err := template.New("main").Funcs(sprig.TxtFuncMap()).Parse(string(workloadTemplate)) if err != nil { return nil, fmt.Errorf("failed to parse template: %w", err) @@ -203,18 +241,13 @@ func generateManifests(k8sClient client.Client, workloadTemplate []byte, templat return nil, fmt.Errorf("failed to convert manifest, ensure it is of the correct type") } - additionalWorkloadManifests, err := workload.GenerateAdditionalResourceManifests(k8sClient, templateContext) - if err != nil { - return nil, fmt.Errorf("failed to generate additional resource manifests: %w", err) - } - - return append(additionalWorkloadManifests, []runtime.Object{converted}...), nil + return converted, nil } // printResources prints each Kubernetes manifest in an array func printResources(s *runtime.Scheme, resources []runtime.Object) { - for _, resource := range resources { - clientObject := resource.(client.Object) + for _, resource_ := range resources { + clientObject := resource_.(client.Object) cleanedResource, err := k8s.MinimalizeAndConvertToYAML(s, clientObject) if err != nil { @@ -229,11 +262,11 @@ func printResources(s *runtime.Scheme, resources []runtime.Object) { // applyResources applies (creates or updates if possible) each Kubernetes object within an array func applyResources(resources []runtime.Object, ctx context.Context, k8sClient client.Client) error { - for _, 
resource := range resources { + for _, resource_ := range resources { // Ensure the resource implements client.Object - obj, ok := resource.(client.Object) + obj, ok := resource_.(client.Object) if !ok { - return fmt.Errorf("resource does not implement client.Object: %T", resource) + return fmt.Errorf("resource does not implement client.Object: %T", resource_) } // Access metadata for logging @@ -242,7 +275,7 @@ func applyResources(resources []runtime.Object, ctx context.Context, k8sClient c return fmt.Errorf("failed to access metadata for resource: %w", err) } - logrus.Debugf("Applying resource %T: %s/%s", resource, objMeta.GetNamespace(), objMeta.GetName()) + logrus.Debugf("Applying resource %T: %s/%s", resource_, objMeta.GetNamespace(), objMeta.GetName()) // Check if the resource exists key := client.ObjectKey{ @@ -250,7 +283,7 @@ func applyResources(resources []runtime.Object, ctx context.Context, k8sClient c Name: objMeta.GetName(), } - existing := resource.DeepCopyObject().(client.Object) + existing := resource_.DeepCopyObject().(client.Object) err = k8sClient.Get(ctx, key, existing) @@ -267,26 +300,53 @@ func applyResources(resources []runtime.Object, ctx context.Context, k8sClient c return fmt.Errorf("failed to create resource %s/%s: %w", objMeta.GetNamespace(), objMeta.GetName(), err) } - logrus.Infof("resource %s/%s created successfully", objMeta.GetNamespace(), objMeta.GetName()) + logrus.Infof("resource %T: %s/%s created successfully", resource_, objMeta.GetNamespace(), objMeta.GetName()) + + } + logrus.Info("To monitor and manage your workloads interactively, run $ kaiwo list -n mynamespace") + + return nil +} + +func updateOwnerReference(ctx context.Context, k8sClient client.Client, dependent client.Object, owner client.Object, scheme *runtime.Scheme) error { + // Fetch the latest version of the dependent object (PVC or Namespace) + existing := dependent.DeepCopyObject().(client.Object) + err := k8sClient.Get(ctx, client.ObjectKey{Name: existing.GetName(), Namespace: existing.GetNamespace()}, existing) + if err != nil { + return fmt.Errorf("failed to fetch existing resource %s/%s: %w", existing.GetNamespace(), existing.GetName(), err) + } - continue + gvk := owner.GetObjectKind().GroupVersionKind() + if gvk.Empty() { + // Fetch GVK from the scheme if not set + gvks, _, err := scheme.ObjectKinds(owner) + if err != nil || len(gvks) == 0 { + return fmt.Errorf("failed to determine GVK for owner: %w", err) + } + gvk = gvks[0] // Use the first GVK found + } - // TODO: Rethink update logic which now fails with "immutable field" errors - // Resource already exists, update it - // existing, err := c.Resource(gvr).Namespace(namespace).Get(ctx, resource.GetName(), metav1.GetOptions{}) - // if err != nil { - // return fmt.Errorf("failed to get existing %s/%s: %w", resource.GetKind(), resource.GetName(), err) - // } + // Set OwnerReference + ownerRef := metav1.OwnerReference{ + APIVersion: gvk.GroupVersion().String(), + Kind: gvk.Kind, + Name: owner.GetName(), + UID: owner.GetUID(), + Controller: boolPtr(true), + BlockOwnerDeletion: boolPtr(true), + } - // resource.SetResourceVersion(existing.GetResourceVersion()) - // _, err = c.Resource(gvr).Namespace(namespace).Update(ctx, resource, metav1.UpdateOptions{}) - // if err != nil { - // return fmt.Errorf("failed to update %s/%s: %w", resource.GetKind(), resource.GetName(), err) - // } + existing.SetOwnerReferences([]metav1.OwnerReference{ownerRef}) - // logrus.Infof("%s/%s updated successfully", resource.GetKind(), resource.GetName()) + // 
Update the dependent resource with new OwnerReference + err = k8sClient.Update(ctx, existing) + if err != nil { + return fmt.Errorf("failed to update owner reference for %s/%s: %w", existing.GetNamespace(), existing.GetName(), err) } - logrus.Info("To monitor and manage your workloads interactively, run $ kaiwo list -n mynamespace") + logrus.Debugf("Updated OwnerReference for %s/%s\n", existing.GetNamespace(), existing.GetName()) return nil } + +// Helper function for boolean pointer +func boolPtr(b bool) *bool { return &b } diff --git a/pkg/workloads/config.go b/pkg/workloads/config.go index b2e0b12..7c4ea5c 100644 --- a/pkg/workloads/config.go +++ b/pkg/workloads/config.go @@ -21,9 +21,11 @@ import ( ) const ( - KaiwoconfigFilename = "kaiwoconfig" - EnvFilename = "env" - KaiwoUsernameLabel = "kaiwo-cli/username" + KaiwoconfigFilename = "kaiwoconfig" + EnvFilename = "env" + KaiwoUsernameLabel = "kaiwo-cli/username" + KaiwoDefaultStorageClassNameLabel = "kaiwo-cli/default-storage-class-name" + KaiwoDefaultStorageQuantityLabel = "kaiwo-cli/default-storage-quantity" ) // WorkloadTemplateConfig is the config context that is passed to the workload templates @@ -61,6 +63,13 @@ type SchedulingFlags struct { // CalculatedNumReplicas refers to the number of replicas, calculated from the available GPUs per node CalculatedNumReplicas int + + Storage *StorageSchedulingFlags +} + +type StorageSchedulingFlags struct { + Quantity string + StorageClassName string } // MetaFlags contain flags that are shared by all workloads diff --git a/pkg/workloads/deployments/deployment.go b/pkg/workloads/deployments/deployment.go index ac0e667..b65edd9 100644 --- a/pkg/workloads/deployments/deployment.go +++ b/pkg/workloads/deployments/deployment.go @@ -62,7 +62,7 @@ func (deployment Deployment) GenerateTemplateContext(execFlags workloads.ExecFla return DeploymentFlags{Entrypoint: entrypoint}, nil } -func (deployment Deployment) ConvertObject(object runtime.Object) (runtime.Object, bool) { +func (deployment Deployment) ConvertObject(object runtime.Object) (client.Object, bool) { obj, ok := object.(*appsv1.Deployment) return obj, ok } diff --git a/pkg/workloads/deployments/deployment.yaml.tmpl b/pkg/workloads/deployments/deployment.yaml.tmpl index 7aa1a26..9bd3af4 100644 --- a/pkg/workloads/deployments/deployment.yaml.tmpl +++ b/pkg/workloads/deployments/deployment.yaml.tmpl @@ -31,6 +31,8 @@ spec: - {{ .Workload.Entrypoint }} {{- end }} env: + - name: HF_HOME + value: /workload/.cache/huggingface {{- if .Meta.EnvVars }} {{- range .Meta.EnvVars }} {{- if .Value }} @@ -59,6 +61,8 @@ spec: amd.com/gpu: "{{ .Scheduling.TotalRequestedGPUs }}" {{ end }} volumeMounts: + - mountPath: /workload + name: {{ .Meta.Name }}-main {{- if .Meta.SecretVolumes }} {{- range .Meta.SecretVolumes }} - name: {{ .Name }} @@ -67,12 +71,28 @@ spec: {{- end }} {{- end }} {{- if .Meta.HasConfigMap }} - - mountPath: /workload - name: workload + - mountPath: /workload/mounted + name: workload-mount {{- end }} - mountPath: /dev/shm name: dshm volumes: + {{- if .Scheduling.Storage }} + - name: {{ .Meta.Name }}-main + ephemeral: + volumeClaimTemplate: + spec: + accessModes: [ "ReadWriteOnce" ] + storageClassName: {{ .Scheduling.Storage.StorageClassName }} + resources: + requests: + storage: {{ .Scheduling.Storage.Quantity }} + {{- else }} + - name: {{ .Meta.Name }}-main + emptyDir: + medium: Memory + sizeLimit: 10Mi + {{- end }} {{- if .Meta.SecretVolumes }} {{- range .Meta.SecretVolumes }} - name: {{ .Name }} @@ -84,7 +104,7 @@ spec: {{- end }} {{- 
end }} {{- if .Meta.HasConfigMap }} - - name: workload + - name: workload-mount configMap: name: {{ .Meta.Name }} {{- end }} diff --git a/pkg/workloads/jobs/job.go b/pkg/workloads/jobs/job.go index e0af869..5408643 100644 --- a/pkg/workloads/jobs/job.go +++ b/pkg/workloads/jobs/job.go @@ -68,7 +68,7 @@ func (job Job) DefaultTemplate() ([]byte, error) { return JobTemplate, nil } -func (job Job) ConvertObject(object runtime.Object) (runtime.Object, bool) { +func (job Job) ConvertObject(object runtime.Object) (client.Object, bool) { obj, ok := object.(*batchv1.Job) return obj, ok } diff --git a/pkg/workloads/jobs/job.yaml.tmpl b/pkg/workloads/jobs/job.yaml.tmpl index 02d25f3..f503782 100644 --- a/pkg/workloads/jobs/job.yaml.tmpl +++ b/pkg/workloads/jobs/job.yaml.tmpl @@ -26,6 +26,8 @@ spec: - {{ .Workload.Entrypoint }} {{- end }} env: + - name: HF_HOME + value: /workload/.cache/huggingface {{- if .Meta.EnvVars }} {{- range .Meta.EnvVars }} {{- if .Value }} @@ -50,6 +52,8 @@ spec: cpu: "{{ mul .Scheduling.TotalRequestedGPUs 4 }}" amd.com/gpu: "{{ .Scheduling.TotalRequestedGPUs }}" volumeMounts: + - mountPath: /workload + name: {{ .Meta.Name }}-main {{- if .Meta.SecretVolumes }} {{- range .Meta.SecretVolumes }} - name: {{ .Name }} @@ -58,12 +62,28 @@ spec: {{- end }} {{- end }} {{- if .Meta.HasConfigMap }} - - mountPath: /workload - name: workload + - mountPath: /workload/mounted + name: workload-mount {{- end }} - mountPath: /dev/shm name: dshm volumes: + {{- if .Scheduling.Storage }} + - name: {{ .Meta.Name }}-main + ephemeral: + volumeClaimTemplate: + spec: + accessModes: [ "ReadWriteOnce" ] + storageClassName: {{ .Scheduling.Storage.StorageClassName }} + resources: + requests: + storage: {{ .Scheduling.Storage.Quantity }} + {{- else }} + - name: {{ .Meta.Name }}-main + emptyDir: + medium: Memory + sizeLimit: 10Mi + {{- end }} {{- if .Meta.SecretVolumes }} {{- range .Meta.SecretVolumes }} - name: {{ .Name }} @@ -75,12 +95,11 @@ spec: {{- end }} {{- end }} {{- if .Meta.HasConfigMap }} - - name: workload + - name: workload-mount configMap: name: {{ .Meta.Name }} {{- end }} - name: dshm emptyDir: medium: Memory - sizeLimit: 200Gi - + sizeLimit: 200Gi \ No newline at end of file diff --git a/pkg/workloads/ray/deployment.go b/pkg/workloads/ray/deployment.go index 010b847..f8ffdd4 100644 --- a/pkg/workloads/ray/deployment.go +++ b/pkg/workloads/ray/deployment.go @@ -53,7 +53,36 @@ func (deployment Deployment) GenerateTemplateContext(execFlags workloads.ExecFla return DeploymentFlags{Serveconfig: strings.TrimSpace(string(contents))}, nil } -func (deployment Deployment) ConvertObject(object runtime.Object) (runtime.Object, bool) { +//func (deployment Deployment) BuildObject(flags workloads.WorkloadTemplateConfig) (client.Object, error) { +// obj := &rayv1.RayService{ +// ObjectMeta: metav1.ObjectMeta{ +// Name: flags.Meta.Name, +// Namespace: flags.Meta.Namespace, +// Labels: map[string]string{ +// "kaiwo-cli/username": flags.Meta.User, +// }, +// }, +// Spec: rayv1.RayServiceSpec{ +// ServeConfigV2: "", +// RayClusterSpec: rayv1.RayClusterSpec{ +// // EnableInTreeAutoscaling: true, +// HeadGroupSpec: rayv1.HeadGroupSpec{ +// RayStartParams: map[string]string{ +// "dashboard-host": "0.0.0.0", +// }, +// Template: corev1.PodTemplateSpec{}, +// }, +// }, +// }, +// } +// return obj, nil +//} +// +//func buildPodSpec(metaFlags workloads.MetaFlags, ports []corev1.ContainerPort) corev1.PodSpec { +// return corev1.PodSpec{} +//} + +func (deployment Deployment) ConvertObject(object runtime.Object) 
(client.Object, bool) { obj, ok := object.(*rayv1.RayService) return obj, ok diff --git a/pkg/workloads/ray/deployment.yaml.tmpl b/pkg/workloads/ray/deployment.yaml.tmpl index e5dc9ca..76a2eca 100644 --- a/pkg/workloads/ray/deployment.yaml.tmpl +++ b/pkg/workloads/ray/deployment.yaml.tmpl @@ -24,6 +24,8 @@ spec: image: {{ .Meta.Image }} imagePullPolicy: Always env: + - name: HF_HOME + value: /workload/.cache/huggingface {{- if .Meta.EnvVars }} {{- range .Meta.EnvVars }} {{- if .Value }} @@ -46,6 +48,8 @@ spec: cpu: "2" memory: "16Gi" volumeMounts: + - mountPath: /workload + name: {{ .Meta.Name }}-main {{- if .Meta.SecretVolumes }} {{- range .Meta.SecretVolumes }} - name: {{ .Name }} @@ -54,8 +58,8 @@ spec: {{- end }} {{- end }} {{- if .Meta.HasConfigMap }} - - mountPath: /workload/app - name: workload + - mountPath: /workload/mounted + name: workload-mount {{- end }} - mountPath: /dev/shm name: dshm @@ -69,6 +73,22 @@ spec: - containerPort: 8000 name: serve volumes: + {{- if .Scheduling.Storage }} + - name: {{ .Meta.Name }}-main + ephemeral: + volumeClaimTemplate: + spec: + accessModes: [ "ReadWriteOnce" ] + storageClassName: {{ .Scheduling.Storage.StorageClassName }} + resources: + requests: + storage: 10Mi + {{- else }} + - name: {{ .Meta.Name }}-main + emptyDir: + medium: Memory + sizeLimit: 10Mi + {{- end }} {{- if .Meta.SecretVolumes }} {{- range .Meta.SecretVolumes }} - name: {{ .Name }} @@ -80,7 +100,7 @@ spec: {{- end }} {{- end }} {{- if .Meta.HasConfigMap }} - - name: workload + - name: workload-mount configMap: name: {{ .Meta.Name }} {{- end }} @@ -109,6 +129,8 @@ spec: exec: command: ["/bin/sh", "-c", "ray stop"] env: + - name: HF_HOME + value: /workload/.cache/huggingface {{- if .Meta.EnvVars }} {{- range .Meta.EnvVars }} {{- if .Value }} @@ -133,6 +155,8 @@ spec: memory: "{{ mul .Scheduling.CalculatedGPUsPerReplica 32 }}Gi" amd.com/gpu: "{{ .Scheduling.CalculatedGPUsPerReplica }}" volumeMounts: + - mountPath: /workload + name: {{ .Meta.Name }}-main {{- if .Meta.SecretVolumes }} {{- range .SecretVolumes }} - name: {{ .Name }} @@ -141,12 +165,28 @@ spec: {{- end }} {{- end }} {{- if .Meta.HasConfigMap }} - - mountPath: /workload/app - name: workload + - mountPath: /workload/mounted + name: workload-mount {{- end }} - mountPath: /dev/shm name: dshm volumes: + {{- if .Scheduling.Storage }} + - name: {{ .Meta.Name }}-main + ephemeral: + volumeClaimTemplate: + spec: + accessModes: [ "ReadWriteOnce" ] + storageClassName: {{ .Scheduling.Storage.StorageClassName }} + resources: + requests: + storage: {{ .Scheduling.Storage.Quantity }} + {{- else }} + - name: {{ .Meta.Name }}-main + emptyDir: + medium: Memory + sizeLimit: 10Mi + {{- end }} {{- if .Meta.SecretVolumes }} {{- range .Meta.SecretVolumes }} - name: {{ .Name }} @@ -158,7 +198,7 @@ spec: {{- end }} {{- end }} {{- if .Meta.HasConfigMap }} - - name: workload + - name: workload-mount configMap: name: {{ .Meta.Name }} {{- end }} diff --git a/pkg/workloads/ray/job.go b/pkg/workloads/ray/job.go index c986013..e592213 100644 --- a/pkg/workloads/ray/job.go +++ b/pkg/workloads/ray/job.go @@ -57,7 +57,7 @@ func (job Job) DefaultTemplate() ([]byte, error) { return JobTemplate, nil } -func (job Job) ConvertObject(object runtime.Object) (runtime.Object, bool) { +func (job Job) ConvertObject(object runtime.Object) (client.Object, bool) { obj, ok := object.(*rayv1.RayJob) return obj, ok } diff --git a/pkg/workloads/ray/job.yaml.tmpl b/pkg/workloads/ray/job.yaml.tmpl index c20156c..ccb0ec4 100644 --- a/pkg/workloads/ray/job.yaml.tmpl +++ 
b/pkg/workloads/ray/job.yaml.tmpl @@ -17,6 +17,10 @@ spec: rayStartParams: {} template: spec: + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 {{- if .Meta.ImagePullSecret }} imagePullSecrets: - name: {{ .Meta.ImagePullSecret }} @@ -26,6 +30,8 @@ spec: image: {{ .Meta.Image }} imagePullPolicy: Always env: + - name: HF_HOME + value: /workload/.cache/huggingface {{- if .Meta.EnvVars }} {{- range .Meta.EnvVars }} {{- if .Value }} @@ -55,6 +61,8 @@ spec: - containerPort: 10001 name: client volumeMounts: + - mountPath: /workload + name: {{ .Meta.Name }}-main {{- if .Meta.SecretVolumes }} {{- range .Meta.SecretVolumes }} - name: {{ .Name }} @@ -63,12 +71,28 @@ spec: {{- end }} {{- end }} {{- if .Meta.HasConfigMap }} - - mountPath: /workload - name: workload + - mountPath: /workload/mounted + name: workload-mount {{- end }} - mountPath: /dev/shm name: dshm volumes: + {{- if .Scheduling.Storage }} + - name: {{ .Meta.Name }}-main + ephemeral: + volumeClaimTemplate: + spec: + accessModes: [ "ReadWriteOnce" ] + storageClassName: {{ .Scheduling.Storage.StorageClassName }} + resources: + requests: + storage: 10Mi + {{- else }} + - name: {{ .Meta.Name }}-main + emptyDir: + medium: Memory + sizeLimit: 10Mi + {{- end }} {{- if .Meta.SecretVolumes }} {{- range .Meta.SecretVolumes }} - name: {{ .Name }} @@ -80,7 +104,7 @@ spec: {{- end }} {{- end }} {{- if .Meta.HasConfigMap }} - - name: workload + - name: workload-mount configMap: name: {{ .Meta.Name }} {{- end }} @@ -97,6 +121,10 @@ spec: rayStartParams: {} template: spec: + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 {{- if .Meta.ImagePullSecret }} imagePullSecrets: - name: {{ .Meta.ImagePullSecret }} @@ -106,6 +134,8 @@ spec: image: {{ .Meta.Image }} imagePullPolicy: Always env: + - name: HF_HOME + value: /workload/.cache/huggingface {{- if .Meta.EnvVars }} {{- range .Meta.EnvVars }} {{- if .Value }} @@ -137,6 +167,8 @@ spec: memory: "{{ mul .Scheduling.CalculatedGPUsPerReplica 32 }}Gi" amd.com/gpu: "{{ .Scheduling.CalculatedGPUsPerReplica }}" volumeMounts: + - mountPath: /workload + name: {{ .Meta.Name }}-main {{- if .Meta.SecretVolumes }} {{- range .Meta.SecretVolumes }} - name: {{ .Name }} @@ -145,12 +177,28 @@ spec: {{- end }} {{- end }} {{- if .Meta.HasConfigMap }} - - mountPath: /workload - name: workload + - mountPath: /workload/mounted + name: workload-mount {{- end }} - mountPath: /dev/shm name: dshm volumes: + {{- if .Scheduling.Storage }} + - name: {{ .Meta.Name }}-main + ephemeral: + volumeClaimTemplate: + spec: + accessModes: [ "ReadWriteOnce" ] + storageClassName: {{ .Scheduling.Storage.StorageClassName }} + resources: + requests: + storage: {{ .Scheduling.Storage.Quantity }} + {{- else }} + - name: {{ .Meta.Name }}-main + emptyDir: + medium: Memory + sizeLimit: 10Mi + {{- end }} {{- if.Meta.SecretVolumes }} {{- range .Meta.SecretVolumes }} - name: {{ .Name }} @@ -162,7 +210,7 @@ spec: {{- end }} {{- end }} {{- if .Meta.HasConfigMap }} - - name: workload + - name: workload-mount configMap: name: {{ .Meta.Name }} {{- end }} diff --git a/pkg/workloads/workload.go b/pkg/workloads/workload.go index f803a62..dd88740 100644 --- a/pkg/workloads/workload.go +++ b/pkg/workloads/workload.go @@ -33,7 +33,7 @@ type Workload interface { // DefaultTemplate returns a default template to use for this workload DefaultTemplate() ([]byte, error) - ConvertObject(object runtime.Object) (runtime.Object, bool) + ConvertObject(object runtime.Object) (client.Object, bool) // IgnoreFiles lists the files that should 
be ignored in the ConfigMap IgnoreFiles() []string diff --git a/workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/entrypoint b/workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/entrypoint index 7a84f97..f20e898 100644 --- a/workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/entrypoint +++ b/workloads/inference/LLMs/offline-inference/vllm-batch-single-multinode/entrypoint @@ -1 +1 @@ -python main.py \ No newline at end of file +python mounted/main.py \ No newline at end of file diff --git a/workloads/inference/LLMs/online-inference/vllm-online-single-multinode/serveconfig b/workloads/inference/LLMs/online-inference/vllm-online-single-multinode/serveconfig index 074db37..289a8b8 100644 --- a/workloads/inference/LLMs/online-inference/vllm-online-single-multinode/serveconfig +++ b/workloads/inference/LLMs/online-inference/vllm-online-single-multinode/serveconfig @@ -1,7 +1,7 @@ applications: - name: llm route_prefix: / - import_path: app:deployment + import_path: mounted:deployment deployments: - name: VLLMDeployment autoscaling_config: diff --git a/workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode/entrypoint b/workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode/entrypoint index ce82531..77e60f8 100644 --- a/workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode/entrypoint +++ b/workloads/training/LLMs/full-parameter-pretraining/full-param-zero3-single-multinode/entrypoint @@ -1,6 +1,6 @@ -python main.py +python mounted/main.py --model-name=meta-llama/Llama-3.1-8B-Instruct ---ds-config=./zero_3_offload_optim_param.json +--ds-config=./mounted/zero_3_offload_optim_param.json --bucket=silogen-dev-ray --num-epochs=2 --num-devices=$NUM_GPUS diff --git a/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode/entrypoint b/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode/entrypoint index 64443b9..a7cdcc7 100644 --- a/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode/entrypoint +++ b/workloads/training/LLMs/lora-supervised-finetuning/lora-sft-zero3-single-multinode/entrypoint @@ -1,6 +1,7 @@ -python main.py +python mounted/main.py --model-name=meta-llama/Llama-3.1-8B-Instruct ---ds-config=./zero_3_offload_optim_param.json +--ds-config=./mounted/zero_3_offload_optim_param.json +--lora-config=./mounted/lora-llama.json --bucket=silogen-dev-ray --num-epochs=2 --lora
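
---

For reviewers, a minimal usage sketch of the storage behaviour introduced in this PR, based on the flags and label keys defined above. The namespace `kaiwo` and storage class `longhorn` are only illustrative values taken from the defaults and launch configurations in this diff; substitute your own.

```bash
# Set the namespace-level defaults that the CLI falls back to
# (label keys defined in pkg/workloads/config.go)
kubectl label namespace kaiwo \
  kaiwo-cli/default-storage-class-name=longhorn \
  kaiwo-cli/default-storage-quantity=100Gi

# Use the namespace defaults for both storage class and quantity
kaiwo submit -p workloads/training/LLMs/bert/hf-accelerate-bert -g 4

# Request 2Gi from the namespace's default storage class
kaiwo submit -p workloads/training/LLMs/bert/hf-accelerate-bert -g 4 --storage=2Gi

# Request 100Gi from an explicitly named storage class
kaiwo submit -p workloads/training/LLMs/bert/hf-accelerate-bert -g 4 --storage=100Gi,longhorn

# Opt out of the ephemeral cache volume entirely
kaiwo submit -p workloads/training/LLMs/bert/hf-accelerate-bert -g 4 --no-storage
```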