Merge remote-tracking branch 'upstream/main'

red-hat-data-services · Dec 12, 2024 · 8e0e6cb · 8e0e6cb
2 parents f140a6e + 1aece45
commit 8e0e6cb
Show file tree

Hide file tree

Showing 35 changed files with 144 additions and 135 deletions.
diff --git a/README.md b/README.md
@@ -27,11 +27,11 @@
 
     NOTE: `quay.io/modh/ray:2.35.0-py311-cu121` is the default image used for creating a RayCluster resource. If you have your own custom ray image which suits your purposes, specify it in the `CODEFLARE_TEST_RAY_IMAGE` environment variable.
 
-### Environment variables for Training operator test suite
+### Environment variables for fms-hf-tuning test suite
 
 * `FMS_HF_TUNING_IMAGE` - Image tag used in PyTorchJob CR for model training
 
-### Environment variables for Training operator GPU test suite
+### Environment variables for fms-hf-tuning GPU test suite
 
 * `TEST_NAMESPACE_NAME` (Optional) - Existing namespace where the Training operator GPU tests will be executed
 * `HF_TOKEN` - HuggingFace token used to pull models which have limited access

diff --git a/go.mod b/go.mod
@@ -7,7 +7,7 @@ toolchain go1.21.5
 require (
 	github.com/kubeflow/training-operator v1.7.0
 	github.com/onsi/gomega v1.31.1
-	github.com/project-codeflare/codeflare-common v0.0.0-20241203135025-af256802fc2d
+	github.com/project-codeflare/codeflare-common v0.0.0-20241211130338-efe4f3e6f904
 	github.com/prometheus/client_golang v1.20.4
 	github.com/prometheus/common v0.57.0
 	github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0

diff --git a/go.sum b/go.sum
@@ -365,8 +365,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/project-codeflare/appwrapper v0.8.0 h1:vWHNtXUtHutN2EzYb6rryLdESnb8iDXsCokXOuNYXvg=
 github.com/project-codeflare/appwrapper v0.8.0/go.mod h1:FMQ2lI3fz6LakUVXgN1FTdpsc3BBkNIZZgtMmM9J5UM=
-github.com/project-codeflare/codeflare-common v0.0.0-20241203135025-af256802fc2d h1:WN/cN/giLiicdGjnztRYgfR7K7biaGmPO98WdWMppos=
-github.com/project-codeflare/codeflare-common v0.0.0-20241203135025-af256802fc2d/go.mod h1:v7XKwaDoCspsHQlWJNarO7gOpR+iumSS+c1bWs3kJOI=
+github.com/project-codeflare/codeflare-common v0.0.0-20241211130338-efe4f3e6f904 h1:brU4j1V4o+z/sw0TGi360Wdjk1TEQ313ynBRGqSTaNU=
+github.com/project-codeflare/codeflare-common v0.0.0-20241211130338-efe4f3e6f904/go.mod h1:v7XKwaDoCspsHQlWJNarO7gOpR+iumSS+c1bWs3kJOI=
 github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
 github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
 github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=

diff --git a/tests/kfto/core/environment.go → tests/fms/environment.go b/tests/kfto/core/environment.go → tests/fms/environment.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package core
+package fms
 
 import (
 	"fmt"
@@ -26,10 +26,6 @@ import (
 const (
 	// The environment variable for FMS HF Tuning image to be tested
 	fmsHfTuningImageEnvVar = "FMS_HF_TUNING_IMAGE"
-	// The environment variable referring to image containing bloom-560m model
-	bloomModelImageEnvVar = "BLOOM_MODEL_IMAGE"
-	// The environment variable referring to image containing Stanford Alpaca dataset
-	alpacaDatasetImageEnvVar = "ALPACA_DATASET_IMAGE"
 	// The environment variable referring to image containing minio CLI
 	minioCliImageEnvVar = "MINIO_CLI_IMAGE"
 	// The environment variable for HuggingFace token to download models which require authentication
@@ -51,14 +47,6 @@ func GetFmsHfTuningImage(t Test) string {
 	return image
 }
 
-func GetBloomModelImage() string {
-	return lookupEnvOrDefault(bloomModelImageEnvVar, "quay.io/ksuta/bloom-560m@sha256:f6db02bb7b5d09a8d698c04994d747bfb9e581bbb4c07d00290244d207623733")
-}
-
-func GetAlpacaDatasetImage() string {
-	return lookupEnvOrDefault(alpacaDatasetImageEnvVar, "quay.io/ksuta/alpaca-dataset@sha256:2e90f631180c7b2c916f9569b914b336b612e8ae86efad82546adc5c9fcbbb8d")
-}
-
 func GetMinioCliImage() string {
 	return lookupEnvOrDefault(minioCliImageEnvVar, "quay.io/ksuta/mc@sha256:e128ce4caee276bcbfe3bd32ebb01c814f6b2eb2fd52d08ef0d4684f68c1e3d6")
 }

diff --git a/tests/kfto/core/kfto_kueue_sft_GPU_test.go → tests/fms/kfto_kueue_sft_GPU_test.go b/tests/kfto/core/kfto_kueue_sft_GPU_test.go → tests/fms/kfto_kueue_sft_GPU_test.go
@@ -14,14 +14,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package core
+package fms
 
 import (
 	"fmt"
 	"testing"
 	"time"
 
 	. "github.com/onsi/gomega"
+	"github.com/opendatahub-io/distributed-workloads/tests/kfto"
 	. "github.com/project-codeflare/codeflare-common/support"
 
 	corev1 "k8s.io/api/core/v1"
@@ -34,75 +35,76 @@ import (
 )
 
 func TestMultiGpuPytorchjobAllamBeta13bChatGptq(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_allam_beta_13b_chat_gptq.json", 2, mountModelVolumeIntoMaster)
+	runMultiGpuPytorchjob(t, "resources/config_allam_beta_13b_chat_gptq.json", 2, mountModelVolumeIntoMaster)
 }
 
 func TestMultiGpuPytorchjobGranite8bCodeInstructGptq(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_granite_8b_code_instruct_gptq.json", 2, mountModelVolumeIntoMaster)
+	runMultiGpuPytorchjob(t, "resources/config_granite_8b_code_instruct_gptq.json", 2, mountModelVolumeIntoMaster)
 }
 
 func TestMultiGpuPytorchjobGranite20bCodeInstruct(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_granite_20b_code_instruct.json", 4)
+	runMultiGpuPytorchjob(t, "resources/config_granite_20b_code_instruct.json", 4)
 }
 
 func TestMultiGpuPytorchjobGranite34bCodeBaseGptq(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_granite_34b_code_base_gptq.json", 2, mountModelVolumeIntoMaster)
+	runMultiGpuPytorchjob(t, "resources/config_granite_34b_code_base_gptq.json", 2, mountModelVolumeIntoMaster)
 }
 
 func TestMultiGpuPytorchjobGranite34bCodeInstructLoRa(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_granite_34b_code_instruct_lora.json", 4)
+	runMultiGpuPytorchjob(t, "resources/config_granite_34b_code_instruct_lora.json", 4)
 }
 
 func TestMultiGpuPytorchjobMetaLlama318b(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_meta_llama3_1_8b.json", 2)
+	runMultiGpuPytorchjob(t, "resources/config_meta_llama3_1_8b.json", 2)
 }
 
 func TestMultiGpuPytorchjobMetaLlama38bInstruct(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_meta_llama3_8b_instruct.json", 2)
+	runMultiGpuPytorchjob(t, "resources/config_meta_llama3_8b_instruct.json", 2)
 }
 
 func TestMultiGpuPytorchjobMetaLlama370bInstructGptqBlue(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_meta_llama3_70b_instruct_gptq_blue.json", 2, mountModelVolumeIntoMaster)
+	runMultiGpuPytorchjob(t, "resources/config_meta_llama3_70b_instruct_gptq_blue.json", 2, mountModelVolumeIntoMaster)
 }
 
 func TestMultiGpuPytorchjobMetaLlama31405bGptq(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_meta_llama3_1_405b_gptq.json", 8, mountModelVolumeIntoMaster)
+	runMultiGpuPytorchjob(t, "resources/config_meta_llama3_1_405b_gptq.json", 8, mountModelVolumeIntoMaster)
 }
 
 func TestMultiGpuPytorchjobMetaLlama3170bLoRa(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_meta_llama3_1_70b_lora.json", 4)
+	runMultiGpuPytorchjob(t, "resources/config_meta_llama3_1_70b_lora.json", 4)
 }
 
 func TestMultiGpuPytorchjobMetaLlama370bInstructLoRa(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_meta_llama3_70b_instruct_lora.json", 4)
+	runMultiGpuPytorchjob(t, "resources/config_meta_llama3_70b_instruct_lora.json", 4)
 }
 
 func TestMultiGpuPytorchjobMistral7bv03Gptq(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_mistral_7b_v03_gptq.json", 2, mountModelVolumeIntoMaster)
+	runMultiGpuPytorchjob(t, "resources/config_mistral_7b_v03_gptq.json", 2, mountModelVolumeIntoMaster)
 }
 func TestMultiGpuPytorchjobMistral7bv03(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_mistral_7b_v03.json", 2)
+	runMultiGpuPytorchjob(t, "resources/config_mistral_7b_v03.json", 2)
 }
 
 func TestMultiGpuPytorchjobMixtral8x7bv01(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_mixtral_8x7b_v01.json", 8)
+	runMultiGpuPytorchjob(t, "resources/config_mixtral_8x7b_v01.json", 8)
 }
 
 func TestMultiGpuPytorchjobMixtral8x7bInstructv01Gptq(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_mixtral_8x7b_instruct_v01_gptq.json", 2, mountModelVolumeIntoMaster)
+	runMultiGpuPytorchjob(t, "resources/config_mixtral_8x7b_instruct_v01_gptq.json", 2, mountModelVolumeIntoMaster)
 }
+
 func TestMultiGpuPytorchjobMixtral8x7bInstructv01LoRa(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_mixtral_8x7b_instruct_v01_lora.json", 4)
+	runMultiGpuPytorchjob(t, "resources/config_mixtral_8x7b_instruct_v01_lora.json", 4)
 }
 
 func TestMultiGpuPytorchjobMerlinite7b(t *testing.T) {
-	runMultiGpuPytorchjob(t, "config_merlinite_7b.json", 2)
+	runMultiGpuPytorchjob(t, "resources/config_merlinite_7b.json", 2)
 }
 
 func runMultiGpuPytorchjob(t *testing.T, modelConfigFile string, numberOfGpus int, options ...Option[*kftov1.PyTorchJob]) {
 	test := With(t)
 
-	namespace := GetOrCreateTestNamespace(test)
+	namespace := test.CreateOrGetTestNamespace().Name
 
 	// Create a ConfigMap with configuration
 	configData := map[string][]byte{
@@ -173,7 +175,7 @@ func createAlpacaPyTorchJob(test Test, namespace string, config corev1.ConfigMap
 							InitContainers: []corev1.Container{
 								{
 									Name:            "copy-dataset",
-									Image:           GetAlpacaDatasetImage(),
+									Image:           kfto.GetAlpacaDatasetImage(),
 									ImagePullPolicy: corev1.PullIfNotPresent,
 									VolumeMounts: []corev1.VolumeMount{
 										{

diff --git a/tests/kfto/core/kfto_kueue_sft_test.go → tests/fms/kfto_kueue_sft_test.go b/tests/kfto/core/kfto_kueue_sft_test.go → tests/fms/kfto_kueue_sft_test.go
@@ -14,13 +14,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package core
+package fms
 
 import (
 	"fmt"
 	"testing"
 
 	. "github.com/onsi/gomega"
+	"github.com/opendatahub-io/distributed-workloads/tests/kfto"
 	. "github.com/project-codeflare/codeflare-common/support"
 	kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
 
@@ -32,26 +33,26 @@ import (
 )
 
 func TestPytorchjobWithSFTtrainerFinetuning(t *testing.T) {
-	runPytorchjobWithSFTtrainer(t, "config.json", 0)
+	runPytorchjobWithSFTtrainer(t, "resources/config.json", 0)
 }
 
 func TestPytorchjobWithSFTtrainerLoRa(t *testing.T) {
-	runPytorchjobWithSFTtrainer(t, "config_lora.json", 0)
+	runPytorchjobWithSFTtrainer(t, "resources/config_lora.json", 0)
 }
 func TestPytorchjobWithSFTtrainerQLoRa(t *testing.T) {
-	runPytorchjobWithSFTtrainer(t, "config_qlora.json", 1)
+	runPytorchjobWithSFTtrainer(t, "resources/config_qlora.json", 1)
 }
 
 func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string, numGpus int) {
 	test := With(t)
 
 	// Create a namespace
-	namespace := GetOrCreateTestNamespace(test)
+	namespace := test.CreateOrGetTestNamespace().Name
 
 	// Create a ConfigMap with training dataset and configuration
 	configData := map[string][]byte{
 		"config.json":                   ReadFile(test, modelConfigFile),
-		"twitter_complaints_small.json": ReadFile(test, "twitter_complaints_small.json"),
+		"twitter_complaints_small.json": ReadFile(test, "resources/twitter_complaints_small.json"),
 	}
 	config := CreateConfigMap(test, namespace, configData)
 
@@ -125,12 +126,12 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
 	test := With(t)
 
 	// Create a namespace
-	namespace := GetOrCreateTestNamespace(test)
+	namespace := test.CreateOrGetTestNamespace().Name
 
 	// Create a ConfigMap with training dataset and configuration
 	configData := map[string][]byte{
-		"config.json":                   ReadFile(test, "config.json"),
-		"twitter_complaints_small.json": ReadFile(test, "twitter_complaints_small.json"),
+		"config.json":                   ReadFile(test, "resources/config.json"),
+		"twitter_complaints_small.json": ReadFile(test, "resources/twitter_complaints_small.json"),
 	}
 	config := CreateConfigMap(test, namespace, configData)
 
@@ -231,7 +232,7 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
 							InitContainers: []corev1.Container{
 								{
 									Name:            "copy-model",
-									Image:           GetBloomModelImage(),
+									Image:           kfto.GetBloomModelImage(),
 									ImagePullPolicy: corev1.PullIfNotPresent,
 									VolumeMounts: []corev1.VolumeMount{
 										{

diff --git a/...e/kfto_kueue_sft_upgrade_training_test.go → ...s/kfto_kueue_sft_upgrade_training_test.go b/...e/kfto_kueue_sft_upgrade_training_test.go → ...s/kfto_kueue_sft_upgrade_training_test.go
@@ -14,13 +14,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package kfto
+package fms
 
 import (
 	"testing"
 
 	. "github.com/onsi/gomega"
-	kftocore "github.com/opendatahub-io/distributed-workloads/tests/kfto/core"
+	"github.com/opendatahub-io/distributed-workloads/tests/kfto"
 	. "github.com/project-codeflare/codeflare-common/support"
 	kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
 	kueueacv1beta1 "sigs.k8s.io/kueue/client-go/applyconfiguration/kueue/v1beta1"
@@ -48,8 +48,8 @@ func TestSetupPytorchjob(t *testing.T) {
 
 	// Create a ConfigMap with training dataset and configuration
 	configData := map[string][]byte{
-		"config.json":                   kftocore.ReadFile(test, "config.json"),
-		"twitter_complaints_small.json": kftocore.ReadFile(test, "twitter_complaints_small.json"),
+		"config.json":                   ReadFile(test, "resources/config.json"),
+		"twitter_complaints_small.json": ReadFile(test, "resources/twitter_complaints_small.json"),
 	}
 	config := CreateConfigMap(test, namespaceName, configData)
 
@@ -90,11 +90,11 @@ func TestSetupPytorchjob(t *testing.T) {
 	test.T().Logf("Applied Kueue LocalQueue %s/%s successfully", appliedLocalQueue.Namespace, appliedLocalQueue.Name)
 
 	// Create training PyTorch job
-	tuningJob := createPyTorchJob(test, namespaceName, appliedLocalQueue.Name, *config)
+	tuningJob := createUpgradePyTorchJob(test, namespaceName, appliedLocalQueue.Name, *config)
 
 	// Make sure the PyTorch job is suspended, waiting for ClusterQueue to be enabled
-	test.Eventually(kftocore.PyTorchJob(test, tuningJob.Namespace, pyTorchJobName), TestTimeoutShort).
-		Should(WithTransform(kftocore.PyTorchJobConditionSuspended, Equal(corev1.ConditionTrue)))
+	test.Eventually(PyTorchJob(test, tuningJob.Namespace, pyTorchJobName), TestTimeoutShort).
+		Should(WithTransform(PyTorchJobConditionSuspended, Equal(corev1.ConditionTrue)))
 }
 
 func TestRunPytorchjob(t *testing.T) {
@@ -112,22 +112,22 @@ func TestRunPytorchjob(t *testing.T) {
 	test.Expect(err).NotTo(HaveOccurred())
 
 	// PyTorch job should be started now
-	test.Eventually(kftocore.PyTorchJob(test, namespaceName, pyTorchJobName), TestTimeoutLong).
-		Should(WithTransform(kftocore.PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
+	test.Eventually(PyTorchJob(test, namespaceName, pyTorchJobName), TestTimeoutLong).
+		Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
 
 	// Make sure the PyTorch job succeeds
-	test.Eventually(kftocore.PyTorchJob(test, namespaceName, pyTorchJobName), TestTimeoutLong).
-		Should(WithTransform(kftocore.PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
+	test.Eventually(PyTorchJob(test, namespaceName, pyTorchJobName), TestTimeoutLong).
+		Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
 }
 
-func createPyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap) *kftov1.PyTorchJob {
+func createUpgradePyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap) *kftov1.PyTorchJob {
 	// Does PyTorchJob already exist?
 	_, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Get(test.Ctx(), pyTorchJobName, metav1.GetOptions{})
 	if err == nil {
 		// If yes then delete it and wait until there are no PyTorchJobs in the namespace
 		err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Delete(test.Ctx(), pyTorchJobName, metav1.DeleteOptions{})
 		test.Expect(err).NotTo(HaveOccurred())
-		test.Eventually(kftocore.PyTorchJobs(test, namespace), TestTimeoutShort).Should(BeEmpty())
+		test.Eventually(PyTorchJobs(test, namespace), TestTimeoutShort).Should(BeEmpty())
 	} else if !errors.IsNotFound(err) {
 		test.T().Fatalf("Error retrieving PyTorchJob with name `%s`: %v", pyTorchJobName, err)
 	}
@@ -149,7 +149,7 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
 							InitContainers: []corev1.Container{
 								{
 									Name:            "copy-model",
-									Image:           kftocore.GetBloomModelImage(),
+									Image:           kfto.GetBloomModelImage(),
 									ImagePullPolicy: corev1.PullIfNotPresent,
 									VolumeMounts: []corev1.VolumeMount{
 										{
@@ -164,7 +164,7 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
 							Containers: []corev1.Container{
 								{
 									Name:            "pytorch",
-									Image:           kftocore.GetFmsHfTuningImage(test),
+									Image:           GetFmsHfTuningImage(test),
 									ImagePullPolicy: corev1.PullIfNotPresent,
 									Env: []corev1.EnvVar{
 										{