Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
dchourasia committed Dec 12, 2024
2 parents f140a6e + 1aece45 commit 8e0e6cb
Show file tree
Hide file tree
Showing 35 changed files with 144 additions and 135 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@

NOTE: `quay.io/modh/ray:2.35.0-py311-cu121` is the default image used for creating a RayCluster resource. If you have your own custom Ray image that suits your purposes, specify it in the `CODEFLARE_TEST_RAY_IMAGE` environment variable.

### Environment variables for Training operator test suite
### Environment variables for fms-hf-tuning test suite

* `FMS_HF_TUNING_IMAGE` - Image tag used in PyTorchJob CR for model training

### Environment variables for Training operator GPU test suite
### Environment variables for fms-hf-tuning GPU test suite

* `TEST_NAMESPACE_NAME` (Optional) - Existing namespace where the Training operator GPU tests will be executed
* `HF_TOKEN` - HuggingFace token used to pull models that have limited access
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ toolchain go1.21.5
require (
github.com/kubeflow/training-operator v1.7.0
github.com/onsi/gomega v1.31.1
github.com/project-codeflare/codeflare-common v0.0.0-20241203135025-af256802fc2d
github.com/project-codeflare/codeflare-common v0.0.0-20241211130338-efe4f3e6f904
github.com/prometheus/client_golang v1.20.4
github.com/prometheus/common v0.57.0
github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -365,8 +365,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/project-codeflare/appwrapper v0.8.0 h1:vWHNtXUtHutN2EzYb6rryLdESnb8iDXsCokXOuNYXvg=
github.com/project-codeflare/appwrapper v0.8.0/go.mod h1:FMQ2lI3fz6LakUVXgN1FTdpsc3BBkNIZZgtMmM9J5UM=
github.com/project-codeflare/codeflare-common v0.0.0-20241203135025-af256802fc2d h1:WN/cN/giLiicdGjnztRYgfR7K7biaGmPO98WdWMppos=
github.com/project-codeflare/codeflare-common v0.0.0-20241203135025-af256802fc2d/go.mod h1:v7XKwaDoCspsHQlWJNarO7gOpR+iumSS+c1bWs3kJOI=
github.com/project-codeflare/codeflare-common v0.0.0-20241211130338-efe4f3e6f904 h1:brU4j1V4o+z/sw0TGi360Wdjk1TEQ313ynBRGqSTaNU=
github.com/project-codeflare/codeflare-common v0.0.0-20241211130338-efe4f3e6f904/go.mod h1:v7XKwaDoCspsHQlWJNarO7gOpR+iumSS+c1bWs3kJOI=
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
Expand Down
14 changes: 1 addition & 13 deletions tests/kfto/core/environment.go → tests/fms/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package core
package fms

import (
"fmt"
Expand All @@ -26,10 +26,6 @@ import (
const (
// The environment variable for FMS HF Tuning image to be tested
fmsHfTuningImageEnvVar = "FMS_HF_TUNING_IMAGE"
// The environment variable referring to image containing bloom-560m model
bloomModelImageEnvVar = "BLOOM_MODEL_IMAGE"
// The environment variable referring to image containing Stanford Alpaca dataset
alpacaDatasetImageEnvVar = "ALPACA_DATASET_IMAGE"
// The environment variable referring to image containing minio CLI
minioCliImageEnvVar = "MINIO_CLI_IMAGE"
// The environment variable for HuggingFace token to download models which require authentication
Expand All @@ -51,14 +47,6 @@ func GetFmsHfTuningImage(t Test) string {
return image
}

// GetBloomModelImage resolves the reference of the image containing the
// bloom-560m model. It hands the BLOOM_MODEL_IMAGE environment variable name
// and a digest-pinned default reference to lookupEnvOrDefault and returns
// whatever that helper yields.
func GetBloomModelImage() string {
	const defaultImage = "quay.io/ksuta/bloom-560m@sha256:f6db02bb7b5d09a8d698c04994d747bfb9e581bbb4c07d00290244d207623733"
	return lookupEnvOrDefault(bloomModelImageEnvVar, defaultImage)
}

// GetAlpacaDatasetImage resolves the reference of the image containing the
// Stanford Alpaca dataset. It hands the ALPACA_DATASET_IMAGE environment
// variable name and a digest-pinned default reference to lookupEnvOrDefault
// and returns whatever that helper yields.
func GetAlpacaDatasetImage() string {
	const defaultImage = "quay.io/ksuta/alpaca-dataset@sha256:2e90f631180c7b2c916f9569b914b336b612e8ae86efad82546adc5c9fcbbb8d"
	return lookupEnvOrDefault(alpacaDatasetImageEnvVar, defaultImage)
}

// GetMinioCliImage resolves the reference of the image containing the minio
// CLI. It hands the MINIO_CLI_IMAGE environment variable name and a
// digest-pinned default reference to lookupEnvOrDefault and returns whatever
// that helper yields.
func GetMinioCliImage() string {
	const defaultImage = "quay.io/ksuta/mc@sha256:e128ce4caee276bcbfe3bd32ebb01c814f6b2eb2fd52d08ef0d4684f68c1e3d6"
	return lookupEnvOrDefault(minioCliImageEnvVar, defaultImage)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,15 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package core
package fms

import (
"fmt"
"testing"
"time"

. "github.com/onsi/gomega"
"github.com/opendatahub-io/distributed-workloads/tests/kfto"
. "github.com/project-codeflare/codeflare-common/support"

corev1 "k8s.io/api/core/v1"
Expand All @@ -34,75 +35,76 @@ import (
)

func TestMultiGpuPytorchjobAllamBeta13bChatGptq(t *testing.T) {
runMultiGpuPytorchjob(t, "config_allam_beta_13b_chat_gptq.json", 2, mountModelVolumeIntoMaster)
runMultiGpuPytorchjob(t, "resources/config_allam_beta_13b_chat_gptq.json", 2, mountModelVolumeIntoMaster)
}

func TestMultiGpuPytorchjobGranite8bCodeInstructGptq(t *testing.T) {
runMultiGpuPytorchjob(t, "config_granite_8b_code_instruct_gptq.json", 2, mountModelVolumeIntoMaster)
runMultiGpuPytorchjob(t, "resources/config_granite_8b_code_instruct_gptq.json", 2, mountModelVolumeIntoMaster)
}

func TestMultiGpuPytorchjobGranite20bCodeInstruct(t *testing.T) {
runMultiGpuPytorchjob(t, "config_granite_20b_code_instruct.json", 4)
runMultiGpuPytorchjob(t, "resources/config_granite_20b_code_instruct.json", 4)
}

func TestMultiGpuPytorchjobGranite34bCodeBaseGptq(t *testing.T) {
runMultiGpuPytorchjob(t, "config_granite_34b_code_base_gptq.json", 2, mountModelVolumeIntoMaster)
runMultiGpuPytorchjob(t, "resources/config_granite_34b_code_base_gptq.json", 2, mountModelVolumeIntoMaster)
}

func TestMultiGpuPytorchjobGranite34bCodeInstructLoRa(t *testing.T) {
runMultiGpuPytorchjob(t, "config_granite_34b_code_instruct_lora.json", 4)
runMultiGpuPytorchjob(t, "resources/config_granite_34b_code_instruct_lora.json", 4)
}

func TestMultiGpuPytorchjobMetaLlama318b(t *testing.T) {
runMultiGpuPytorchjob(t, "config_meta_llama3_1_8b.json", 2)
runMultiGpuPytorchjob(t, "resources/config_meta_llama3_1_8b.json", 2)
}

func TestMultiGpuPytorchjobMetaLlama38bInstruct(t *testing.T) {
runMultiGpuPytorchjob(t, "config_meta_llama3_8b_instruct.json", 2)
runMultiGpuPytorchjob(t, "resources/config_meta_llama3_8b_instruct.json", 2)
}

func TestMultiGpuPytorchjobMetaLlama370bInstructGptqBlue(t *testing.T) {
runMultiGpuPytorchjob(t, "config_meta_llama3_70b_instruct_gptq_blue.json", 2, mountModelVolumeIntoMaster)
runMultiGpuPytorchjob(t, "resources/config_meta_llama3_70b_instruct_gptq_blue.json", 2, mountModelVolumeIntoMaster)
}

func TestMultiGpuPytorchjobMetaLlama31405bGptq(t *testing.T) {
runMultiGpuPytorchjob(t, "config_meta_llama3_1_405b_gptq.json", 8, mountModelVolumeIntoMaster)
runMultiGpuPytorchjob(t, "resources/config_meta_llama3_1_405b_gptq.json", 8, mountModelVolumeIntoMaster)
}

func TestMultiGpuPytorchjobMetaLlama3170bLoRa(t *testing.T) {
runMultiGpuPytorchjob(t, "config_meta_llama3_1_70b_lora.json", 4)
runMultiGpuPytorchjob(t, "resources/config_meta_llama3_1_70b_lora.json", 4)
}

func TestMultiGpuPytorchjobMetaLlama370bInstructLoRa(t *testing.T) {
runMultiGpuPytorchjob(t, "config_meta_llama3_70b_instruct_lora.json", 4)
runMultiGpuPytorchjob(t, "resources/config_meta_llama3_70b_instruct_lora.json", 4)
}

func TestMultiGpuPytorchjobMistral7bv03Gptq(t *testing.T) {
runMultiGpuPytorchjob(t, "config_mistral_7b_v03_gptq.json", 2, mountModelVolumeIntoMaster)
runMultiGpuPytorchjob(t, "resources/config_mistral_7b_v03_gptq.json", 2, mountModelVolumeIntoMaster)
}
func TestMultiGpuPytorchjobMistral7bv03(t *testing.T) {
runMultiGpuPytorchjob(t, "config_mistral_7b_v03.json", 2)
runMultiGpuPytorchjob(t, "resources/config_mistral_7b_v03.json", 2)
}

func TestMultiGpuPytorchjobMixtral8x7bv01(t *testing.T) {
runMultiGpuPytorchjob(t, "config_mixtral_8x7b_v01.json", 8)
runMultiGpuPytorchjob(t, "resources/config_mixtral_8x7b_v01.json", 8)
}

func TestMultiGpuPytorchjobMixtral8x7bInstructv01Gptq(t *testing.T) {
runMultiGpuPytorchjob(t, "config_mixtral_8x7b_instruct_v01_gptq.json", 2, mountModelVolumeIntoMaster)
runMultiGpuPytorchjob(t, "resources/config_mixtral_8x7b_instruct_v01_gptq.json", 2, mountModelVolumeIntoMaster)
}

func TestMultiGpuPytorchjobMixtral8x7bInstructv01LoRa(t *testing.T) {
runMultiGpuPytorchjob(t, "config_mixtral_8x7b_instruct_v01_lora.json", 4)
runMultiGpuPytorchjob(t, "resources/config_mixtral_8x7b_instruct_v01_lora.json", 4)
}

func TestMultiGpuPytorchjobMerlinite7b(t *testing.T) {
runMultiGpuPytorchjob(t, "config_merlinite_7b.json", 2)
runMultiGpuPytorchjob(t, "resources/config_merlinite_7b.json", 2)
}

func runMultiGpuPytorchjob(t *testing.T, modelConfigFile string, numberOfGpus int, options ...Option[*kftov1.PyTorchJob]) {
test := With(t)

namespace := GetOrCreateTestNamespace(test)
namespace := test.CreateOrGetTestNamespace().Name

// Create a ConfigMap with configuration
configData := map[string][]byte{
Expand Down Expand Up @@ -173,7 +175,7 @@ func createAlpacaPyTorchJob(test Test, namespace string, config corev1.ConfigMap
InitContainers: []corev1.Container{
{
Name: "copy-dataset",
Image: GetAlpacaDatasetImage(),
Image: kfto.GetAlpacaDatasetImage(),
ImagePullPolicy: corev1.PullIfNotPresent,
VolumeMounts: []corev1.VolumeMount{
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,14 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package core
package fms

import (
"fmt"
"testing"

. "github.com/onsi/gomega"
"github.com/opendatahub-io/distributed-workloads/tests/kfto"
. "github.com/project-codeflare/codeflare-common/support"
kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"

Expand All @@ -32,26 +33,26 @@ import (
)

func TestPytorchjobWithSFTtrainerFinetuning(t *testing.T) {
runPytorchjobWithSFTtrainer(t, "config.json", 0)
runPytorchjobWithSFTtrainer(t, "resources/config.json", 0)
}

func TestPytorchjobWithSFTtrainerLoRa(t *testing.T) {
runPytorchjobWithSFTtrainer(t, "config_lora.json", 0)
runPytorchjobWithSFTtrainer(t, "resources/config_lora.json", 0)
}
func TestPytorchjobWithSFTtrainerQLoRa(t *testing.T) {
runPytorchjobWithSFTtrainer(t, "config_qlora.json", 1)
runPytorchjobWithSFTtrainer(t, "resources/config_qlora.json", 1)
}

func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string, numGpus int) {
test := With(t)

// Create a namespace
namespace := GetOrCreateTestNamespace(test)
namespace := test.CreateOrGetTestNamespace().Name

// Create a ConfigMap with training dataset and configuration
configData := map[string][]byte{
"config.json": ReadFile(test, modelConfigFile),
"twitter_complaints_small.json": ReadFile(test, "twitter_complaints_small.json"),
"twitter_complaints_small.json": ReadFile(test, "resources/twitter_complaints_small.json"),
}
config := CreateConfigMap(test, namespace, configData)

Expand Down Expand Up @@ -125,12 +126,12 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
test := With(t)

// Create a namespace
namespace := GetOrCreateTestNamespace(test)
namespace := test.CreateOrGetTestNamespace().Name

// Create a ConfigMap with training dataset and configuration
configData := map[string][]byte{
"config.json": ReadFile(test, "config.json"),
"twitter_complaints_small.json": ReadFile(test, "twitter_complaints_small.json"),
"config.json": ReadFile(test, "resources/config.json"),
"twitter_complaints_small.json": ReadFile(test, "resources/twitter_complaints_small.json"),
}
config := CreateConfigMap(test, namespace, configData)

Expand Down Expand Up @@ -231,7 +232,7 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
InitContainers: []corev1.Container{
{
Name: "copy-model",
Image: GetBloomModelImage(),
Image: kfto.GetBloomModelImage(),
ImagePullPolicy: corev1.PullIfNotPresent,
VolumeMounts: []corev1.VolumeMount{
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package kfto
package fms

import (
"testing"

. "github.com/onsi/gomega"
kftocore "github.com/opendatahub-io/distributed-workloads/tests/kfto/core"
"github.com/opendatahub-io/distributed-workloads/tests/kfto"
. "github.com/project-codeflare/codeflare-common/support"
kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
kueueacv1beta1 "sigs.k8s.io/kueue/client-go/applyconfiguration/kueue/v1beta1"
Expand Down Expand Up @@ -48,8 +48,8 @@ func TestSetupPytorchjob(t *testing.T) {

// Create a ConfigMap with training dataset and configuration
configData := map[string][]byte{
"config.json": kftocore.ReadFile(test, "config.json"),
"twitter_complaints_small.json": kftocore.ReadFile(test, "twitter_complaints_small.json"),
"config.json": ReadFile(test, "resources/config.json"),
"twitter_complaints_small.json": ReadFile(test, "resources/twitter_complaints_small.json"),
}
config := CreateConfigMap(test, namespaceName, configData)

Expand Down Expand Up @@ -90,11 +90,11 @@ func TestSetupPytorchjob(t *testing.T) {
test.T().Logf("Applied Kueue LocalQueue %s/%s successfully", appliedLocalQueue.Namespace, appliedLocalQueue.Name)

// Create training PyTorch job
tuningJob := createPyTorchJob(test, namespaceName, appliedLocalQueue.Name, *config)
tuningJob := createUpgradePyTorchJob(test, namespaceName, appliedLocalQueue.Name, *config)

// Make sure the PyTorch job is suspended, waiting for ClusterQueue to be enabled
test.Eventually(kftocore.PyTorchJob(test, tuningJob.Namespace, pyTorchJobName), TestTimeoutShort).
Should(WithTransform(kftocore.PyTorchJobConditionSuspended, Equal(corev1.ConditionTrue)))
test.Eventually(PyTorchJob(test, tuningJob.Namespace, pyTorchJobName), TestTimeoutShort).
Should(WithTransform(PyTorchJobConditionSuspended, Equal(corev1.ConditionTrue)))
}

func TestRunPytorchjob(t *testing.T) {
Expand All @@ -112,22 +112,22 @@ func TestRunPytorchjob(t *testing.T) {
test.Expect(err).NotTo(HaveOccurred())

// PyTorch job should be started now
test.Eventually(kftocore.PyTorchJob(test, namespaceName, pyTorchJobName), TestTimeoutLong).
Should(WithTransform(kftocore.PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
test.Eventually(PyTorchJob(test, namespaceName, pyTorchJobName), TestTimeoutLong).
Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))

// Make sure the PyTorch job succeeds
test.Eventually(kftocore.PyTorchJob(test, namespaceName, pyTorchJobName), TestTimeoutLong).
Should(WithTransform(kftocore.PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
test.Eventually(PyTorchJob(test, namespaceName, pyTorchJobName), TestTimeoutLong).
Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
}

func createPyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap) *kftov1.PyTorchJob {
func createUpgradePyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap) *kftov1.PyTorchJob {
// Does PyTorchJob already exist?
_, err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Get(test.Ctx(), pyTorchJobName, metav1.GetOptions{})
if err == nil {
// If yes then delete it and wait until there are no PyTorchJobs in the namespace
err := test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Delete(test.Ctx(), pyTorchJobName, metav1.DeleteOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.Eventually(kftocore.PyTorchJobs(test, namespace), TestTimeoutShort).Should(BeEmpty())
test.Eventually(PyTorchJobs(test, namespace), TestTimeoutShort).Should(BeEmpty())
} else if !errors.IsNotFound(err) {
test.T().Fatalf("Error retrieving PyTorchJob with name `%s`: %v", pyTorchJobName, err)
}
Expand All @@ -149,7 +149,7 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
InitContainers: []corev1.Container{
{
Name: "copy-model",
Image: kftocore.GetBloomModelImage(),
Image: kfto.GetBloomModelImage(),
ImagePullPolicy: corev1.PullIfNotPresent,
VolumeMounts: []corev1.VolumeMount{
{
Expand All @@ -164,7 +164,7 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1
Containers: []corev1.Container{
{
Name: "pytorch",
Image: kftocore.GetFmsHfTuningImage(test),
Image: GetFmsHfTuningImage(test),
ImagePullPolicy: corev1.PullIfNotPresent,
Env: []corev1.EnvVar{
{
Expand Down
Loading

0 comments on commit 8e0e6cb

Please sign in to comment.