diff --git a/docker-compose.yaml b/docker-compose.yaml
index 2097c77b8..7fa523750 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -109,6 +109,10 @@ services:
       - PYTHONPATH=/mzai/lumigator/python/mzai/backend
       - EVALUATOR_PIP_REQS=/mzai/lumigator/python/mzai/jobs/evaluator/requirements.txt
       - EVALUATOR_WORK_DIR=/mzai/lumigator/python/mzai/jobs/evaluator
+      # TODO: the following two entries should be renamed to EVALUATOR_*
+      # and the two above should be removed when we deprecate evaluator
+      - EVALUATOR_LITE_PIP_REQS=/mzai/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
+      - EVALUATOR_LITE_WORK_DIR=/mzai/lumigator/python/mzai/jobs/evaluator_lite
       - INFERENCE_PIP_REQS=/mzai/lumigator/python/mzai/jobs/inference/requirements.txt
       - INFERENCE_WORK_DIR=/mzai/lumigator/python/mzai/jobs/inference
       - RAY_DASHBOARD_PORT
diff --git a/lumigator/python/mzai/backend/backend/api/routes/jobs.py b/lumigator/python/mzai/backend/backend/api/routes/jobs.py
index 05eeb5817..1dd5bf751 100644
--- a/lumigator/python/mzai/backend/backend/api/routes/jobs.py
+++ b/lumigator/python/mzai/backend/backend/api/routes/jobs.py
@@ -11,6 +11,7 @@
     Job,
     JobAnnotateCreate,
     JobEvalCreate,
+    JobEvalLiteCreate,
     JobInferenceCreate,
     JobLogsResponse,
     JobResponse,
@@ -82,6 +83,23 @@ def create_evaluation_job(
     return job_response
 
 
+# TODO: remove the code above and refactor the method below to respond
+# at "/evaluate/" when we deprecate evaluator
+@router.post("/eval_lite/", status_code=status.HTTP_201_CREATED)
+def create_evaluation_lite_job(
+    service: JobServiceDep,
+    job_create_request: JobEvalLiteCreate,
+    request: Request,
+    response: Response,
+) -> JobResponse:
+    job_response = service.create_job(job_create_request)
+
+    url = request.url_for(get_job.__name__, job_id=job_response.id)
+    response.headers[HttpHeaders.LOCATION] = f"{url}"
+
+    return job_response
+
+
 @router.get("/")
 def list_jobs(
     service: JobServiceDep,
diff --git a/lumigator/python/mzai/backend/backend/config_templates.py b/lumigator/python/mzai/backend/backend/config_templates.py
index 473e0a442..1133aed0f 100644
--- a/lumigator/python/mzai/backend/backend/config_templates.py
+++ b/lumigator/python/mzai/backend/backend/config_templates.py
@@ -68,6 +68,24 @@
     }}
 }}"""
 
+# TODO: this default evaluation template should serve for most purposes
+# after we deprecate evaluator, as it won't include predictions (the model
+# name is just passed for reference, not for actual inference).
+# We can remove all the above templates then.
+default_eval_template = """{{
+    "name": "{job_name}/{job_id}",
+    "model": {{ "path": "{model_uri}" }},
+    "dataset": {{ "path": "{dataset_path}" }},
+    "evaluation": {{
+        "metrics": ["rouge", "meteor", "bertscore"],
+        "max_samples": {max_samples},
+        "return_input_data": true,
+        "return_predictions": true,
+        "storage_path": "{storage_path}"
+    }}
+}}"""
+
+
 # Inference templates
 default_infer_template = """{{
@@ -153,4 +171,10 @@
         "mistral://open-mistral-7b": oai_eval_template,
         "llamafile://mistralai/Mistral-7B-Instruct-v0.2": oai_eval_template,
     },
+    # TODO: Remove the old EVALUATION section and rename EVALUATION_LITE
+    # to EVALUATION after we deprecate evaluator. Also remove the
+    # unused templates above (all the eval templates except default).
+    JobType.EVALUATION_LITE: {
+        "default": default_eval_template,
+    },
 }
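Note: one way to sanity-check the new default_eval_template is to render it with placeholder values and validate the result against the evaluator_lite config schema. A minimal sketch, assuming both modules are importable from a single script (all values below are made up for illustration):

```python
from backend.config_templates import default_eval_template  # assumed import path
from eval_config import EvalJobConfig  # from jobs/evaluator_lite

# hypothetical values, for illustration only
filled = default_eval_template.format(
    job_name="summarization-eval",
    job_id="1234",
    model_uri="hf://facebook/bart-large-cnn",
    dataset_path="s3://lumigator-storage/datasets/some-dataset/",
    max_samples=10,
    storage_path="s3://lumigator-storage/jobs/results/",
)

# raises a ValidationError if the template ever drifts from the schema
config = EvalJobConfig.model_validate_json(filled)
print(config.evaluation.metrics)  # ['rouge', 'meteor', 'bertscore']
```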
diff --git a/lumigator/python/mzai/backend/backend/services/experiments.py b/lumigator/python/mzai/backend/backend/services/experiments.py
index 460fc5901..304b82ccf 100644
--- a/lumigator/python/mzai/backend/backend/services/experiments.py
+++ b/lumigator/python/mzai/backend/backend/services/experiments.py
@@ -12,7 +12,7 @@
 from lumigator_schemas.experiments import ExperimentCreate, ExperimentResponse
 from lumigator_schemas.extras import ListingResponse
 from lumigator_schemas.jobs import (
-    JobEvalCreate,
+    JobEvalLiteCreate,
     JobInferenceCreate,
     JobStatus,
 )
@@ -138,7 +138,7 @@ def _run_eval(
 
         # submit the job
         self._job_service.create_job(
-            JobEvalCreate.model_validate(job_eval_dict), experiment_id=experiment_id
+            JobEvalLiteCreate.model_validate(job_eval_dict), experiment_id=experiment_id
         )
 
     def create_experiment(
diff --git a/lumigator/python/mzai/backend/backend/services/jobs.py b/lumigator/python/mzai/backend/backend/services/jobs.py
index 9081b03da..f592d9653 100644
--- a/lumigator/python/mzai/backend/backend/services/jobs.py
+++ b/lumigator/python/mzai/backend/backend/services/jobs.py
@@ -7,6 +7,7 @@
 from lumigator_schemas.jobs import (
     JobConfig,
     JobEvalCreate,
+    JobEvalLiteCreate,
     JobInferenceCreate,
     JobResponse,
     JobResultDownloadResponse,
@@ -44,6 +45,13 @@ class JobService:
             "ray_worker_gpus_fraction": settings.RAY_WORKER_GPUS_FRACTION,
             "ray_worker_gpus": settings.RAY_WORKER_GPUS,
         },
+        JobType.EVALUATION_LITE: {
+            "command": settings.EVALUATOR_LITE_COMMAND,
+            "pip": settings.EVALUATOR_LITE_PIP_REQS,
+            "work_dir": settings.EVALUATOR_LITE_WORK_DIR,
+            "ray_worker_gpus_fraction": settings.RAY_WORKER_GPUS_FRACTION,
+            "ray_worker_gpus": settings.RAY_WORKER_GPUS,
+        },
     }
 
     def __init__(
@@ -122,8 +130,9 @@ def _get_job_params(self, job_type: str, record, request: BaseModel) -> dict:
         model_url = self._set_model_type(request)
 
         # provide a reasonable system prompt for services where none was specified
-        if request.system_prompt is None and not request.model.startswith("hf://"):
-            request.system_prompt = settings.DEFAULT_SUMMARIZER_PROMPT
+        if job_type == JobType.EVALUATION or job_type == JobType.INFERENCE:
+            if request.system_prompt is None and not request.model.startswith("hf://"):
+                request.system_prompt = settings.DEFAULT_SUMMARIZER_PROMPT
 
         # this section differs between inference and eval
         if job_type == JobType.EVALUATION:
@@ -138,6 +147,15 @@ def _get_job_params(self, job_type: str, record, request: BaseModel) -> dict:
                 "system_prompt": request.system_prompt,
                 "skip_inference": request.skip_inference,
             }
+        elif job_type == JobType.EVALUATION_LITE:
+            job_params = {
+                "job_id": record.id,
+                "job_name": request.name,
+                "model_uri": request.model,
+                "dataset_path": dataset_s3_path,
+                "max_samples": request.max_samples,
+                "storage_path": self.storage_path,
+            }
         else:
             job_params = {
                 "job_id": record.id,
@@ -164,11 +182,15 @@ def _get_job_params(self, job_type: str, record, request: BaseModel) -> dict:
         return job_params
 
     def create_job(
-        self, request: JobEvalCreate | JobInferenceCreate, experiment_id: UUID = None
+        self,
+        request: JobEvalCreate | JobEvalLiteCreate | JobInferenceCreate,
+        experiment_id: UUID = None,
     ) -> JobResponse:
         """Creates a new evaluation workload to run on Ray and returns the response status."""
         if isinstance(request, JobEvalCreate):
             job_type = JobType.EVALUATION
+        elif isinstance(request, JobEvalLiteCreate):
+            job_type = JobType.EVALUATION_LITE
         elif isinstance(request, JobInferenceCreate):
             job_type = JobType.INFERENCE
         else:
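Note: a rough sketch of how a client would exercise the new route end to end. The base URL and the /api/v1/jobs prefix are assumptions about the local deployment, and the dataset UUID is a placeholder:

```python
import requests

payload = {
    "name": "bart-eval",
    "model": "hf://facebook/bart-large-cnn",
    "dataset": "00000000-0000-0000-0000-000000000000",  # UUID of an uploaded dataset
    "max_samples": 10,
}

# POST to the new eval_lite route; create_evaluation_lite_job returns 201 Created
resp = requests.post("http://localhost:8000/api/v1/jobs/eval_lite/", json=payload)
resp.raise_for_status()
print(resp.headers["Location"])  # points at the created job, per the route above
```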
diff --git a/lumigator/python/mzai/backend/backend/settings.py b/lumigator/python/mzai/backend/backend/settings.py
index 8720b6fef..eb04a5a65 100644
--- a/lumigator/python/mzai/backend/backend/settings.py
+++ b/lumigator/python/mzai/backend/backend/settings.py
@@ -43,8 +43,6 @@ class BackendSettings(BaseSettings):
     # Eval job details
     EVALUATOR_WORK_DIR: str | None = None
     EVALUATOR_PIP_REQS: str | None = None
-    # TODO: change once we remove old eval
-    NEW_EVALUATOR_COMMAND: str = "python evaluator/evaluator.py"
 
     @computed_field
     @property
@@ -65,6 +63,12 @@ def EVALUATOR_COMMAND_WITH_LD_PRELOAD(self) -> str:  # noqa: N802
         """
         return f"{self.LD_PRELOAD_PREFIX} {self.NEW_EVALUATOR_COMMAND}"
 
+    # TODO: the following should all be refactored to EVALUATOR_* and the above should
+    # be removed when we deprecate evaluator
+    EVALUATOR_LITE_WORK_DIR: str | None = None
+    EVALUATOR_LITE_PIP_REQS: str | None = None
+    EVALUATOR_LITE_COMMAND: str = "python eval_lite.py"
+
     # Inference job details
     INFERENCE_WORK_DIR: str | None = None
     INFERENCE_PIP_REQS: str | None = None
diff --git a/lumigator/python/mzai/jobs/evaluator_lite/eval_config.py b/lumigator/python/mzai/jobs/evaluator_lite/eval_config.py
index 76dd2815d..3d4b3c327 100644
--- a/lumigator/python/mzai/jobs/evaluator_lite/eval_config.py
+++ b/lumigator/python/mzai/jobs/evaluator_lite/eval_config.py
@@ -3,23 +3,25 @@
 
 class DatasetConfig(BaseModel):
     path: str
+    model_config = ConfigDict(extra="forbid")
 
 
 class ModelConfig(BaseModel):
     path: str
+    model_config = ConfigDict(extra="forbid")
 
 
 class EvaluationConfig(BaseModel):
     metrics: list[str] = Field(default=["rouge", "meteor", "bertscore"])
-    use_pipeline: bool = False
     max_samples: int = 0
     return_input_data: bool = False
     return_predictions: bool = False
-    storage_path: str = ""
+    storage_path: str
+    model_config = ConfigDict(extra="forbid")
 
 
 class EvalJobConfig(BaseModel):
-    name: str | None
+    name: str
     dataset: DatasetConfig
     model: ModelConfig
     evaluation: EvaluationConfig
diff --git a/lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py b/lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py
index f065c2392..86b76cc23 100644
--- a/lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py
+++ b/lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py
@@ -4,8 +4,8 @@
 import click
 import s3fs
 from datasets import load_from_disk
+from eval_config import EvalJobConfig
 from eval_metrics import EvaluationMetrics
-from evaluate_config import EvalJobConfig
 from loguru import logger
 
 
@@ -16,18 +16,21 @@ def save_to_disk(local_path: Path, data_dict: dict):
         json.dump(data_dict, f)
 
 
-def save_to_s3(config: dict, local_path: Path, storage_path: str):
+def save_to_s3(config: EvalJobConfig, local_path: Path, storage_path: str):
     s3 = s3fs.S3FileSystem()
     if storage_path.endswith("/"):
-        storage_path = "s3://" + str(Path(storage_path[5:]) / config.get("name") / "results.json")
-    logger.info(f"Storing evaluation results into {local_path}...")
+        storage_path = "s3://" + str(Path(storage_path[5:]) / config.name / "results.json")
+    logger.info(f"Storing evaluation results into {storage_path}...")
     s3.put_file(local_path, storage_path)
 
 
-def save_outputs(config: dict, eval_results: dict) -> Path:
-    storage_path = config.get("evaluation").get("storage_path")
+def save_outputs(config: EvalJobConfig, eval_results: dict) -> Path:
+    storage_path = config.evaluation.storage_path
 
-    local_path = Path("results.json")
+    # always generate a local temp file:
+    # - if storage_path is not provided, results are stored and kept in a default dir
+    # - if storage_path is provided AND saving to S3 succeeds, the local file is deleted
+    local_path = Path(Path.home() / ".lumigator" / "results" / config.name / "results.json")
 
     try:
         save_to_disk(local_path, eval_results)
@@ -58,11 +61,12 @@ def run_eval(config: EvalJobConfig) -> Path:
     # Load dataset given its URI
     dataset = load_from_disk(config.dataset.path)
     logger.info(f"Retrieving {config.dataset.path} for evaluation")
-    if max_samples:
+
+    # limit the dataset if max_samples is valid, otherwise use its full length
+    if max_samples < 1 or max_samples > len(dataset):
         logger.info(f"max_samples ({max_samples}) resized to dataset size ({len(dataset)})")
-        # select data between the minimum and total length of dataset
-        num_samples = range(min(max_samples, len(dataset)))
-        dataset = dataset.select(num_samples)
+        max_samples = len(dataset)
+    dataset = dataset.select(range(max_samples))
 
     # run evaluation and append to results dict
     predictions = dataset["predictions"]
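Note: the rewritten max_samples handling in run_eval changes semantics slightly: any value below 1 (including the schema default of -1) or beyond the dataset length now means "use everything". A toy extraction of that logic, under this reading:

```python
def resolve_max_samples(max_samples: int, dataset_len: int) -> int:
    # values < 1 (e.g. the -1 default) or beyond the dataset fall back to the full length
    if max_samples < 1 or max_samples > dataset_len:
        return dataset_len
    return max_samples

assert resolve_max_samples(-1, 100) == 100   # default: evaluate all samples
assert resolve_max_samples(10, 100) == 10    # explicit cap is honored
assert resolve_max_samples(500, 100) == 100  # oversized caps are clamped
```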
diff --git a/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt b/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
index 59b266244..c8f1efbe8 100644
--- a/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
+++ b/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
@@ -1,22 +1,17 @@
-accelerate==0.33.0
+absl-py==2.1.0
 bert_score==0.3.13
-bitsandbytes==0.42.0
 click>=8.1.7
 datasets==2.20.0
-einops==0.8.0
 evaluate==0.4.2
 loguru==0.7.2
-mistralai==0.4.2
 nltk==3.8.1
 numpy<2.0.0
-openai==1.38.0
-protobuf>=3.20.2
+platformdirs>=2.1
 pydantic-yaml>=1.2.0
 pydantic>=2.6.4
-ray[default]==2.30.0
+rouge-score==0.1.2
 ruff==0.5.5
 s3fs==2024.5.0
-scipy==1.13.1
-sentencepiece==0.2.0
-transformers==4.43.4
-urllib3>=1.26.18,<2
+six>=1.14
+transformers==4.48.0
+urllib3==2.3.0
diff --git a/lumigator/python/mzai/schemas/lumigator_schemas/jobs.py b/lumigator/python/mzai/schemas/lumigator_schemas/jobs.py
index bfe3428d9..06f9bc807 100644
--- a/lumigator/python/mzai/schemas/lumigator_schemas/jobs.py
+++ b/lumigator/python/mzai/schemas/lumigator_schemas/jobs.py
@@ -9,6 +9,7 @@ class JobType(str, Enum):
     INFERENCE = "inference"
     EVALUATION = "evaluate"
+    EVALUATION_LITE = "eval_lite"
 
 
 class JobStatus(str, Enum):
@@ -66,6 +67,17 @@ class JobEvalCreate(BaseModel):
     skip_inference: bool = False
 
 
+# TODO: this has to be renamed to JobEvalCreate and the code above
+# has to be removed when we deprecate evaluator
+class JobEvalLiteCreate(BaseModel):
+    name: str
+    description: str = ""
+    model: str
+    dataset: UUID
+    max_samples: int = -1  # set to all samples by default
+    config_template: str | None = None
+
+
 class JobInferenceCreate(BaseModel):
     name: str
     description: str = ""
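Note: a minimal sketch of building the new request schema, with placeholder values; max_samples can be omitted thanks to its -1 default:

```python
from uuid import UUID

from lumigator_schemas.jobs import JobEvalLiteCreate

req = JobEvalLiteCreate(
    name="bart-eval",
    model="hf://facebook/bart-large-cnn",
    dataset=UUID("00000000-0000-0000-0000-000000000000"),  # placeholder dataset id
)
# max_samples defaults to -1, which eval_lite.py interprets as "all samples"
print(req.model_dump_json(indent=2))
```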