568 wire eval lite #616

Merged
10 commits merged on Jan 15, 2025
4 changes: 4 additions & 0 deletions docker-compose.yaml
@@ -109,6 +109,10 @@ services:
- PYTHONPATH=/mzai/lumigator/python/mzai/backend
- EVALUATOR_PIP_REQS=/mzai/lumigator/python/mzai/jobs/evaluator/requirements.txt
- EVALUATOR_WORK_DIR=/mzai/lumigator/python/mzai/jobs/evaluator
# TODO: the following two rows should be renamed to EVALUATOR_*
# and the two above should be removed when we deprecate evaluator
- EVALUATOR_LITE_PIP_REQS=/mzai/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
- EVALUATOR_LITE_WORK_DIR=/mzai/lumigator/python/mzai/jobs/evaluator_lite
- INFERENCE_PIP_REQS=/mzai/lumigator/python/mzai/jobs/inference/requirements.txt
- INFERENCE_WORK_DIR=/mzai/lumigator/python/mzai/jobs/inference
- RAY_DASHBOARD_PORT
18 changes: 18 additions & 0 deletions lumigator/python/mzai/backend/backend/api/routes/jobs.py
@@ -11,6 +11,7 @@
Job,
JobAnnotateCreate,
JobEvalCreate,
JobEvalLiteCreate,
JobInferenceCreate,
JobLogsResponse,
JobResponse,
@@ -82,6 +83,23 @@ def create_evaluation_job(
return job_response


# TODO: remove the code above and refactor the method below to respond
# to "/evaluate/" when we deprecate evaluator
@router.post("/eval_lite/", status_code=status.HTTP_201_CREATED)
def create_evaluation_lite_job(
service: JobServiceDep,
job_create_request: JobEvalLiteCreate,
request: Request,
response: Response,
) -> JobResponse:
job_response = service.create_job(job_create_request)

url = request.url_for(get_job.__name__, job_id=job_response.id)
response.headers[HttpHeaders.LOCATION] = f"{url}"

return job_response


@router.get("/")
def list_jobs(
service: JobServiceDep,
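Note: a request against the new route might look like the sketch below. The /api/v1/jobs prefix, host, and port are assumptions about how this router is mounted; the payload fields follow the JobEvalLiteCreate schema added further down in this PR.

import httpx

# Hypothetical payload matching JobEvalLiteCreate (model and dataset values are made up)
payload = {
    "name": "summarization-eval",
    "model": "hf://facebook/bart-large-cnn",  # passed for reference only, no inference is run
    "dataset": "00000000-0000-0000-0000-000000000000",  # UUID of an already-uploaded dataset
    "max_samples": 10,  # -1 (the default) means "evaluate every sample"
}

# Assumed mount point for the jobs router on a local backend
resp = httpx.post("http://localhost:8000/api/v1/jobs/eval_lite/", json=payload)
resp.raise_for_status()  # expects HTTP 201
print(resp.json()["id"], resp.headers["Location"])  # job id plus the Location header set above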
24 changes: 24 additions & 0 deletions lumigator/python/mzai/backend/backend/config_templates.py
@@ -68,6 +68,24 @@
}}
}}"""

# TODO: this default evaluation template should serve for most purposes
# after we deprecate evaluator, as it won't include predictions (model
# name is just passed for reference and not for actual inference).
# We can remove all the above templates then.
default_eval_template = """{{
"name": "{job_name}/{job_id}",
"model": {{ "path": "{model_uri}" }},
"dataset": {{ "path": "{dataset_path}" }},
"evaluation": {{
"metrics": ["rouge", "meteor", "bertscore"],
"max_samples": {max_samples},
"return_input_data": true,
"return_predictions": true,
"storage_path": "{storage_path}"
}}
}}"""


# Inference templates

default_infer_template = """{{
@@ -153,4 +171,10 @@
"mistral://open-mistral-7b": oai_eval_template,
"llamafile://mistralai/Mistral-7B-Instruct-v0.2": oai_eval_template,
},
# TODO: Remove the old EVALUATION section and rename EVALUATION_LITE
# to EVALUATION after we deprecate evaluator. Also remove the
# unused templates above (all the eval templates except default)
JobType.EVALUATION_LITE: {
"default": default_eval_template,
},
}
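Note: the doubled braces in default_eval_template are str.format escapes, so rendering it with the eval_lite job params yields the JSON config consumed by eval_lite.py. A minimal sketch of that rendering with hypothetical values; the template body is copied from the diff above (reindented) so the snippet is self-contained.

import json

default_eval_template = """{{
    "name": "{job_name}/{job_id}",
    "model": {{ "path": "{model_uri}" }},
    "dataset": {{ "path": "{dataset_path}" }},
    "evaluation": {{
        "metrics": ["rouge", "meteor", "bertscore"],
        "max_samples": {max_samples},
        "return_input_data": true,
        "return_predictions": true,
        "storage_path": "{storage_path}"
    }}
}}"""

config = json.loads(
    default_eval_template.format(
        job_name="summarization-eval",
        job_id="1234",
        model_uri="hf://facebook/bart-large-cnn",
        dataset_path="s3://lumigator-storage/datasets/1234/dataset",
        max_samples=10,
        storage_path="s3://lumigator-storage/jobs/results/",
    )
)
assert config["evaluation"]["max_samples"] == 10  # the "{{" / "}}" pairs became literal JSON braces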
4 changes: 2 additions & 2 deletions lumigator/python/mzai/backend/backend/services/experiments.py
@@ -12,7 +12,7 @@
from lumigator_schemas.experiments import ExperimentCreate, ExperimentResponse
from lumigator_schemas.extras import ListingResponse
from lumigator_schemas.jobs import (
JobEvalCreate,
JobEvalLiteCreate,
JobInferenceCreate,
JobStatus,
)
@@ -138,7 +138,7 @@ def _run_eval(

# submit the job
self._job_service.create_job(
JobEvalCreate.model_validate(job_eval_dict), experiment_id=experiment_id
JobEvalLiteCreate.model_validate(job_eval_dict), experiment_id=experiment_id
)

def create_experiment(
28 changes: 25 additions & 3 deletions lumigator/python/mzai/backend/backend/services/jobs.py
@@ -7,6 +7,7 @@
from lumigator_schemas.jobs import (
JobConfig,
JobEvalCreate,
JobEvalLiteCreate,
JobInferenceCreate,
JobResponse,
JobResultDownloadResponse,
@@ -44,6 +45,13 @@ class JobService:
"ray_worker_gpus_fraction": settings.RAY_WORKER_GPUS_FRACTION,
"ray_worker_gpus": settings.RAY_WORKER_GPUS,
},
JobType.EVALUATION_LITE: {
"command": settings.EVALUATOR_LITE_COMMAND,
"pip": settings.EVALUATOR_LITE_PIP_REQS,
"work_dir": settings.EVALUATOR_LITE_WORK_DIR,
"ray_worker_gpus_fraction": settings.RAY_WORKER_GPUS_FRACTION,
"ray_worker_gpus": settings.RAY_WORKER_GPUS,
},
}

def __init__(
@@ -122,8 +130,9 @@ def _get_job_params(self, job_type: str, record, request: BaseModel) -> dict:
model_url = self._set_model_type(request)

# provide a reasonable system prompt for services where none was specified
if request.system_prompt is None and not request.model.startswith("hf://"):
request.system_prompt = settings.DEFAULT_SUMMARIZER_PROMPT
if job_type == JobType.EVALUATION or job_type == JobType.INFERENCE:
if request.system_prompt is None and not request.model.startswith("hf://"):
request.system_prompt = settings.DEFAULT_SUMMARIZER_PROMPT

# this section differs between inference and eval
if job_type == JobType.EVALUATION:
@@ -138,6 +147,15 @@ def _get_job_params(self, job_type: str, record, request: BaseModel) -> dict:
"system_prompt": request.system_prompt,
"skip_inference": request.skip_inference,
}
elif job_type == JobType.EVALUATION_LITE:
job_params = {
"job_id": record.id,
"job_name": request.name,
"model_uri": request.model,
"dataset_path": dataset_s3_path,
"max_samples": request.max_samples,
"storage_path": self.storage_path,
}
else:
job_params = {
"job_id": record.id,
@@ -164,11 +182,15 @@ def _get_job_params(self, job_type: str, record, request: BaseModel) -> dict:
return job_params

def create_job(
self, request: JobEvalCreate | JobInferenceCreate, experiment_id: UUID = None
self,
request: JobEvalCreate | JobEvalLiteCreate | JobInferenceCreate,
experiment_id: UUID = None,
) -> JobResponse:
"""Creates a new evaluation workload to run on Ray and returns the response status."""
if isinstance(request, JobEvalCreate):
job_type = JobType.EVALUATION
elif isinstance(request, JobEvalLiteCreate):
job_type = JobType.EVALUATION_LITE
elif isinstance(request, JobInferenceCreate):
job_type = JobType.INFERENCE
else:
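Note: the EVALUATION_LITE entry above supplies exactly what a Ray job submission needs (entrypoint command, pip requirements, working dir). The sketch below shows how those pieces could fit together with Ray's job submission client; the dashboard address is an assumption, and the real wiring, including how the rendered JSON config reaches eval_lite.py, lives in JobService and is not part of this hunk.

from ray.job_submission import JobSubmissionClient

client = JobSubmissionClient("http://localhost:8265")  # assumed Ray dashboard address

# Values mirror the EVALUATOR_LITE_* settings / env vars added in this PR
submission_id = client.submit_job(
    entrypoint="python eval_lite.py",  # the real entrypoint also passes the rendered config (not shown here)
    runtime_env={
        "working_dir": "/mzai/lumigator/python/mzai/jobs/evaluator_lite",
        "pip": "/mzai/lumigator/python/mzai/jobs/evaluator_lite/requirements.txt",  # Ray accepts a requirements.txt path
    },
)
print(submission_id)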
8 changes: 6 additions & 2 deletions lumigator/python/mzai/backend/backend/settings.py
@@ -43,8 +43,6 @@ class BackendSettings(BaseSettings):
# Eval job details
EVALUATOR_WORK_DIR: str | None = None
EVALUATOR_PIP_REQS: str | None = None
# TODO: change once we remove old eval
NEW_EVALUATOR_COMMAND: str = "python evaluator/evaluator.py"

@computed_field
@property
@@ -65,6 +63,12 @@ def EVALUATOR_COMMAND_WITH_LD_PRELOAD(self) -> str: # noqa: N802
"""
return f"{self.LD_PRELOAD_PREFIX} {self.NEW_EVALUATOR_COMMAND}"

# TODO: the following should all be refactored to EVALUATOR_* and the above should
# be removed when we deprecate evaluator
EVALUATOR_LITE_WORK_DIR: str | None = None
EVALUATOR_LITE_PIP_REQS: str | None = None
EVALUATOR_LITE_COMMAND: str = "python eval_lite.py"

# Inference job details
INFERENCE_WORK_DIR: str | None = None
INFERENCE_PIP_REQS: str | None = None
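Note: these are plain optional settings fields, so the EVALUATOR_LITE_* environment entries added to docker-compose.yaml earlier in this PR are what populate them at startup. A minimal sketch, assuming the settings class is built on pydantic-settings:

import os

from pydantic_settings import BaseSettings


class MiniSettings(BaseSettings):
    # Mirrors the three fields added above
    EVALUATOR_LITE_WORK_DIR: str | None = None
    EVALUATOR_LITE_PIP_REQS: str | None = None
    EVALUATOR_LITE_COMMAND: str = "python eval_lite.py"


os.environ["EVALUATOR_LITE_WORK_DIR"] = "/mzai/lumigator/python/mzai/jobs/evaluator_lite"
print(MiniSettings().EVALUATOR_LITE_WORK_DIR)  # read from the environment
print(MiniSettings().EVALUATOR_LITE_COMMAND)   # falls back to the declared default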
26 changes: 15 additions & 11 deletions lumigator/python/mzai/jobs/evaluator_lite/eval_lite.py
@@ -4,8 +4,8 @@
import click
import s3fs
from datasets import load_from_disk
from eval_config import EvalJobConfig
from eval_metrics import EvaluationMetrics
from evaluate_config import EvalJobConfig
from loguru import logger


@@ -16,18 +16,21 @@ def save_to_disk(local_path: Path, data_dict: dict):
json.dump(data_dict, f)


def save_to_s3(config: dict, local_path: Path, storage_path: str):
def save_to_s3(config: EvalJobConfig, local_path: Path, storage_path: str):
s3 = s3fs.S3FileSystem()
if storage_path.endswith("/"):
storage_path = "s3://" + str(Path(storage_path[5:]) / config.get("name") / "results.json")
logger.info(f"Storing evaluation results into {local_path}...")
storage_path = "s3://" + str(Path(storage_path[5:]) / config.name / "results.json")
logger.info(f"Storing evaluation results into {storage_path}...")
s3.put_file(local_path, storage_path)


def save_outputs(config: dict, eval_results: dict) -> Path:
storage_path = config.get("evaluation").get("storage_path")
def save_outputs(config: EvalJobConfig, eval_results: dict) -> Path:
storage_path = config.evaluation.storage_path

local_path = Path("results.json")
# generate local temp file ANYWAY:
# - if storage_path is not provided, it will be stored and kept in a default dir
# - if storage_path is provided AND saving to S3 is successful, local file is deleted
local_path = Path(Path.home() / ".lumigator" / "results" / config.name / "results.json")

try:
save_to_disk(local_path, eval_results)
@@ -58,11 +61,12 @@ def run_eval(config: EvalJobConfig) -> Path:
# Load dataset given its URI
dataset = load_from_disk(config.dataset.path)
logger.info(f"Retrieving {config.dataset.path} for evaluation")
if max_samples:

# Limit dataset length if max_samples is specified
if max_samples < 1 or max_samples > len(dataset):
logger.info(f"max_samples ({max_samples}) resized to dataset size ({len(dataset)})")
# select data between the minimum and total length of dataset
num_samples = range(min(max_samples, len(dataset)))
dataset = dataset.select(num_samples)
max_samples = len(dataset)
dataset = dataset.select(range(max_samples))

# run evaluation and append to results dict
predictions = dataset["predictions"]
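Note: the new clamping logic treats any max_samples outside [1, len(dataset)] (including the -1 default coming from JobEvalLiteCreate) as "evaluate the whole dataset". A standalone sketch of that behaviour:

def clamp_max_samples(max_samples: int, dataset_len: int) -> int:
    """Out-of-range values (e.g. the -1 default) fall back to the full dataset length."""
    if max_samples < 1 or max_samples > dataset_len:
        return dataset_len
    return max_samples


assert clamp_max_samples(-1, 100) == 100   # default: evaluate everything
assert clamp_max_samples(10, 100) == 10    # explicit cap is respected
assert clamp_max_samples(500, 100) == 100  # larger than the dataset -> whole dataset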
17 changes: 6 additions & 11 deletions lumigator/python/mzai/jobs/evaluator_lite/requirements.txt
@@ -1,22 +1,17 @@
accelerate==0.33.0
absl-py==2.1.0
bert_score==0.3.13
bitsandbytes==0.42.0
click>=8.1.7
datasets==2.20.0
einops==0.8.0
evaluate==0.4.2
loguru==0.7.2
mistralai==0.4.2
nltk==3.8.1
numpy<2.0.0
openai==1.38.0
protobuf>=3.20.2
platformdirs>=2.1
pydantic-yaml>=1.2.0
pydantic>=2.6.4
ray[default]==2.30.0
rouge-score==0.1.2
ruff==0.5.5
s3fs==2024.5.0
scipy==1.13.1
sentencepiece==0.2.0
transformers==4.43.4
urllib3>=1.26.18,<2
six>=1.14
transformers==4.48.0
urllib3==2.3.0
12 changes: 12 additions & 0 deletions lumigator/python/mzai/schemas/lumigator_schemas/jobs.py
@@ -9,6 +9,7 @@
class JobType(str, Enum):
INFERENCE = "inference"
EVALUATION = "evaluate"
EVALUATION_LITE = "eval_lite"


class JobStatus(str, Enum):
@@ -66,6 +67,17 @@ class JobEvalCreate(BaseModel):
skip_inference: bool = False


# TODO: this has to be renamed to JobEvalCreate and the code above
# has to be removed when we deprecate evaluator
class JobEvalLiteCreate(BaseModel):
name: str
description: str = ""
model: str
dataset: UUID
max_samples: int = -1 # set to all samples by default
config_template: str | None = None


class JobInferenceCreate(BaseModel):
name: str
description: str = ""
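Note: the new schema can be exercised directly; a small sketch follows (the field values are hypothetical, and the import path matches the one used in services/experiments.py above).

from uuid import uuid4

from lumigator_schemas.jobs import JobEvalLiteCreate

req = JobEvalLiteCreate.model_validate(
    {
        "name": "summarization-eval",
        "model": "hf://facebook/bart-large-cnn",  # hypothetical model URI, kept only for reference
        "dataset": str(uuid4()),  # UUIDs arrive as strings over HTTP and are coerced by pydantic
        # max_samples omitted -> defaults to -1, i.e. evaluate every sample
    }
)
print(req.max_samples)  # -1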