From b894b299db3a779ffc6e8940560b019485b02b61 Mon Sep 17 00:00:00 2001 From: "I. Vidal" Date: Fri, 17 Jan 2025 10:29:08 +0100 Subject: [PATCH] fix: crash if requested output field for inference doesn't exist in dataset (#624) * fix: crash if requested output field for inference doesn't exist in dataset * refactor: overwrite column is a warning, update output with dataset content * test: both empty and entirely missing ground-truth datasets --- .../mzai/backend/backend/tests/conftest.py | 9 ++- .../api/routes/test_api_workflows.py | 7 ++- .../python/mzai/jobs/inference/inference.py | 13 ++++- .../mzai/sample_data/dialogsum_mini_no_gt.csv | 55 +++++++++++++++++++ 4 files changed, 78 insertions(+), 6 deletions(-) create mode 100644 lumigator/python/mzai/sample_data/dialogsum_mini_no_gt.csv diff --git a/lumigator/python/mzai/backend/backend/tests/conftest.py b/lumigator/python/mzai/backend/backend/tests/conftest.py index e39385fa5..0e9541d47 100644 --- a/lumigator/python/mzai/backend/backend/tests/conftest.py +++ b/lumigator/python/mzai/backend/backend/tests/conftest.py @@ -94,12 +94,19 @@ def dialog_dataset(): @pytest.fixture(scope="function") -def dialog_no_gt_dataset(): +def dialog_empty_gt_dataset(): filename = common_resources_dir() / "sample_data" / "dialogsum_mini_empty_gt.csv" with Path(filename).open("rb") as f: yield f +@pytest.fixture(scope="function") +def dialog_no_gt_dataset(): + filename = common_resources_dir() / "sample_data" / "dialogsum_mini_no_gt.csv" + with Path(filename).open("rb") as f: + yield f + + @pytest.fixture(scope="session", autouse=True) def db_engine(): """Initialize a DB engine and create tables.""" diff --git a/lumigator/python/mzai/backend/backend/tests/integration/api/routes/test_api_workflows.py b/lumigator/python/mzai/backend/backend/tests/integration/api/routes/test_api_workflows.py index 126a5ba0b..b9b914c97 100644 --- a/lumigator/python/mzai/backend/backend/tests/integration/api/routes/test_api_workflows.py +++ b/lumigator/python/mzai/backend/backend/tests/integration/api/routes/test_api_workflows.py @@ -135,17 +135,20 @@ def test_upload_data_launch_job( logger.info(f"#{logs_infer_job_response_model.logs}#") +@pytest.mark.parametrize("unnanotated_dataset", ["dialog_empty_gt_dataset", "dialog_no_gt_dataset"]) def test_upload_data_no_gt_launch_annotation( + request: pytest.FixtureRequest, local_client: TestClient, - dialog_no_gt_dataset, + unnanotated_dataset, simple_eval_template, simple_infer_template, dependency_overrides_services, ): + dataset = request.getfixturevalue(unnanotated_dataset) create_response = local_client.post( "/datasets/", data={}, - files={"dataset": dialog_no_gt_dataset, "format": (None, DatasetFormat.JOB.value)}, + files={"dataset": dataset, "format": (None, DatasetFormat.JOB.value)}, ) assert create_response.status_code == 201 diff --git a/lumigator/python/mzai/jobs/inference/inference.py b/lumigator/python/mzai/jobs/inference/inference.py index e6aa8a5ad..6948e3aae 100644 --- a/lumigator/python/mzai/jobs/inference/inference.py +++ b/lumigator/python/mzai/jobs/inference/inference.py @@ -109,10 +109,17 @@ def run_inference(config: InferenceJobConfig) -> Path: else: raise NotImplementedError("Inference pipeline not supported.") - # run inference + # We keep any columns that were already there (not just the original input + # samples, but also past predictions under another column name) + output.update(dataset.to_dict()) + + # We are trusting the user: if the dataset already had a column with the output_field + # they selected, we overwrite it with the values from our inference. + + if config.job.output_field in dataset.column_names: + logger.warning(f"Overwriting {config.job.output_field}") + output[config.job.output_field] = predict(dataset_iterable, model_client) - output["examples"] = dataset["examples"] - output["ground_truth"] = dataset["ground_truth"] output["model"] = output_model_name output_path = save_outputs(config, output) diff --git a/lumigator/python/mzai/sample_data/dialogsum_mini_no_gt.csv b/lumigator/python/mzai/sample_data/dialogsum_mini_no_gt.csv new file mode 100644 index 000000000..42f9ee2f6 --- /dev/null +++ b/lumigator/python/mzai/sample_data/dialogsum_mini_no_gt.csv @@ -0,0 +1,55 @@ +examples +"#Person1#: Hello, how are you doing today? +#Person2#: I ' Ve been having trouble breathing lately. +#Person1#: Have you had any type of cold lately? +#Person2#: No, I haven ' t had a cold. I just have a heavy feeling in my chest when I try to breathe. +#Person1#: Do you have any allergies that you know of? +#Person2#: No, I don ' t have any allergies that I know of. +#Person1#: Does this happen all the time or mostly when you are active? +#Person2#: It happens a lot when I work out. +#Person1#: I am going to send you to a pulmonary specialist who can run tests on you for asthma. +#Person2#: Thank you for your help, doctor." +"#Person1#: Hey Jimmy. Let's go workout later today. +#Person2#: Sure. What time do you want to go? +#Person1#: How about at 3:30? +#Person2#: That sounds good. Today we work on Legs and forearm. +#Person1#: Hey. I just played basketball earlier, so my legs are a little sore. Let's work out on arms and stomach today. +#Person2#: I'm on a weekly schedule. You're messing everything up. +#Person1#: C'mon. We're only switching two days. You can do legs on Friday. +#Person2#: Aright. I'll meet you at the gym at 3:30 then." +"#Person1#: I need to stop eating such unhealthy foods. +#Person2#: I know what you mean. I've started eating better myself. +#Person1#: What foods do you eat now? +#Person2#: I tend to stick to fruits, vegetables, and chicken. +#Person1#: Those are the only things you eat? +#Person2#: That's basically what I eat. +#Person1#: Why aren't you eating anything else? +#Person2#: Well, fruits and vegetables are very healthy. +#Person1#: And the chicken? +#Person2#: It's really healthy to eat when you bake it. +#Person1#: I guess that does sound a lot healthier." +"#Person1#: Do you believe in UFOs? +#Person2#: Of course, they are out there. +#Person1#: But I never saw them. +#Person2#: Are you stupid? They are called UFOs, so not everybody can see them. +#Person1#: You mean that you can them. +#Person2#: That's right. I can see them in my dreams. +#Person1#: They come to the earth? +#Person2#: No. Their task is to send the aliens here from the outer space. +#Person1#: Aliens from the outer space? Do you talk to them? What do they look like? +#Person2#: OK, OK, one by one, please! They look like robots, but they can speak. Their mission is to make friends with human beings. +#Person1#: That means that you talk to them? In which language? +#Person2#: Of course in English, they learn English on Mars too. +#Person1#: Wow. Sounds fantastic!" +"#Person1#: Did you go to school today? +#Person2#: Of course. Did you? +#Person1#: I didn't want to, so I didn't. +#Person2#: That's sad, but have you gone to the movies recently? +#Person1#: That's a switch. +#Person2#: I'm serious, have you? +#Person1#: No, I haven't. Why? +#Person2#: I really want to go to the movies this weekend. +#Person1#: So go then. +#Person2#: I really don't want to go by myself. +#Person1#: Well anyway, do you plan on going to school tomorrow? +#Person2#: No, I think I'm going to go to the movies."