From 1e0c89761bb77c1029dd89dafb17bf2d0fcb59a6 Mon Sep 17 00:00:00 2001 From: David Yastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Tue, 12 Nov 2024 16:24:10 -0800 Subject: [PATCH] Remove unnecessary filename field in FileData (#173) --- .../inputs/retrievers/file_input_retriever.py | 7 ++++--- .../inputs/retrievers/generic_dataset.py | 21 +++++++------------ .../retrievers/synthetic_data_retriever.py | 2 +- genai-perf/tests/test_embeddings_converter.py | 3 --- genai-perf/tests/test_file_input_retriever.py | 10 --------- genai-perf/tests/test_inputs.py | 2 -- genai-perf/tests/test_nvclip_converter.py | 4 +--- .../tests/test_openai_chat_converter.py | 1 - .../test_openai_completions_converter.py | 1 - genai-perf/tests/test_rankings_converter.py | 2 -- .../test_tensorrtllm_engine_converter.py | 1 - .../test_triton_tensorrtllm_converter.py | 1 - .../tests/test_triton_vllm_converter.py | 1 - 13 files changed, 14 insertions(+), 42 deletions(-) diff --git a/genai-perf/genai_perf/inputs/retrievers/file_input_retriever.py b/genai-perf/genai_perf/inputs/retrievers/file_input_retriever.py index 51e00d81..30cb345f 100644 --- a/genai-perf/genai_perf/inputs/retrievers/file_input_retriever.py +++ b/genai-perf/genai_perf/inputs/retrievers/file_input_retriever.py @@ -64,8 +64,9 @@ def retrieve_data(self) -> GenericDataset: if self.config.input_filename.is_dir(): files_data = self._get_input_datasets_from_dir() else: - file_data = self._get_input_dataset_from_file(self.config.input_filename) - files_data = {file_data.filename: file_data} + input_file = self.config.input_filename + file_data = self._get_input_dataset_from_file(input_file) + files_data = {str(input_file): file_data} return GenericDataset(files_data) @@ -274,4 +275,4 @@ def _convert_content_to_data_file( for image in images: data_rows.append(DataRow(texts=[], images=[image])) - return FileData(str(filename), data_rows) + return FileData(data_rows) diff --git a/genai-perf/genai_perf/inputs/retrievers/generic_dataset.py b/genai-perf/genai_perf/inputs/retrievers/generic_dataset.py index 126c2dd2..a38b2b25 100644 --- a/genai-perf/genai_perf/inputs/retrievers/generic_dataset.py +++ b/genai-perf/genai_perf/inputs/retrievers/generic_dataset.py @@ -48,21 +48,18 @@ def to_dict(self) -> DataRowDict: @dataclass class FileData: - filename: str rows: List[DataRow] - def to_dict(self) -> Dict[Filename, List[DataRowDict]]: + def to_list(self) -> List[DataRowDict]: """ - Converts the FileData object to a dictionary. + Converts the FileData object to a list. Output format example for two payloads from a file: - { - 'file_0': [ - {'texts': ['text1', 'text2'], 'images': ['image1', 'image2']}, - {'texts': ['text3', 'text4'], 'images': ['image3', 'image4']} - ] - } + [ + {'texts': ['text1', 'text2'], 'images': ['image1', 'image2']}, + {'texts': ['text3', 'text4'], 'images': ['image3', 'image4']} + ] """ - return {self.filename: [row.to_dict() for row in self.rows]} + return [row.to_dict() for row in self.rows] @dataclass @@ -74,13 +71,11 @@ def to_dict(self) -> GenericDatasetDict: Converts the entire DataStructure object to a dictionary. Output format example for one payload from two files: { - { 'file_0': [{'texts': ['text1', 'text2'], 'images': ['image1', 'image2']}], 'file_1': [{'texts': ['text1', 'text2'], 'images': ['image1', 'image2']}] } - } """ return { - filename: file_data.to_dict()[filename] + filename: file_data.to_list() for filename, file_data in self.files_data.items() } diff --git a/genai-perf/genai_perf/inputs/retrievers/synthetic_data_retriever.py b/genai-perf/genai_perf/inputs/retrievers/synthetic_data_retriever.py index 9813001b..3a9e7cf9 100644 --- a/genai-perf/genai_perf/inputs/retrievers/synthetic_data_retriever.py +++ b/genai-perf/genai_perf/inputs/retrievers/synthetic_data_retriever.py @@ -76,7 +76,7 @@ def retrieve_data(self) -> GenericDataset: data_rows.append(row) - file_data = FileData(file, data_rows) + file_data = FileData(data_rows) synthetic_dataset.files_data[file] = file_data diff --git a/genai-perf/tests/test_embeddings_converter.py b/genai-perf/tests/test_embeddings_converter.py index 65db90ef..9e22ab14 100644 --- a/genai-perf/tests/test_embeddings_converter.py +++ b/genai-perf/tests/test_embeddings_converter.py @@ -58,7 +58,6 @@ def test_convert_default(self): generic_dataset = GenericDataset( files_data={ "file1": FileData( - filename="file1", rows=[DataRow(texts=["text_1"]), DataRow(texts=["text_2"])], ) } @@ -101,7 +100,6 @@ def test_convert_batched(self): generic_dataset = GenericDataset( files_data={ "file1": FileData( - filename="file1", rows=[ DataRow(texts=["text_1", "text_2"]), DataRow(texts=["text_3", "text_4"]), @@ -149,7 +147,6 @@ def test_convert_with_request_parameters(self): generic_dataset = GenericDataset( files_data={ "file1": FileData( - filename="file1", rows=[DataRow(texts=["text_1"]), DataRow(texts=["text_2"])], ) } diff --git a/genai-perf/tests/test_file_input_retriever.py b/genai-perf/tests/test_file_input_retriever.py index 446e7bf8..77cca3fc 100644 --- a/genai-perf/tests/test_file_input_retriever.py +++ b/genai-perf/tests/test_file_input_retriever.py @@ -104,7 +104,6 @@ def test_get_input_file_single_image( ) assert file_data is not None - assert file_data.filename == "single_image.jsonl" assert len(file_data.rows) == 1 assert file_data.rows[0].images[0] == "mock_base64_image" @@ -131,7 +130,6 @@ def test_get_input_file_multiple_images( ) assert file_data is not None - assert file_data.filename == "multiple_images.jsonl" assert len(file_data.rows) == 3 expected_images = [ "mock_base64_image1", @@ -157,7 +155,6 @@ def test_get_input_file_single_prompt(self, mock_file, mock_exists): ) assert file_data is not None - assert file_data.filename == "single_prompt.jsonl" assert len(file_data.rows) == 1 assert file_data.rows[0].texts[0] == "What is the capital of France?" @@ -177,7 +174,6 @@ def test_get_input_file_multiple_prompts(self, mock_file, mock_exists): ) assert file_data is not None - assert file_data.filename == "multiple_prompts.jsonl" assert len(file_data.rows) == 3 expected_prompts = [ "What is the capital of France?", @@ -210,7 +206,6 @@ def test_get_input_file_multi_modal( ) assert file_data is not None - assert file_data.filename == "multi_modal.jsonl" assert len(file_data.rows) == 2 assert file_data.rows[0].texts[0] == "What is this image?" assert file_data.rows[0].images[0] == "mock_base64_image" @@ -231,7 +226,6 @@ def test_get_input_file_deprecated_text_input(self, mock_file, mock_exists): ) assert file_data is not None - assert file_data.filename == "deprecated_text_input.jsonl" assert len(file_data.rows) == 2 assert file_data.rows[0].texts[0] == "Who is Albert Einstein?" assert file_data.rows[1].texts[0] == "What is the speed of light?" @@ -317,14 +311,12 @@ def test_get_input_datasets_from_dir( assert len(file_data) == 4 - assert file_data["single_prompt"].filename == "single_prompt.jsonl" assert len(file_data["single_prompt"].rows) == 1 assert ( file_data["single_prompt"].rows[0].texts[0] == "What is the capital of France?" ) - assert file_data["multiple_prompts"].filename == "multiple_prompts.jsonl" assert len(file_data["multiple_prompts"].rows) == 3 expected_prompts = [ "What is the capital of France?", @@ -334,11 +326,9 @@ def test_get_input_datasets_from_dir( for i, prompt in enumerate(expected_prompts): assert file_data["multiple_prompts"].rows[i].texts[0] == prompt - assert file_data["single_image"].filename == "single_image.jsonl" assert len(file_data["single_image"].rows) == 1 assert file_data["single_image"].rows[0].images[0] == "mock_base64_image" - assert file_data["multi_modal"].filename == "multi_modal.jsonl" assert len(file_data["multi_modal"].rows) == 2 assert file_data["multi_modal"].rows[0].texts[0] == "What is this image?" assert file_data["multi_modal"].rows[0].images[0] == "mock_base64_image" diff --git a/genai-perf/tests/test_inputs.py b/genai-perf/tests/test_inputs.py index 3c698207..0db6fbc1 100644 --- a/genai-perf/tests/test_inputs.py +++ b/genai-perf/tests/test_inputs.py @@ -50,7 +50,6 @@ def test_data_retrieval_and_conversion( generic_dataset = GenericDataset( files_data={ "file1.jsonl": FileData( - filename="file1.jsonl", rows=[DataRow(texts=["test input"], images=[])], ) } @@ -97,7 +96,6 @@ def test_write_json_to_file( generic_dataset = GenericDataset( files_data={ "file1.jsonl": FileData( - filename="file1.jsonl", rows=[DataRow(texts=["test input one"], images=[])], ) } diff --git a/genai-perf/tests/test_nvclip_converter.py b/genai-perf/tests/test_nvclip_converter.py index ba320e01..db87924f 100644 --- a/genai-perf/tests/test_nvclip_converter.py +++ b/genai-perf/tests/test_nvclip_converter.py @@ -41,9 +41,7 @@ class TestNVClipConverter: @staticmethod def create_generic_dataset(rows) -> GenericDataset: - return GenericDataset( - files_data={"file1": FileData(filename="file1", rows=rows)} - ) + return GenericDataset(files_data={"file1": FileData(rows)}) def test_convert_default(self): generic_dataset = self.create_generic_dataset( diff --git a/genai-perf/tests/test_openai_chat_converter.py b/genai-perf/tests/test_openai_chat_converter.py index 73533201..362b7f89 100644 --- a/genai-perf/tests/test_openai_chat_converter.py +++ b/genai-perf/tests/test_openai_chat_converter.py @@ -61,7 +61,6 @@ def clean_image(row): return GenericDataset( files_data={ "file1": FileData( - filename="file1", rows=[ DataRow(texts=clean_text(row), images=clean_image(row)) for row in rows diff --git a/genai-perf/tests/test_openai_completions_converter.py b/genai-perf/tests/test_openai_completions_converter.py index ef8e3299..52b08ddb 100644 --- a/genai-perf/tests/test_openai_completions_converter.py +++ b/genai-perf/tests/test_openai_completions_converter.py @@ -47,7 +47,6 @@ def create_generic_dataset() -> GenericDataset: return GenericDataset( files_data={ "file1": FileData( - filename="file1", rows=[ DataRow(texts=["text input one"]), DataRow(texts=["text input two"]), diff --git a/genai-perf/tests/test_rankings_converter.py b/genai-perf/tests/test_rankings_converter.py index 6ddcb6d9..ae5596b9 100644 --- a/genai-perf/tests/test_rankings_converter.py +++ b/genai-perf/tests/test_rankings_converter.py @@ -49,13 +49,11 @@ def create_generic_dataset( if queries_data is not None: files_data["queries"] = FileData( - filename="queries", rows=[DataRow(texts=query) for query in queries_data], ) if passages_data is not None: files_data["passages"] = FileData( - filename="passages", rows=[DataRow(texts=passage) for passage in passages_data], ) diff --git a/genai-perf/tests/test_tensorrtllm_engine_converter.py b/genai-perf/tests/test_tensorrtllm_engine_converter.py index bda65691..1de20d13 100644 --- a/genai-perf/tests/test_tensorrtllm_engine_converter.py +++ b/genai-perf/tests/test_tensorrtllm_engine_converter.py @@ -49,7 +49,6 @@ def create_generic_dataset() -> GenericDataset: return GenericDataset( files_data={ "file1": FileData( - filename="file1", rows=[ DataRow(texts=["text input one"]), DataRow(texts=["text input two"]), diff --git a/genai-perf/tests/test_triton_tensorrtllm_converter.py b/genai-perf/tests/test_triton_tensorrtllm_converter.py index e98f3cc5..95d3315a 100644 --- a/genai-perf/tests/test_triton_tensorrtllm_converter.py +++ b/genai-perf/tests/test_triton_tensorrtllm_converter.py @@ -47,7 +47,6 @@ def create_generic_dataset(): return GenericDataset( files_data={ "file1": FileData( - filename="file1", rows=[ DataRow(texts=["text input one"]), DataRow(texts=["text input two"]), diff --git a/genai-perf/tests/test_triton_vllm_converter.py b/genai-perf/tests/test_triton_vllm_converter.py index 1c173943..1231c5d7 100644 --- a/genai-perf/tests/test_triton_vllm_converter.py +++ b/genai-perf/tests/test_triton_vllm_converter.py @@ -45,7 +45,6 @@ def create_generic_dataset(): return GenericDataset( files_data={ "file1": FileData( - filename="file1", rows=[ DataRow(texts=["text input one"]), DataRow(texts=["text input two"]),