Remove unnecessary filename field in FileData (#173)
dyastremsky authored Nov 13, 2024
1 parent 0304415 commit 1e0c897
Showing 13 changed files with 14 additions and 42 deletions.
@@ -64,8 +64,9 @@ def retrieve_data(self) -> GenericDataset:
         if self.config.input_filename.is_dir():
             files_data = self._get_input_datasets_from_dir()
         else:
-            file_data = self._get_input_dataset_from_file(self.config.input_filename)
-            files_data = {file_data.filename: file_data}
+            input_file = self.config.input_filename
+            file_data = self._get_input_dataset_from_file(input_file)
+            files_data = {str(input_file): file_data}
 
         return GenericDataset(files_data)

@@ -274,4 +275,4 @@ def _convert_content_to_data_file(
         for image in images:
             data_rows.append(DataRow(texts=[], images=[image]))
 
-        return FileData(str(filename), data_rows)
+        return FileData(data_rows)
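
With the filename field removed, the retriever now owns the filename-to-FileData mapping rather than reading it back off the FileData object. A minimal sketch of the new construction pattern, assuming DataRow, FileData, and GenericDataset are all importable from generic_dataset.py as the diff suggests; the path and row contents below are purely illustrative:

from pathlib import Path

from genai_perf.inputs.retrievers.generic_dataset import (
    DataRow,
    FileData,
    GenericDataset,
)

# Hypothetical input file; any path works as the dictionary key once stringified.
input_file = Path("inputs/prompts.jsonl")

# FileData now carries only rows; the filename lives in the surrounding dict key.
file_data = FileData(rows=[DataRow(texts=["example prompt"], images=[])])
files_data = {str(input_file): file_data}

dataset = GenericDataset(files_data)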
21 changes: 8 additions & 13 deletions genai-perf/genai_perf/inputs/retrievers/generic_dataset.py
@@ -48,21 +48,18 @@ def to_dict(self) -> DataRowDict:
 
 @dataclass
 class FileData:
-    filename: str
     rows: List[DataRow]
 
-    def to_dict(self) -> Dict[Filename, List[DataRowDict]]:
+    def to_list(self) -> List[DataRowDict]:
         """
-        Converts the FileData object to a dictionary.
+        Converts the FileData object to a list.
         Output format example for two payloads from a file:
-        {
-            'file_0': [
-                {'texts': ['text1', 'text2'], 'images': ['image1', 'image2']},
-                {'texts': ['text3', 'text4'], 'images': ['image3', 'image4']}
-            ]
-        }
+        [
+            {'texts': ['text1', 'text2'], 'images': ['image1', 'image2']},
+            {'texts': ['text3', 'text4'], 'images': ['image3', 'image4']}
+        ]
         """
-        return {self.filename: [row.to_dict() for row in self.rows]}
+        return [row.to_dict() for row in self.rows]
 
 
 @dataclass
@@ -74,13 +71,11 @@ def to_dict(self) -> GenericDatasetDict:
         Converts the entire DataStructure object to a dictionary.
         Output format example for one payload from two files:
         {
-            {
            'file_0': [{'texts': ['text1', 'text2'], 'images': ['image1', 'image2']}],
            'file_1': [{'texts': ['text1', 'text2'], 'images': ['image1', 'image2']}]
-            }
         }
         """
         return {
-            filename: file_data.to_dict()[filename]
+            filename: file_data.to_list()
             for filename, file_data in self.files_data.items()
         }
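
Per the updated docstrings, serialization now splits into two levels: FileData.to_list() returns only the row dictionaries, and GenericDataset.to_dict() re-attaches the filename keys. A small usage sketch of the expected shapes, with the 'file_0' key and sample values echoing the docstring examples; the import path is assumed from the file location above, and the actual DataRow.to_dict() output may carry additional keys:

from genai_perf.inputs.retrievers.generic_dataset import DataRow, FileData, GenericDataset

file_data = FileData(rows=[DataRow(texts=["text1"], images=["image1"])])

# Per-file serialization no longer nests the filename inside the payload.
print(file_data.to_list())
# expected (per the docstring example): [{'texts': ['text1'], 'images': ['image1']}]

# The dataset-level dict keys each file's row list by its filename.
dataset = GenericDataset(files_data={"file_0": file_data})
print(dataset.to_dict())
# expected: {'file_0': [{'texts': ['text1'], 'images': ['image1']}]}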
@@ -76,7 +76,7 @@ def retrieve_data(self) -> GenericDataset:
 
                 data_rows.append(row)
 
-            file_data = FileData(file, data_rows)
+            file_data = FileData(data_rows)
 
             synthetic_dataset.files_data[file] = file_data
 
3 changes: 0 additions & 3 deletions genai-perf/tests/test_embeddings_converter.py
@@ -58,7 +58,6 @@ def test_convert_default(self):
         generic_dataset = GenericDataset(
             files_data={
                 "file1": FileData(
-                    filename="file1",
                     rows=[DataRow(texts=["text_1"]), DataRow(texts=["text_2"])],
                 )
             }

@@ -101,7 +100,6 @@ def test_convert_batched(self):
         generic_dataset = GenericDataset(
             files_data={
                 "file1": FileData(
-                    filename="file1",
                     rows=[
                         DataRow(texts=["text_1", "text_2"]),
                         DataRow(texts=["text_3", "text_4"]),

@@ -149,7 +147,6 @@ def test_convert_with_request_parameters(self):
         generic_dataset = GenericDataset(
             files_data={
                 "file1": FileData(
-                    filename="file1",
                     rows=[DataRow(texts=["text_1"]), DataRow(texts=["text_2"])],
                 )
             }
10 changes: 0 additions & 10 deletions genai-perf/tests/test_file_input_retriever.py
@@ -104,7 +104,6 @@ def test_get_input_file_single_image(
         )
 
         assert file_data is not None
-        assert file_data.filename == "single_image.jsonl"
         assert len(file_data.rows) == 1
         assert file_data.rows[0].images[0] == "mock_base64_image"
 
@@ -131,7 +130,6 @@ def test_get_input_file_multiple_images(
         )
 
         assert file_data is not None
-        assert file_data.filename == "multiple_images.jsonl"
         assert len(file_data.rows) == 3
         expected_images = [
             "mock_base64_image1",
@@ -157,7 +155,6 @@ def test_get_input_file_single_prompt(self, mock_file, mock_exists):
         )
 
         assert file_data is not None
-        assert file_data.filename == "single_prompt.jsonl"
         assert len(file_data.rows) == 1
         assert file_data.rows[0].texts[0] == "What is the capital of France?"
 
@@ -177,7 +174,6 @@ def test_get_input_file_multiple_prompts(self, mock_file, mock_exists):
         )
 
         assert file_data is not None
-        assert file_data.filename == "multiple_prompts.jsonl"
         assert len(file_data.rows) == 3
         expected_prompts = [
             "What is the capital of France?",
@@ -210,7 +206,6 @@ def test_get_input_file_multi_modal(
         )
 
         assert file_data is not None
-        assert file_data.filename == "multi_modal.jsonl"
         assert len(file_data.rows) == 2
         assert file_data.rows[0].texts[0] == "What is this image?"
         assert file_data.rows[0].images[0] == "mock_base64_image"
@@ -231,7 +226,6 @@ def test_get_input_file_deprecated_text_input(self, mock_file, mock_exists):
         )
 
         assert file_data is not None
-        assert file_data.filename == "deprecated_text_input.jsonl"
         assert len(file_data.rows) == 2
         assert file_data.rows[0].texts[0] == "Who is Albert Einstein?"
         assert file_data.rows[1].texts[0] == "What is the speed of light?"
@@ -317,14 +311,12 @@ def test_get_input_datasets_from_dir(
 
         assert len(file_data) == 4
 
-        assert file_data["single_prompt"].filename == "single_prompt.jsonl"
         assert len(file_data["single_prompt"].rows) == 1
         assert (
             file_data["single_prompt"].rows[0].texts[0]
             == "What is the capital of France?"
         )
 
-        assert file_data["multiple_prompts"].filename == "multiple_prompts.jsonl"
         assert len(file_data["multiple_prompts"].rows) == 3
         expected_prompts = [
             "What is the capital of France?",
@@ -334,11 +326,9 @@
         for i, prompt in enumerate(expected_prompts):
             assert file_data["multiple_prompts"].rows[i].texts[0] == prompt
 
-        assert file_data["single_image"].filename == "single_image.jsonl"
         assert len(file_data["single_image"].rows) == 1
         assert file_data["single_image"].rows[0].images[0] == "mock_base64_image"
 
-        assert file_data["multi_modal"].filename == "multi_modal.jsonl"
         assert len(file_data["multi_modal"].rows) == 2
         assert file_data["multi_modal"].rows[0].texts[0] == "What is this image?"
         assert file_data["multi_modal"].rows[0].images[0] == "mock_base64_image"
2 changes: 0 additions & 2 deletions genai-perf/tests/test_inputs.py
@@ -50,7 +50,6 @@ def test_data_retrieval_and_conversion(
         generic_dataset = GenericDataset(
             files_data={
                 "file1.jsonl": FileData(
-                    filename="file1.jsonl",
                     rows=[DataRow(texts=["test input"], images=[])],
                 )
             }

@@ -97,7 +96,6 @@ def test_write_json_to_file(
         generic_dataset = GenericDataset(
             files_data={
                 "file1.jsonl": FileData(
-                    filename="file1.jsonl",
                     rows=[DataRow(texts=["test input one"], images=[])],
                 )
             }
4 changes: 1 addition & 3 deletions genai-perf/tests/test_nvclip_converter.py
@@ -41,9 +41,7 @@ class TestNVClipConverter:
 
     @staticmethod
     def create_generic_dataset(rows) -> GenericDataset:
-        return GenericDataset(
-            files_data={"file1": FileData(filename="file1", rows=rows)}
-        )
+        return GenericDataset(files_data={"file1": FileData(rows)})
 
     def test_convert_default(self):
         generic_dataset = self.create_generic_dataset(
1 change: 0 additions & 1 deletion genai-perf/tests/test_openai_chat_converter.py
@@ -61,7 +61,6 @@ def clean_image(row):
         return GenericDataset(
             files_data={
                 "file1": FileData(
-                    filename="file1",
                     rows=[
                         DataRow(texts=clean_text(row), images=clean_image(row))
                         for row in rows
1 change: 0 additions & 1 deletion genai-perf/tests/test_openai_completions_converter.py
@@ -47,7 +47,6 @@ def create_generic_dataset() -> GenericDataset:
         return GenericDataset(
             files_data={
                 "file1": FileData(
-                    filename="file1",
                     rows=[
                         DataRow(texts=["text input one"]),
                         DataRow(texts=["text input two"]),
2 changes: 0 additions & 2 deletions genai-perf/tests/test_rankings_converter.py
@@ -49,13 +49,11 @@ def create_generic_dataset(
 
         if queries_data is not None:
             files_data["queries"] = FileData(
-                filename="queries",
                 rows=[DataRow(texts=query) for query in queries_data],
             )
 
         if passages_data is not None:
             files_data["passages"] = FileData(
-                filename="passages",
                 rows=[DataRow(texts=passage) for passage in passages_data],
             )
 
1 change: 0 additions & 1 deletion genai-perf/tests/test_tensorrtllm_engine_converter.py
@@ -49,7 +49,6 @@ def create_generic_dataset() -> GenericDataset:
         return GenericDataset(
             files_data={
                 "file1": FileData(
-                    filename="file1",
                     rows=[
                         DataRow(texts=["text input one"]),
                         DataRow(texts=["text input two"]),
1 change: 0 additions & 1 deletion genai-perf/tests/test_triton_tensorrtllm_converter.py
@@ -47,7 +47,6 @@ def create_generic_dataset():
         return GenericDataset(
             files_data={
                 "file1": FileData(
-                    filename="file1",
                     rows=[
                         DataRow(texts=["text input one"]),
                         DataRow(texts=["text input two"]),
1 change: 0 additions & 1 deletion genai-perf/tests/test_triton_vllm_converter.py
@@ -45,7 +45,6 @@ def create_generic_dataset():
         return GenericDataset(
             files_data={
                 "file1": FileData(
-                    filename="file1",
                     rows=[
                         DataRow(texts=["text input one"]),
                         DataRow(texts=["text input two"]),
