Skip to content

Commit

Permalink
Remove unnecessary filename field
Browse files Browse the repository at this point in the history
  • Loading branch information
dyastremsky committed Nov 8, 2024
1 parent 0304415 commit cdf624e
Show file tree
Hide file tree
Showing 13 changed files with 9 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,9 @@ def retrieve_data(self) -> GenericDataset:
if self.config.input_filename.is_dir():
files_data = self._get_input_datasets_from_dir()
else:
file_data = self._get_input_dataset_from_file(self.config.input_filename)
files_data = {file_data.filename: file_data}
input_file = self.config.input_filename
file_data = self._get_input_dataset_from_file(input_file)
files_data = {str(input_file): file_data}

return GenericDataset(files_data)

Expand Down Expand Up @@ -274,4 +275,4 @@ def _convert_content_to_data_file(
for image in images:
data_rows.append(DataRow(texts=[], images=[image]))

return FileData(str(filename), data_rows)
return FileData(data_rows)
7 changes: 3 additions & 4 deletions genai-perf/genai_perf/inputs/retrievers/generic_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,9 @@ def to_dict(self) -> DataRowDict:

@dataclass
class FileData:
filename: str
rows: List[DataRow]

def to_dict(self) -> Dict[Filename, List[DataRowDict]]:
def to_list(self) -> List[DataRowDict]:
"""
Converts the FileData object to a list of row dictionaries.
Output format example for two payloads from a file:
Expand All @@ -62,7 +61,7 @@ def to_dict(self) -> Dict[Filename, List[DataRowDict]]:
]
}
"""
return {self.filename: [row.to_dict() for row in self.rows]}
return [row.to_dict() for row in self.rows]


@dataclass
Expand All @@ -81,6 +80,6 @@ def to_dict(self) -> GenericDatasetDict:
}
"""
return {
filename: file_data.to_dict()[filename]
filename: file_data.to_list()
for filename, file_data in self.files_data.items()
}
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def retrieve_data(self) -> GenericDataset:

data_rows.append(row)

file_data = FileData(file, data_rows)
file_data = FileData(data_rows)

synthetic_dataset.files_data[file] = file_data

Expand Down
3 changes: 0 additions & 3 deletions genai-perf/tests/test_embeddings_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def test_convert_default(self):
generic_dataset = GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[DataRow(texts=["text_1"]), DataRow(texts=["text_2"])],
)
}
Expand Down Expand Up @@ -101,7 +100,6 @@ def test_convert_batched(self):
generic_dataset = GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[
DataRow(texts=["text_1", "text_2"]),
DataRow(texts=["text_3", "text_4"]),
Expand Down Expand Up @@ -149,7 +147,6 @@ def test_convert_with_request_parameters(self):
generic_dataset = GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[DataRow(texts=["text_1"]), DataRow(texts=["text_2"])],
)
}
Expand Down
10 changes: 0 additions & 10 deletions genai-perf/tests/test_file_input_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ def test_get_input_file_single_image(
)

assert file_data is not None
assert file_data.filename == "single_image.jsonl"
assert len(file_data.rows) == 1
assert file_data.rows[0].images[0] == "mock_base64_image"

Expand All @@ -131,7 +130,6 @@ def test_get_input_file_multiple_images(
)

assert file_data is not None
assert file_data.filename == "multiple_images.jsonl"
assert len(file_data.rows) == 3
expected_images = [
"mock_base64_image1",
Expand All @@ -157,7 +155,6 @@ def test_get_input_file_single_prompt(self, mock_file, mock_exists):
)

assert file_data is not None
assert file_data.filename == "single_prompt.jsonl"
assert len(file_data.rows) == 1
assert file_data.rows[0].texts[0] == "What is the capital of France?"

Expand All @@ -177,7 +174,6 @@ def test_get_input_file_multiple_prompts(self, mock_file, mock_exists):
)

assert file_data is not None
assert file_data.filename == "multiple_prompts.jsonl"
assert len(file_data.rows) == 3
expected_prompts = [
"What is the capital of France?",
Expand Down Expand Up @@ -210,7 +206,6 @@ def test_get_input_file_multi_modal(
)

assert file_data is not None
assert file_data.filename == "multi_modal.jsonl"
assert len(file_data.rows) == 2
assert file_data.rows[0].texts[0] == "What is this image?"
assert file_data.rows[0].images[0] == "mock_base64_image"
Expand All @@ -231,7 +226,6 @@ def test_get_input_file_deprecated_text_input(self, mock_file, mock_exists):
)

assert file_data is not None
assert file_data.filename == "deprecated_text_input.jsonl"
assert len(file_data.rows) == 2
assert file_data.rows[0].texts[0] == "Who is Albert Einstein?"
assert file_data.rows[1].texts[0] == "What is the speed of light?"
Expand Down Expand Up @@ -317,14 +311,12 @@ def test_get_input_datasets_from_dir(

assert len(file_data) == 4

assert file_data["single_prompt"].filename == "single_prompt.jsonl"
assert len(file_data["single_prompt"].rows) == 1
assert (
file_data["single_prompt"].rows[0].texts[0]
== "What is the capital of France?"
)

assert file_data["multiple_prompts"].filename == "multiple_prompts.jsonl"
assert len(file_data["multiple_prompts"].rows) == 3
expected_prompts = [
"What is the capital of France?",
Expand All @@ -334,11 +326,9 @@ def test_get_input_datasets_from_dir(
for i, prompt in enumerate(expected_prompts):
assert file_data["multiple_prompts"].rows[i].texts[0] == prompt

assert file_data["single_image"].filename == "single_image.jsonl"
assert len(file_data["single_image"].rows) == 1
assert file_data["single_image"].rows[0].images[0] == "mock_base64_image"

assert file_data["multi_modal"].filename == "multi_modal.jsonl"
assert len(file_data["multi_modal"].rows) == 2
assert file_data["multi_modal"].rows[0].texts[0] == "What is this image?"
assert file_data["multi_modal"].rows[0].images[0] == "mock_base64_image"
Expand Down
2 changes: 0 additions & 2 deletions genai-perf/tests/test_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ def test_data_retrieval_and_conversion(
generic_dataset = GenericDataset(
files_data={
"file1.jsonl": FileData(
filename="file1.jsonl",
rows=[DataRow(texts=["test input"], images=[])],
)
}
Expand Down Expand Up @@ -97,7 +96,6 @@ def test_write_json_to_file(
generic_dataset = GenericDataset(
files_data={
"file1.jsonl": FileData(
filename="file1.jsonl",
rows=[DataRow(texts=["test input one"], images=[])],
)
}
Expand Down
4 changes: 1 addition & 3 deletions genai-perf/tests/test_nvclip_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,7 @@ class TestNVClipConverter:

@staticmethod
def create_generic_dataset(rows) -> GenericDataset:
return GenericDataset(
files_data={"file1": FileData(filename="file1", rows=rows)}
)
return GenericDataset(files_data={"file1": FileData(rows)})

def test_convert_default(self):
generic_dataset = self.create_generic_dataset(
Expand Down
1 change: 0 additions & 1 deletion genai-perf/tests/test_openai_chat_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ def clean_image(row):
return GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[
DataRow(texts=clean_text(row), images=clean_image(row))
for row in rows
Expand Down
1 change: 0 additions & 1 deletion genai-perf/tests/test_openai_completions_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def create_generic_dataset() -> GenericDataset:
return GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[
DataRow(texts=["text input one"]),
DataRow(texts=["text input two"]),
Expand Down
2 changes: 0 additions & 2 deletions genai-perf/tests/test_rankings_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,11 @@ def create_generic_dataset(

if queries_data is not None:
files_data["queries"] = FileData(
filename="queries",
rows=[DataRow(texts=query) for query in queries_data],
)

if passages_data is not None:
files_data["passages"] = FileData(
filename="passages",
rows=[DataRow(texts=passage) for passage in passages_data],
)

Expand Down
1 change: 0 additions & 1 deletion genai-perf/tests/test_tensorrtllm_engine_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ def create_generic_dataset() -> GenericDataset:
return GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[
DataRow(texts=["text input one"]),
DataRow(texts=["text input two"]),
Expand Down
1 change: 0 additions & 1 deletion genai-perf/tests/test_triton_tensorrtllm_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def create_generic_dataset():
return GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[
DataRow(texts=["text input one"]),
DataRow(texts=["text input two"]),
Expand Down
1 change: 0 additions & 1 deletion genai-perf/tests/test_triton_vllm_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def create_generic_dataset():
return GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[
DataRow(texts=["text input one"]),
DataRow(texts=["text input two"]),
Expand Down

0 comments on commit cdf624e

Please sign in to comment.