Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove unnecessary filename field in FileData #173

Merged
merged 3 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,9 @@ def retrieve_data(self) -> GenericDataset:
if self.config.input_filename.is_dir():
files_data = self._get_input_datasets_from_dir()
else:
file_data = self._get_input_dataset_from_file(self.config.input_filename)
files_data = {file_data.filename: file_data}
input_file = self.config.input_filename
file_data = self._get_input_dataset_from_file(input_file)
files_data = {str(input_file): file_data}

return GenericDataset(files_data)

Expand Down Expand Up @@ -274,4 +275,4 @@ def _convert_content_to_data_file(
for image in images:
data_rows.append(DataRow(texts=[], images=[image]))

return FileData(str(filename), data_rows)
return FileData(data_rows)
7 changes: 3 additions & 4 deletions genai-perf/genai_perf/inputs/retrievers/generic_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,9 @@ def to_dict(self) -> DataRowDict:

@dataclass
class FileData:
filename: str
rows: List[DataRow]

def to_dict(self) -> Dict[Filename, List[DataRowDict]]:
def to_list(self) -> List[DataRowDict]:
"""
Converts the FileData object to a list of row dictionaries.
nv-hwoo marked this conversation as resolved.
Show resolved Hide resolved
Output format example for two payloads from a file:
Expand All @@ -62,7 +61,7 @@ def to_dict(self) -> Dict[Filename, List[DataRowDict]]:
]
}
"""
return {self.filename: [row.to_dict() for row in self.rows]}
return [row.to_dict() for row in self.rows]


@dataclass
Expand All @@ -81,6 +80,6 @@ def to_dict(self) -> GenericDatasetDict:
}
nv-hwoo marked this conversation as resolved.
Show resolved Hide resolved
"""
return {
filename: file_data.to_dict()[filename]
filename: file_data.to_list()
for filename, file_data in self.files_data.items()
}
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def retrieve_data(self) -> GenericDataset:

data_rows.append(row)

file_data = FileData(file, data_rows)
file_data = FileData(data_rows)

synthetic_dataset.files_data[file] = file_data

Expand Down
3 changes: 0 additions & 3 deletions genai-perf/tests/test_embeddings_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def test_convert_default(self):
generic_dataset = GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[DataRow(texts=["text_1"]), DataRow(texts=["text_2"])],
)
}
Expand Down Expand Up @@ -101,7 +100,6 @@ def test_convert_batched(self):
generic_dataset = GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[
DataRow(texts=["text_1", "text_2"]),
DataRow(texts=["text_3", "text_4"]),
Expand Down Expand Up @@ -149,7 +147,6 @@ def test_convert_with_request_parameters(self):
generic_dataset = GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[DataRow(texts=["text_1"]), DataRow(texts=["text_2"])],
)
}
Expand Down
10 changes: 0 additions & 10 deletions genai-perf/tests/test_file_input_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ def test_get_input_file_single_image(
)

assert file_data is not None
assert file_data.filename == "single_image.jsonl"
assert len(file_data.rows) == 1
assert file_data.rows[0].images[0] == "mock_base64_image"

Expand All @@ -131,7 +130,6 @@ def test_get_input_file_multiple_images(
)

assert file_data is not None
assert file_data.filename == "multiple_images.jsonl"
assert len(file_data.rows) == 3
expected_images = [
"mock_base64_image1",
Expand All @@ -157,7 +155,6 @@ def test_get_input_file_single_prompt(self, mock_file, mock_exists):
)

assert file_data is not None
assert file_data.filename == "single_prompt.jsonl"
assert len(file_data.rows) == 1
assert file_data.rows[0].texts[0] == "What is the capital of France?"

Expand All @@ -177,7 +174,6 @@ def test_get_input_file_multiple_prompts(self, mock_file, mock_exists):
)

assert file_data is not None
assert file_data.filename == "multiple_prompts.jsonl"
assert len(file_data.rows) == 3
expected_prompts = [
"What is the capital of France?",
Expand Down Expand Up @@ -210,7 +206,6 @@ def test_get_input_file_multi_modal(
)

assert file_data is not None
assert file_data.filename == "multi_modal.jsonl"
assert len(file_data.rows) == 2
assert file_data.rows[0].texts[0] == "What is this image?"
assert file_data.rows[0].images[0] == "mock_base64_image"
Expand All @@ -231,7 +226,6 @@ def test_get_input_file_deprecated_text_input(self, mock_file, mock_exists):
)

assert file_data is not None
assert file_data.filename == "deprecated_text_input.jsonl"
assert len(file_data.rows) == 2
assert file_data.rows[0].texts[0] == "Who is Albert Einstein?"
assert file_data.rows[1].texts[0] == "What is the speed of light?"
Expand Down Expand Up @@ -317,14 +311,12 @@ def test_get_input_datasets_from_dir(

assert len(file_data) == 4

assert file_data["single_prompt"].filename == "single_prompt.jsonl"
assert len(file_data["single_prompt"].rows) == 1
assert (
file_data["single_prompt"].rows[0].texts[0]
== "What is the capital of France?"
)

assert file_data["multiple_prompts"].filename == "multiple_prompts.jsonl"
assert len(file_data["multiple_prompts"].rows) == 3
expected_prompts = [
"What is the capital of France?",
Expand All @@ -334,11 +326,9 @@ def test_get_input_datasets_from_dir(
for i, prompt in enumerate(expected_prompts):
assert file_data["multiple_prompts"].rows[i].texts[0] == prompt

assert file_data["single_image"].filename == "single_image.jsonl"
assert len(file_data["single_image"].rows) == 1
assert file_data["single_image"].rows[0].images[0] == "mock_base64_image"

assert file_data["multi_modal"].filename == "multi_modal.jsonl"
assert len(file_data["multi_modal"].rows) == 2
assert file_data["multi_modal"].rows[0].texts[0] == "What is this image?"
assert file_data["multi_modal"].rows[0].images[0] == "mock_base64_image"
Expand Down
2 changes: 0 additions & 2 deletions genai-perf/tests/test_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ def test_data_retrieval_and_conversion(
generic_dataset = GenericDataset(
files_data={
"file1.jsonl": FileData(
filename="file1.jsonl",
rows=[DataRow(texts=["test input"], images=[])],
)
}
Expand Down Expand Up @@ -97,7 +96,6 @@ def test_write_json_to_file(
generic_dataset = GenericDataset(
files_data={
"file1.jsonl": FileData(
filename="file1.jsonl",
rows=[DataRow(texts=["test input one"], images=[])],
)
}
Expand Down
4 changes: 1 addition & 3 deletions genai-perf/tests/test_nvclip_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,7 @@ class TestNVClipConverter:

@staticmethod
def create_generic_dataset(rows) -> GenericDataset:
return GenericDataset(
files_data={"file1": FileData(filename="file1", rows=rows)}
)
return GenericDataset(files_data={"file1": FileData(rows)})

def test_convert_default(self):
generic_dataset = self.create_generic_dataset(
Expand Down
1 change: 0 additions & 1 deletion genai-perf/tests/test_openai_chat_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ def clean_image(row):
return GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[
DataRow(texts=clean_text(row), images=clean_image(row))
for row in rows
Expand Down
1 change: 0 additions & 1 deletion genai-perf/tests/test_openai_completions_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def create_generic_dataset() -> GenericDataset:
return GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[
DataRow(texts=["text input one"]),
DataRow(texts=["text input two"]),
Expand Down
2 changes: 0 additions & 2 deletions genai-perf/tests/test_rankings_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,11 @@ def create_generic_dataset(

if queries_data is not None:
files_data["queries"] = FileData(
filename="queries",
rows=[DataRow(texts=query) for query in queries_data],
)

if passages_data is not None:
files_data["passages"] = FileData(
filename="passages",
rows=[DataRow(texts=passage) for passage in passages_data],
)

Expand Down
1 change: 0 additions & 1 deletion genai-perf/tests/test_tensorrtllm_engine_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ def create_generic_dataset() -> GenericDataset:
return GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[
DataRow(texts=["text input one"]),
DataRow(texts=["text input two"]),
Expand Down
1 change: 0 additions & 1 deletion genai-perf/tests/test_triton_tensorrtllm_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def create_generic_dataset():
return GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[
DataRow(texts=["text input one"]),
DataRow(texts=["text input two"]),
Expand Down
1 change: 0 additions & 1 deletion genai-perf/tests/test_triton_vllm_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def create_generic_dataset():
return GenericDataset(
files_data={
"file1": FileData(
filename="file1",
rows=[
DataRow(texts=["text input one"]),
DataRow(texts=["text input two"]),
Expand Down
Loading