def is_API(self, api: str) -> bool:
    """Check if the file is the actual API entry and not a sub url.

    Args:
        api: API version prefix that must equal the first path segment of
            ``self.filename`` (for example ``"v1"`` or ``"v2"``).

    Returns:
        ``False`` when the first segment differs from ``api``. For ``"v1"``
        every matching filename is accepted. For any other version the
        filename must have exactly 11 segments (10 slashes) and end in a
        40 character segment, e.g.
        ``v2/os/rhel/provider/aws/version/8.6.0/region/eu-west-3/image/<hash>``.
    """
    # Number of "/"-separated segments of a full image path (10 slashes).
    image_path_segments = 11
    # NOTE: length of the hash value that forms the last segment.
    image_id_length = 40

    segments = self.filename.split("/")
    if segments[0] != api:
        return False

    # v1 entries are not validated any further.
    if api == "v1":
        return True

    return len(segments) == image_path_segments and len(segments[-1]) == image_id_length
def _v2_rhel_entry(provider: str, region: str, image_data: dict) -> "connection.DataEntry":
    """Wrap formatted RHEL image data in a DataEntry stored under its v2 path.

    Args:
        provider: Provider segment of the path (``"aws"``, ``"azure"``, ``"google"``).
        region: Region segment of the path.
        image_data: Formatted image details; must contain ``"name"`` and ``"version"``.

    Returns:
        A ``connection.DataEntry`` whose filename has the form
        ``v2/os/rhel/provider/<provider>/version/<version>/region/<region>/image/<image_id>``.
    """
    image_name = image_data["name"].replace(" ", "_").lower()
    version = image_data["version"]
    # NOTE: Due to consistency issues between the cloud providers and the fact
    # that they do not all have unique numbers to identify their images, we decided
    # to use a hash of the normalised image name as the image id instead.
    image_id = hashlib.sha1(image_name.encode()).hexdigest()  # noqa: S324

    path = f"v2/os/rhel/provider/{provider}/version/{version}/region/{region}/image/{image_id}"
    return connection.DataEntry(path, image_data)


class TransformerAWSV2RHEL(Transformer):
    """Transform raw rhel AWS data into the schema."""

    def run(self, data: list) -> list:
        """Transform the raw data.

        Only raw entries provided by "aws" are read. Within each file, only
        images whose ``OwnerId`` equals ``config.AWS_RHEL_OWNER_ID`` are kept.

        Returns:
            One DataEntry per kept image.
        """
        # NOTE: Verify that the data is raw.
        entries = [x for x in data if x.is_provided_by("aws") and x.is_raw()]

        results = []
        for e in entries:
            entry = copy.deepcopy(e)

            raw = self.src_conn.get_content(entry)
            # The region is the raw file's base name up to the first dot.
            region = os.path.basename(raw.filename).split(".")[0]

            for content in raw.content:
                if content["OwnerId"] != config.AWS_RHEL_OWNER_ID:
                    continue

                image_data = format_aws.image_rhel(content, region)
                # NOTE: example of expected paths
                # v2/os/rhel/provider/aws/version/8.6.0/region/eu-west-3/image/71d0a7aaa1f0dc06840e46f6ce316a7acfb022d4
                # v2/os/rhel/provider/aws/version/8.2.0/region/eu-north-1/image/14e4eab326cc5a2ef13cb5c0f36bc9bfa41025d9
                results.append(_v2_rhel_entry("aws", region, image_data))
        return results


class TransformerAzureV2RHEL(Transformer):
    """Transform raw rhel Azure data into the schema."""

    def run(self, data: list) -> list:
        """Transform the raw data.

        Only raw entries provided by "azure" are read. Within each file, only
        images whose ``publisher`` is ``"RedHat"`` are kept. All entries are
        stored under the fixed region ``"global"``.

        Returns:
            One DataEntry per kept image.
        """
        # NOTE: Verify that the data is raw and provided by azure.
        entries = [x for x in data if x.is_provided_by("azure") and x.is_raw()]

        results = []
        for e in entries:
            entry = copy.deepcopy(e)
            raw = self.src_conn.get_content(entry)

            for content in raw.content:
                if content["publisher"] != "RedHat":
                    continue

                # Set before formatting; presumably read by
                # format_azure.image_rhel -- confirm against the formatter.
                content["hyperVGeneration"] = "unknown"

                image_data = format_azure.image_rhel(content)
                # NOTE: example of expected paths
                # v2/os/rhel/provider/azure/version/8.6.0/region/global/image/71d0a7aaa1f0dc06840e46f6ce316a7acfb022d4
                # v2/os/rhel/provider/azure/version/8.2.0/region/global/image/14e4eab326cc5a2ef13cb5c0f36bc9bfa41025d9
                results.append(_v2_rhel_entry("azure", "global", image_data))
        return results


class TransformerGoogleV2RHEL(Transformer):
    """Transform raw rhel Google data into the schema."""

    def run(self, data: list) -> list:
        """Transform the raw data.

        Only raw entries provided by "google" are read. Within each file, only
        images whose ``name`` contains ``"rhel"`` are kept. All entries are
        stored under the fixed region ``"global"``.

        Returns:
            One DataEntry per kept image.
        """
        # NOTE: Verify that the data is raw and provided by google.
        entries = [x for x in data if x.is_provided_by("google") and x.is_raw()]

        results = []
        for e in entries:
            entry = copy.deepcopy(e)
            raw = self.src_conn.get_content(entry)

            for content in raw.content:
                # Copied for every item (also non-rhel ones), exactly as before.
                content["creation_timestamp"] = content["creationTimestamp"]
                if "rhel" not in content["name"]:
                    continue

                image_data = format_google.image_rhel(content)
                # NOTE: example of expected paths
                # v2/os/rhel/provider/google/version/8.6.0/region/global/image/71d0a7aaa1f0dc06840e46f6ce316a7acfb022d4
                # v2/os/rhel/provider/google/version/8.2.0/region/global/image/14e4eab326cc5a2ef13cb5c0f36bc9bfa41025d9
                results.append(_v2_rhel_entry("google", "global", image_data))
        return results
+ entries = [x for x in data if x.is_API("v2")] results = [] for e in entries: entry = copy.deepcopy(e) + filename = entry.filename.split("/") if len(filename) < 3: print("warn: could not determine region or provider of image: " + entry.filename) continue - entry.content["provider"] = filename[1] - entry.content["region"] = filename[2] + entry.content["provider"] = filename[4] + entry.content["region"] = filename[8] results.append(entry.content) results.sort(key=lambda x: x["name"], reverse=False) @@ -339,20 +454,18 @@ def display_name(self) -> dict: # TODO: Mypy says that 'data' below is not iterable. This needs to be fixed later. @no_type_check def run(self, data: type[Transformer]) -> list: - """Sort the raw data.""" - # NOTE: Verify that the data is not raw. - entries = [x for x in data if not x.is_raw() and not x.is_provided_by("idx")] + # NOTE: Verify that the data is from api v2. + entries = [x for x in data if x.is_API("v2")] results = [] os_list = {} for e in entries: entry = copy.deepcopy(e) + filename = entry.filename.split("/")[10] try: - filename = entry.filename.split("/")[3] - print(entry.filename) - os = filename.split("_")[0] + os = entry.filename.split("/")[2] if os not in os_list: os_list[os] = 1 @@ -361,27 +474,7 @@ def run(self, data: type[Transformer]) -> list: except IndexError: print(f"Could not format image, filename: {filename}") - rhel_products = { - "rh-ocp-worker", - "rh-oke-worker", - "rh-opp-worker", - "rh-rhel", - "rhel-arm64", - "rhel-byos", - "rhel-raw", - "rhel-sap-apps", - "rhel-sap-ha", - "rh", - } - - os_list_final: dict[Any, Any] = {} - for os, val in list(os_list.items()): - key = os - if os in rhel_products: - key = "rhel" - os_list_final[key] = os_list_final.get(key, 0) + val - - for os, val in os_list_final.items(): + for os, val in os_list.items(): desc = self.description.get(os, "no description") disp_name = self.display_name.get(os, "no display name") @@ -394,4 +487,5 @@ def run(self, data: type[Transformer]) -> list: 
results.append(entry_object) + # NOTE: Add /list suffix to prevent collision with "os" folder. return [connection.DataEntry("v2/os/list", results)] diff --git a/src/cloudimagedirectory/transformer.py b/src/cloudimagedirectory/transformer.py index 13016727..7465bc63 100644 --- a/src/cloudimagedirectory/transformer.py +++ b/src/cloudimagedirectory/transformer.py @@ -53,6 +53,7 @@ def run(origin_path: str, destination_path: str, arg_files: str, filter_until: s filters = [ filter.FilterImageByFilename("test"), filter.FilterImageByFilename("beta"), + filter.FilterImageByFilename("raw"), filter.FilterImageByUniqueName(), ] @@ -64,7 +65,7 @@ def run(origin_path: str, destination_path: str, arg_files: str, filter_until: s filter_after = pd.to_datetime(filter_until) filters.append(filter.FilterImageByLatestUpdate(filter_after)) - pipeline = transform.Pipeline( + pipeline_v1 = transform.Pipeline( origin_connection, [ transform.TransformerAWS, @@ -78,12 +79,29 @@ def run(origin_path: str, destination_path: str, arg_files: str, filter_until: s transform.TransformerIdxListImageLatestGoogle, transform.TransformerIdxListImageLatestAWS, transform.TransformerIdxListImageLatestAZURE, + ], + ) + print("run pipeline v1") + results = pipeline_v1.run(filenames) + + # NOTE: Introducing a second pipeline, to avoid filtering of v1/v2 data + # based on the image filename. + # We do not adapt the filter, since v1 will be removed soon. 
"""Tests for the v2 AWS RHEL transformer."""
import filecmp
import os

from cloudimagedirectory import transformer


def test_aws_v2_rhel_transformer_command(runner, tmp_path):
    """Verify that we can transform AWS data for RHEL.

    Runs the transformer command on one raw AWS region file and compares a
    generated v2 image file with the stored expected file.
    """
    image_dir = "v2/os/rhel/provider/aws/version/6.10/region/af-south-1/image"
    image_id = "4031b089d970c84bf7fad57831ba552e36517a3f"

    result = runner.invoke(
        transformer.run,
        [
            "-f",
            "tests/transformer/testdata/input/raw/aws/af-south-1.json",
            "-op=.",
            f"-dp={tmp_path}",
            "--filter.until=none",
        ],
    )

    assert result.exit_code == 0, f"expected no error, but got code {result.exit_code} and output:\n{result.output}"

    # Ensure the directory was made.
    assert os.path.isdir(f"{tmp_path}/{image_dir}")

    # Get current directory
    pwd = os.getcwd()

    # Check image data by comparing the expected file and the output file byte by byte.
    # shallow=False forces a content comparison; the default (shallow=True)
    # accepts files whose os.stat() signatures match without reading them.
    assert filecmp.cmp(
        f"{pwd}/tests/transformer/testdata/expected/{image_dir}/{image_id}",
        f"{tmp_path}/{image_dir}/{image_id}",
        shallow=False,
    )
"""Tests for the v2 Google RHEL transformer."""
import filecmp
import os

from cloudimagedirectory import transformer


# NOTE(review): the function name still says "aws" although this module
# exercises the Google transformer; it is kept unchanged here so existing
# test selectors keep working -- rename candidate.
def test_aws_v2_rhel_transformer_command(runner, tmp_path):
    """Verify that we can transform Google data for RHEL.

    Runs the transformer command on the raw Google file and compares a
    generated v2 image file with the stored expected file.
    """
    image_dir = "v2/os/rhel/provider/google/version/7/region/global/image"
    image_id = "a2f9b1c21e096445099c419aa0c0c9bc32657059"

    result = runner.invoke(
        transformer.run,
        [
            "-f",
            "tests/transformer/testdata/input/raw/google/all.json",
            "-op=.",
            f"-dp={tmp_path}",
            "--filter.until=none",
        ],
    )

    assert result.exit_code == 0, f"expected no error, but got code {result.exit_code} and output:\n{result.output}"

    # Ensure the directory was made.
    assert os.path.isdir(f"{tmp_path}/{image_dir}")

    # Get current directory
    pwd = os.getcwd()

    # Check image data by comparing the expected file and the output file byte by byte.
    # shallow=False forces a content comparison; the default (shallow=True)
    # accepts files whose os.stat() signatures match without reading them.
    assert filecmp.cmp(
        f"{pwd}/tests/transformer/testdata/expected/{image_dir}/{image_id}",
        f"{tmp_path}/{image_dir}/{image_id}",
        shallow=False,
    )
b/tests/transformer/testdata/expected/v2/os/list @@ -1 +1 @@ -[{"name": "rhel", "display_name": "Red Hat Enterprise Linux", "description": "Red Hat Enterprise Linux", "count": 2}, {"name": "osa", "display_name": "no display name", "description": "no description", "count": 1}] +[{"name": "rhel", "display_name": "Red Hat Enterprise Linux", "description": "Red Hat Enterprise Linux", "count": 3}] diff --git a/tests/transformer/testdata/expected/v2/os/rhel/provider/aws/version/6.10/region/af-south-1/image/4031b089d970c84bf7fad57831ba552e36517a3f b/tests/transformer/testdata/expected/v2/os/rhel/provider/aws/version/6.10/region/af-south-1/image/4031b089d970c84bf7fad57831ba552e36517a3f new file mode 100644 index 00000000..cdc2c7b0 --- /dev/null +++ b/tests/transformer/testdata/expected/v2/os/rhel/provider/aws/version/6.10/region/af-south-1/image/4031b089d970c84bf7fad57831ba552e36517a3f @@ -0,0 +1 @@ +{"name": "RHEL 6.10 hvm x86_64 Hourly2", "arch": "x86_64", "version": "6.10", "imageId": "ami-0c22ca1423e1721e7", "date": "2021-03-18T15:22:40.000Z", "virt": "hvm", "selflink": "https://console.aws.amazon.com/ec2/home?region=af-south-1#launchAmi=ami-0c22ca1423e1721e7", "region": "af-south-1"} diff --git a/tests/transformer/testdata/expected/v2/os/rhel/provider/azure/version/311.161/region/global/image/93212c01392a1e372edd399bde5838066089b22c b/tests/transformer/testdata/expected/v2/os/rhel/provider/azure/version/311.161/region/global/image/93212c01392a1e372edd399bde5838066089b22c new file mode 100644 index 00000000..27ab9921 --- /dev/null +++ b/tests/transformer/testdata/expected/v2/os/rhel/provider/azure/version/311.161/region/global/image/93212c01392a1e372edd399bde5838066089b22c @@ -0,0 +1 @@ +{"name": "osa osa_311 x64", "arch": "x64", "version": "311.161", "imageId": "RedHat:osa:osa_311:311.161.20200115", "date": "2020-01-15", "virt": "unknown"} diff --git 
a/tests/transformer/testdata/expected/v2/os/rhel/provider/google/version/7/region/global/image/a2f9b1c21e096445099c419aa0c0c9bc32657059 b/tests/transformer/testdata/expected/v2/os/rhel/provider/google/version/7/region/global/image/a2f9b1c21e096445099c419aa0c0c9bc32657059 new file mode 100644 index 00000000..094e4897 --- /dev/null +++ b/tests/transformer/testdata/expected/v2/os/rhel/provider/google/version/7/region/global/image/a2f9b1c21e096445099c419aa0c0c9bc32657059 @@ -0,0 +1 @@ +{"name": "RHEL 7 X86_64", "arch": "X86_64", "version": "7", "imageId": "https://www.googleapis.com/compute/v1/projects/rhel-cloud/global/images/rhel-7-v20230306", "date": "2023-03-06T12:57:17.210-08:00", "selflink": "https://console.cloud.google.com/compute/imagesDetail/projects/rhel-cloud/global/images/rhel-7-v20230306"}