From ca46dcafe7122c8950efffafef4fa2f2e34bd922 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 11 Dec 2024 12:45:05 -0500 Subject: [PATCH 01/62] First version backup --- .../clp_package_utils/scripts/start_clp.py | 25 +++++- .../clp-py-utils/clp_py_utils/clp_config.py | 53 +++++++++++-- .../clp-py-utils/clp_py_utils/result.py | 10 +++ .../executor/compress/fs_compression_task.py | 76 ++++++++++++++++--- .../job_orchestration/executor/s3_utils.py | 35 +++++++++ components/job-orchestration/pyproject.toml | 1 + 6 files changed, 181 insertions(+), 19 deletions(-) create mode 100644 components/clp-py-utils/clp_py_utils/result.py create mode 100644 components/job-orchestration/job_orchestration/executor/s3_utils.py diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 8097929f1..77f6a006a 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -624,6 +624,8 @@ def start_compression_worker( num_cpus: int, mounts: CLPDockerMounts, ): + print(clp_config) + print(container_clp_config) celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" generic_start_worker( @@ -713,6 +715,7 @@ def generic_start_worker( "-w", str(CONTAINER_CLP_HOME), "--name", container_name, "--log-driver", "local", + "-u", f"{os.getuid()}:{os.getgid()}", "-e", f"PYTHONPATH={clp_site_packages_dir}", "-e", ( f"BROKER_URL=amqp://" @@ -729,13 +732,26 @@ def generic_start_worker( "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", - "-u", f"{os.getuid()}:{os.getgid()}", ] if worker_specific_env: for env_name, env_value in worker_specific_env.items(): container_start_cmd.append("-e") container_start_cmd.append(f"{env_name}={env_value}") + s3_config = clp_config.archive_output.s3_config + if s3_config is not None: + container_start_cmd += [ + "-e", f"ENABLE_S3_ARCHIVE=1", + "-e", f"REGION_NAME={s3_config.region_name}", + "-e", f"BUCKET={s3_config.bucket}", + "-e", f"KEY_PREFIX={s3_config.key_prefix}" + ] + if s3_config.secret_access_key is not None and s3_config.secret_access_key is not None: + container_start_cmd += [ + "-e", f"ACCESS_KEY_ID={s3_config.access_key_id}", + "-e", f"SECRET_ACCESS_KEY={s3_config.secret_access_key}" + ] + # fmt: on necessary_mounts = [ mounts.clp_home, @@ -1126,6 +1142,13 @@ def main(argv): ): validate_and_load_redis_credentials_file(clp_config, clp_home, True) + # Might be a good place to verification for s3 config. 
+ # if target in ( + # ALL_TARGET_NAME, + # COMPRESSION_WORKER_COMPONENT_NAME + # ): + # validate_s3_config(clp_config.archive_output, clp_home) + clp_config.validate_data_dir() clp_config.validate_logs_dir() except: diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 79a94505d..07983d9f2 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -1,6 +1,6 @@ import pathlib -import typing from enum import auto +from typing import Optional from dotenv import dotenv_values from pydantic import BaseModel, PrivateAttr, validator @@ -69,12 +69,12 @@ class Database(BaseModel): host: str = "localhost" port: int = 3306 name: str = "clp-db" - ssl_cert: typing.Optional[str] = None + ssl_cert: Optional[str] = None auto_commit: bool = False compress: bool = True - username: typing.Optional[str] = None - password: typing.Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None @validator("type") def validate_database_type(cls, field): @@ -227,7 +227,7 @@ class Redis(BaseModel): query_backend_database: int = 0 compression_backend_database: int = 1 # redis can perform authentication without a username - password: typing.Optional[str] + password: Optional[str] @validator("host") def validate_host(cls, field): @@ -300,8 +300,44 @@ class Queue(BaseModel): host: str = "localhost" port: int = 5672 - username: typing.Optional[str] - password: typing.Optional[str] + username: Optional[str] + password: Optional[str] + + +class S3Config(BaseModel): + # TODO: need to think of a way to verify the account and + # keys. Maybe need to be outside of the config because every config + # can have different privilege + # Things to verify: + # 1. Can only be enabled if clp-s is used. + # 2. Does key_prefix need to end with '/'? maybe it doesn't but will cause some issue for list bucket. 
+ + # Required fields + region_name: str + bucket: str + key_prefix: str + + # Optional fields + access_key_id: Optional[str] = None + secret_access_key: Optional[str] = None + + @validator("region_name") + def validate_region_name(cls, field): + if field == "": + raise ValueError("region_name is not provided") + return field + + @validator("bucket") + def validate_bucket(cls, field): + if field == "": + raise ValueError("bucket is not provided") + return field + + @validator("key_prefix") + def validate_key_prefix(cls, field): + if field == "": + raise ValueError("key_prefix is not provided") + return field class ArchiveOutput(BaseModel): @@ -310,6 +346,7 @@ class ArchiveOutput(BaseModel): target_dictionaries_size: int = 32 * 1024 * 1024 # 32 MB target_encoded_file_size: int = 256 * 1024 * 1024 # 256 MB target_segment_size: int = 256 * 1024 * 1024 # 256 MB + s3_config: Optional[S3Config] = None @validator("target_archive_size") def validate_target_archive_size(cls, field): @@ -408,7 +445,7 @@ def validate_port(cls, field): class CLPConfig(BaseModel): - execution_container: typing.Optional[str] + execution_container: Optional[str] input_logs_directory: pathlib.Path = pathlib.Path("/") diff --git a/components/clp-py-utils/clp_py_utils/result.py b/components/clp-py-utils/clp_py_utils/result.py new file mode 100644 index 000000000..9c72cebf9 --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/result.py @@ -0,0 +1,10 @@ +from typing import Optional + + +class Result: + def __init__(self, success: bool, error: Optional[str] = None): + self.success = success + self.error = error + + def __repr__(self): + return f"Result(success={self.success}, error={self.error})" diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index ce88ad185..772a358d7 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -4,6 +4,7 @@ import pathlib import subprocess from contextlib import closing +from typing import Any, Dict, Optional import yaml from celery.app.task import Task @@ -12,11 +13,14 @@ COMPRESSION_JOBS_TABLE_NAME, COMPRESSION_TASKS_TABLE_NAME, Database, + S3Config, StorageEngine, ) from clp_py_utils.clp_logging import set_logging_level +from clp_py_utils.result import Result from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.celery import app +from job_orchestration.executor.s3_utils import s3_put from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress from job_orchestration.scheduler.scheduler_data import CompressionTaskResult @@ -71,6 +75,40 @@ def update_job_metadata_and_tags(db_cursor, job_id, table_prefix, tag_ids, archi ) +def upload_single_file_archive_to_s3( + archive_stats: Dict[str, Any], + archive_dir: pathlib.Path, + s3_config: S3Config, +) -> Result: + logger.info("Starting to upload to S3...") + archive_id = archive_stats["id"] + src_file = archive_dir / archive_id + result = s3_put(s3_config, src_file, archive_id) + if not result.success: + logger.error(f"Failed to upload to S3: {result.error}") + return result + + logger.info("Finished uploading to S3...") + src_file.unlink() + return Result(success=True) + + +def get_s3_config() -> Optional[S3Config]: + enable_s3_write = 
os.getenv("ENABLE_S3_ARCHIVE") + if enable_s3_write is None: + return None + + # TODO: this method is very errorprone since it doesn't check individual members + # Let's leave this for now before we properly implement the config file. + s3_config = S3Config( + region_name=os.getenv("REGION_NAME"), + bucket=os.getenv("BUCKET"), + key_prefix=os.getenv("KEY_PREFIX"), + access_key_id=os.getenv("ACCESS_KEY_ID"), + secret_access_key=os.getenv("SECRET_ACCESS_KEY") + ) + return s3_config + def make_clp_command( clp_home: pathlib.Path, archive_output_dir: pathlib.Path, @@ -113,6 +151,7 @@ def make_clp_s_command( compression_cmd = [ str(clp_home / "bin" / "clp-s"), "c", str(archive_output_dir), + "--single-file-archive", "--print-archive-stats", "--target-encoded-size", str(clp_config.output.target_segment_size + clp_config.output.target_dictionaries_size), @@ -212,18 +251,31 @@ def run_clp( # Compute the total amount of data compressed last_archive_stats = None - total_uncompressed_size = 0 - total_compressed_size = 0 + worker_output = { + "total_uncompressed_size": 0, + "total_compressed_size": 0, + } + + s3_config = get_s3_config() + while True: line = proc.stdout.readline() if not line: break stats = json.loads(line.decode("ascii")) if last_archive_stats is not None and stats["id"] != last_archive_stats["id"]: + if s3_config is not None: + result = upload_single_file_archive_to_s3( + last_archive_stats, archive_output_dir, s3_config + ) + if not result.success: + # TODO: think about how to handle S3 only error + continue + # We've started a new archive so add the previous archive's last # reported size to the total - total_uncompressed_size += last_archive_stats["uncompressed_size"] - total_compressed_size += last_archive_stats["size"] + worker_output["total_uncompressed_size"] += last_archive_stats["uncompressed_size"] + worker_output["total_compressed_size"] += last_archive_stats["size"] with closing(sql_adapter.create_connection(True)) as db_conn, closing( db_conn.cursor(dictionary=True) ) as db_cursor: @@ -239,9 +291,17 @@ def run_clp( last_archive_stats = stats if last_archive_stats is not None: + if s3_config is not None: + result = upload_single_file_archive_to_s3( + last_archive_stats, archive_output_dir, s3_config + ) + if not result.success: + logger.error(f"THINK ABOUT WHAT TO DO HERE") + # TODO: think about how to handle S3 only error + # Add the last archive's last reported size - total_uncompressed_size += last_archive_stats["uncompressed_size"] - total_compressed_size += last_archive_stats["size"] + worker_output["total_uncompressed_size"] += last_archive_stats["uncompressed_size"] + worker_output["total_compressed_size"] += last_archive_stats["size"] with closing(sql_adapter.create_connection(True)) as db_conn, closing( db_conn.cursor(dictionary=True) ) as db_cursor: @@ -270,10 +330,6 @@ def run_clp( # Close stderr log file stderr_log_file.close() - worker_output = { - "total_uncompressed_size": total_uncompressed_size, - "total_compressed_size": total_compressed_size, - } if compression_successful: return CompressionTaskStatus.SUCCEEDED, worker_output else: diff --git a/components/job-orchestration/job_orchestration/executor/s3_utils.py b/components/job-orchestration/job_orchestration/executor/s3_utils.py new file mode 100644 index 000000000..6909150d5 --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/s3_utils.py @@ -0,0 +1,35 @@ +from pathlib import Path + +import boto3 +from botocore.exceptions import ClientError +from clp_py_utils.clp_config import 
S3Config +from clp_py_utils.result import Result + +def s3_put(s3_config: S3Config, src_file: Path, dest_file_name: str) -> Result: + + if not src_file.exists(): + return Result(success=False, error=f"{src_file} doesn't exist") + if not src_file.is_file(): + return Result(success=False, error=f"{src_file} is not a file") + + s3_client_args = { + "region_name": s3_config.region_name, + "aws_access_key_id": s3_config.access_key_id, + "aws_secret_access_key": s3_config.secret_access_key + } + + my_s3_client = boto3.client("s3", **s3_client_args) + + with open(src_file, "rb") as file_data: + try: + my_s3_client.put_object( + Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name + ) + except ClientError as e: + error_code = e.response["Error"]["Code"] + error_message = e.response["Error"]["Message"] + return Result(success=False, error=f"ClientError: {error_code} - {error_message}") + except Exception as e: + return Result(success=False, error=f"An unexpected error occurred: {e}") + + return Result(success=True) diff --git a/components/job-orchestration/pyproject.toml b/components/job-orchestration/pyproject.toml index c89d84dec..2db22cab0 100644 --- a/components/job-orchestration/pyproject.toml +++ b/components/job-orchestration/pyproject.toml @@ -10,6 +10,7 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.8 || ^3.10" +boto3 = "^1.35.76" Brotli = "^1.1.0" celery = "^5.3.6" # mariadb version must be compatible with libmariadev installed in runtime env. From b763e8b63c017083adb7097a8465eb87fe8a8341 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:02:10 -0500 Subject: [PATCH 02/62] Small refactor --- .../clp_package_utils/scripts/start_clp.py | 9 ++- .../executor/compress/fs_compression_task.py | 62 ++++++++----------- .../job_orchestration/executor/s3_utils.py | 4 +- 3 files changed, 37 insertions(+), 38 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 77f6a006a..0acd3a4c3 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -728,7 +728,6 @@ def generic_start_worker( ), "-e", f"CLP_HOME={CONTAINER_CLP_HOME}", "-e", f"CLP_DATA_DIR={container_clp_config.data_directory}", - "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.directory}", "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", @@ -751,18 +750,24 @@ def generic_start_worker( "-e", f"ACCESS_KEY_ID={s3_config.access_key_id}", "-e", f"SECRET_ACCESS_KEY={s3_config.secret_access_key}" ] + else: + container_start_cmd += [ + "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.directory}", + ] # fmt: on necessary_mounts = [ mounts.clp_home, mounts.data_dir, mounts.logs_dir, - mounts.archives_output_dir, mounts.input_logs_dir, ] if worker_specific_mount: necessary_mounts.extend(worker_specific_mount) + if s3_config is None: + necessary_mounts.append(mounts.archives_output_dir) + for mount in necessary_mounts: if not mount: raise ValueError(f"Required mount configuration is empty: {necessary_mounts}") diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py 
b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 772a358d7..7eb8e258d 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -105,10 +105,11 @@ def get_s3_config() -> Optional[S3Config]: bucket=os.getenv("BUCKET"), key_prefix=os.getenv("KEY_PREFIX"), access_key_id=os.getenv("ACCESS_KEY_ID"), - secret_access_key=os.getenv("SECRET_ACCESS_KEY") + secret_access_key=os.getenv("SECRET_ACCESS_KEY"), ) return s3_config + def make_clp_command( clp_home: pathlib.Path, archive_output_dir: pathlib.Path, @@ -205,6 +206,10 @@ def run_clp( yaml.safe_dump(clp_metadata_db_connection_config, db_config_file) db_config_file.close() + # Get s3 config + s3_config = get_s3_config() + s3_upload_failed = False + if StorageEngine.CLP == clp_storage_engine: compression_cmd = make_clp_command( clp_home=clp_home, @@ -251,26 +256,32 @@ def run_clp( # Compute the total amount of data compressed last_archive_stats = None + last_line_decoded = False worker_output = { "total_uncompressed_size": 0, "total_compressed_size": 0, } - s3_config = get_s3_config() - - while True: + while not last_line_decoded: + stats = None line = proc.stdout.readline() - if not line: - break - stats = json.loads(line.decode("ascii")) - if last_archive_stats is not None and stats["id"] != last_archive_stats["id"]: + if line: + stats = json.loads(line.decode("ascii")) + else: + last_line_decoded = True + + if last_archive_stats is not None and ( + last_line_decoded or stats["id"] != last_archive_stats["id"] + ): if s3_config is not None: result = upload_single_file_archive_to_s3( last_archive_stats, archive_output_dir, s3_config ) if not result.success: - # TODO: think about how to handle S3 only error - continue + worker_output["error_message"] = result.error + s3_upload_failed = True + # Upon failure, skip updating the archive tags and job metadata. 
+ break # We've started a new archive so add the previous archive's last # reported size to the total @@ -290,30 +301,6 @@ def run_clp( last_archive_stats = stats - if last_archive_stats is not None: - if s3_config is not None: - result = upload_single_file_archive_to_s3( - last_archive_stats, archive_output_dir, s3_config - ) - if not result.success: - logger.error(f"THINK ABOUT WHAT TO DO HERE") - # TODO: think about how to handle S3 only error - - # Add the last archive's last reported size - worker_output["total_uncompressed_size"] += last_archive_stats["uncompressed_size"] - worker_output["total_compressed_size"] += last_archive_stats["size"] - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - update_job_metadata_and_tags( - db_cursor, - job_id, - clp_metadata_db_connection_config["table_prefix"], - tag_ids, - last_archive_stats, - ) - db_conn.commit() - # Wait for compression to finish return_code = proc.wait() if 0 != return_code: @@ -330,6 +317,8 @@ def run_clp( # Close stderr log file stderr_log_file.close() + if s3_upload_failed: + return CompressionTaskStatus.FAILED, worker_output if compression_successful: return CompressionTaskStatus.SUCCEEDED, worker_output else: @@ -349,9 +338,12 @@ def compress( ): clp_home_str = os.getenv("CLP_HOME") data_dir_str = os.getenv("CLP_DATA_DIR") - archive_output_dir_str = os.getenv("CLP_ARCHIVE_OUTPUT_DIR") logs_dir_str = os.getenv("CLP_LOGS_DIR") + archive_output_dir_str = os.getenv("CLP_ARCHIVE_OUTPUT_DIR") + if archive_output_dir_str is None: + archive_output_dir_str = "/tmp/archives" + # Set logging level clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) set_logging_level(logger, clp_logging_level) diff --git a/components/job-orchestration/job_orchestration/executor/s3_utils.py b/components/job-orchestration/job_orchestration/executor/s3_utils.py index 6909150d5..3a3be0ef0 100644 --- a/components/job-orchestration/job_orchestration/executor/s3_utils.py +++ b/components/job-orchestration/job_orchestration/executor/s3_utils.py @@ -5,6 +5,7 @@ from clp_py_utils.clp_config import S3Config from clp_py_utils.result import Result + def s3_put(s3_config: S3Config, src_file: Path, dest_file_name: str) -> Result: if not src_file.exists(): @@ -15,12 +16,13 @@ def s3_put(s3_config: S3Config, src_file: Path, dest_file_name: str) -> Result: s3_client_args = { "region_name": s3_config.region_name, "aws_access_key_id": s3_config.access_key_id, - "aws_secret_access_key": s3_config.secret_access_key + "aws_secret_access_key": s3_config.secret_access_key, } my_s3_client = boto3.client("s3", **s3_client_args) with open(src_file, "rb") as file_data: + # TODO: Consider retry? 
try: my_s3_client.put_object( Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name From 4e9529c4e9d2497e3d23bb90edad9e4afceb4acd Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:51:38 -0500 Subject: [PATCH 03/62] First trial for new config --- .../clp_package_utils/general.py | 12 +-- .../clp_package_utils/scripts/start_clp.py | 17 ++-- .../clp-py-utils/clp_py_utils/clp_config.py | 81 ++++++++++++++++--- .../package-template/src/etc/clp-config.yml | 4 +- 4 files changed, 83 insertions(+), 31 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 5fae8166f..64b684fe1 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -239,17 +239,19 @@ def generate_container_config( DockerMountType.BIND, clp_config.logs_directory, container_clp_config.logs_directory ) - container_clp_config.archive_output.directory = pathlib.Path("/") / "mnt" / "archive-output" + container_clp_config.archive_output.set_archive_directory( + pathlib.Path("/") / "mnt" / "archive-output" + ) if not is_path_already_mounted( clp_home, CONTAINER_CLP_HOME, - clp_config.archive_output.directory, - container_clp_config.archive_output.directory, + clp_config.archive_output.archive_directory(), + container_clp_config.archive_output.archive_directory(), ): docker_mounts.archives_output_dir = DockerMount( DockerMountType.BIND, - clp_config.archive_output.directory, - container_clp_config.archive_output.directory, + clp_config.archive_output.archive_directory(), + container_clp_config.archive_output.archive_directory(), ) container_clp_config.stream_output.directory = pathlib.Path("/") / "mnt" / "stream-output" diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 0acd3a4c3..6e519e63d 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -624,8 +624,6 @@ def start_compression_worker( num_cpus: int, mounts: CLPDockerMounts, ): - print(clp_config) - print(container_clp_config) celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" generic_start_worker( @@ -703,7 +701,7 @@ def generic_start_worker( container_logs_dir = container_clp_config.logs_directory / component_name # Create necessary directories - clp_config.archive_output.directory.mkdir(parents=True, exist_ok=True) + clp_config.archive_output.archive_directory().mkdir(parents=True, exist_ok=True) clp_config.stream_output.directory.mkdir(parents=True, exist_ok=True) clp_site_packages_dir = CONTAINER_CLP_HOME / "lib" / "python3" / "site-packages" @@ -731,14 +729,15 @@ def generic_start_worker( "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", + "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.archive_directory()}", ] if worker_specific_env: for env_name, env_value in worker_specific_env.items(): container_start_cmd.append("-e") container_start_cmd.append(f"{env_name}={env_value}") - s3_config = clp_config.archive_output.s3_config - if s3_config is not None: + if "s3" == clp_config.archive_output.storage.type: + s3_config = 
clp_config.archive_output.storage.s3_config container_start_cmd += [ "-e", f"ENABLE_S3_ARCHIVE=1", "-e", f"REGION_NAME={s3_config.region_name}", @@ -750,10 +749,6 @@ def generic_start_worker( "-e", f"ACCESS_KEY_ID={s3_config.access_key_id}", "-e", f"SECRET_ACCESS_KEY={s3_config.secret_access_key}" ] - else: - container_start_cmd += [ - "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.directory}", - ] # fmt: on necessary_mounts = [ @@ -761,13 +756,11 @@ def generic_start_worker( mounts.data_dir, mounts.logs_dir, mounts.input_logs_dir, + mounts.archives_output_dir ] if worker_specific_mount: necessary_mounts.extend(worker_specific_mount) - if s3_config is None: - necessary_mounts.append(mounts.archives_output_dir) - for mount in necessary_mounts: if not mount: raise ValueError(f"Required mount configuration is empty: {necessary_mounts}") diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 07983d9f2..56e3caf99 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -1,6 +1,6 @@ import pathlib from enum import auto -from typing import Optional +from typing import Literal, Optional, Union from dotenv import dotenv_values from pydantic import BaseModel, PrivateAttr, validator @@ -340,13 +340,64 @@ def validate_key_prefix(cls, field): return field -class ArchiveOutput(BaseModel): +class FSStorage(BaseModel): + type: Literal["fs"] directory: pathlib.Path = pathlib.Path("var") / "data" / "archives" + + @validator("directory") + def validate_directory(cls, field): + if "" == field: + raise ValueError("directory can not be empty") + return field + + def make_config_path_absolute(self, clp_home: pathlib.Path): + self.directory = make_config_path_absolute(clp_home, self.directory) + + def dump_to_primitive_dict(self): + d = self.dict() + d["directory"] = str(d["directory"]) + return d + + +class S3Storage(BaseModel): + type: Literal["s3"] + staging_directory: pathlib.Path = pathlib.Path("var") / "data" / "staged_archives" + s3_config: S3Config + + @validator("staging_directory") + def validate_staging_directory(cls, field): + if "" == field: + raise ValueError("staging_directory can not be empty") + return field + + @validator("s3_config") + def validate_s3_config(cls, field): + if None == field: + raise ValueError("s3_config must be provided") + return field + + def make_config_path_absolute(self, clp_home: pathlib.Path): + self.staging_directory = make_config_path_absolute(clp_home, self.staging_directory) + + def dump_to_primitive_dict(self): + d = self.dict() + d["staging_directory"] = str(d["staging_directory"]) + return d + + +class ArchiveOutput(BaseModel): + # TODO: For whatever weird reason, must first assign it to Null + storage: Union[FSStorage, S3Storage, None] target_archive_size: int = 256 * 1024 * 1024 # 256 MB target_dictionaries_size: int = 32 * 1024 * 1024 # 32 MB target_encoded_file_size: int = 256 * 1024 * 1024 # 256 MB target_segment_size: int = 256 * 1024 * 1024 # 256 MB - s3_config: Optional[S3Config] = None + + @validator("storage") + def validate_storage(cls, field) -> bool: + if None == field: + raise ValueError("storage must be provided") + return field @validator("target_archive_size") def validate_target_archive_size(cls, field): @@ -372,14 +423,18 @@ def validate_target_segment_size(cls, field): raise ValueError("target_segment_size must be greater than 0") return field - def make_config_paths_absolute(self, clp_home: 
pathlib.Path): - self.directory = make_config_path_absolute(clp_home, self.directory) + def set_archive_directory(self, directory: pathlib.Path) -> None: + storage_config = self.storage + if "fs" == storage_config.type: + storage_config.directory = directory + else: + storage_config.staging_directory = directory - def dump_to_primitive_dict(self): - d = self.dict() - # Turn directory (pathlib.Path) into a primitive string - d["directory"] = str(d["directory"]) - return d + def archive_directory(self) -> pathlib.Path: + storage_config = self.storage + if "fs" == storage_config.type: + return storage_config.directory + return storage_config.staging_directory class StreamOutput(BaseModel): @@ -473,7 +528,7 @@ class CLPConfig(BaseModel): def make_config_paths_absolute(self, clp_home: pathlib.Path): self.input_logs_directory = make_config_path_absolute(clp_home, self.input_logs_directory) self.credentials_file_path = make_config_path_absolute(clp_home, self.credentials_file_path) - self.archive_output.make_config_paths_absolute(clp_home) + self.archive_output.storage.make_config_path_absolute(clp_home) self.stream_output.make_config_paths_absolute(clp_home) self.data_directory = make_config_path_absolute(clp_home, self.data_directory) self.logs_directory = make_config_path_absolute(clp_home, self.logs_directory) @@ -490,7 +545,7 @@ def validate_input_logs_dir(self): def validate_archive_output_dir(self): try: - validate_path_could_be_dir(self.archive_output.directory) + validate_path_could_be_dir(self.archive_output.archive_directory()) except ValueError as ex: raise ValueError(f"archive_output.directory is invalid: {ex}") @@ -566,7 +621,7 @@ def load_redis_credentials_from_file(self): def dump_to_primitive_dict(self): d = self.dict() - d["archive_output"] = self.archive_output.dump_to_primitive_dict() + d["archive_output"]["storage"] = self.archive_output.storage.dump_to_primitive_dict() d["stream_output"] = self.stream_output.dump_to_primitive_dict() # Turn paths into primitive strings d["input_logs_directory"] = str(self.input_logs_directory) diff --git a/components/package-template/src/etc/clp-config.yml b/components/package-template/src/etc/clp-config.yml index f19b93463..22b03b889 100644 --- a/components/package-template/src/etc/clp-config.yml +++ b/components/package-template/src/etc/clp-config.yml @@ -66,7 +66,9 @@ # ## Where archives should be output to #archive_output: -# directory: "var/data/archives" +# storage: +# type: "fs" +# directory: "var/data/archives" # # # How much data CLP should try to compress into each archive # target_archive_size: 268435456 # 256 MB From e9cdea4055106d486af82b7c7754e43d1b7014c9 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 11 Dec 2024 17:09:20 -0500 Subject: [PATCH 04/62] Further refactor and polishing --- .../clp_package_utils/general.py | 21 +++++++----- .../clp_package_utils/scripts/start_clp.py | 14 +++----- .../clp-py-utils/clp_py_utils/clp_config.py | 32 +++++++++++++------ .../clp-py-utils/clp_py_utils/s3_utils.py | 13 ++++++++ .../executor/compress/fs_compression_task.py | 7 ++-- 5 files changed, 54 insertions(+), 33 deletions(-) create mode 100644 components/clp-py-utils/clp_py_utils/s3_utils.py diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 64b684fe1..fbeb5de64 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -20,6 
+20,7 @@ REDIS_COMPONENT_NAME, REDUCER_COMPONENT_NAME, RESULTS_CACHE_COMPONENT_NAME, + StorageType, WEBUI_COMPONENT_NAME, ) from clp_py_utils.core import ( @@ -28,6 +29,7 @@ read_yaml_config_file, validate_path_could_be_dir, ) +from clp_py_utils.s3_utils import verify_s3_config_for_archive_output from strenum import KebabCaseStrEnum # CONSTANTS @@ -239,19 +241,17 @@ def generate_container_config( DockerMountType.BIND, clp_config.logs_directory, container_clp_config.logs_directory ) - container_clp_config.archive_output.set_archive_directory( - pathlib.Path("/") / "mnt" / "archive-output" - ) + container_clp_config.archive_output.set_directory(pathlib.Path("/") / "mnt" / "archive-output") if not is_path_already_mounted( clp_home, CONTAINER_CLP_HOME, - clp_config.archive_output.archive_directory(), - container_clp_config.archive_output.archive_directory(), + clp_config.archive_output.get_directory(), + container_clp_config.archive_output.get_directory(), ): docker_mounts.archives_output_dir = DockerMount( DockerMountType.BIND, - clp_config.archive_output.archive_directory(), - container_clp_config.archive_output.archive_directory(), + clp_config.archive_output.get_directory(), + container_clp_config.archive_output.get_directory(), ) container_clp_config.stream_output.directory = pathlib.Path("/") / "mnt" / "stream-output" @@ -484,8 +484,13 @@ def validate_results_cache_config( def validate_worker_config(clp_config: CLPConfig): clp_config.validate_input_logs_dir() - clp_config.validate_archive_output_dir() + clp_config.validate_archive_output_config() clp_config.validate_stream_output_dir() + storage_config = clp_config.archive_output.storage + if StorageType.S3 == storage_config.type: + result = verify_s3_config_for_archive_output(storage_config.s3_config) + if not result.success: + raise ValueError(f"S3 config verification failed: {result.error}") def validate_webui_config( diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 6e519e63d..da609a23a 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -701,7 +701,7 @@ def generic_start_worker( container_logs_dir = container_clp_config.logs_directory / component_name # Create necessary directories - clp_config.archive_output.archive_directory().mkdir(parents=True, exist_ok=True) + clp_config.archive_output.get_directory().mkdir(parents=True, exist_ok=True) clp_config.stream_output.directory.mkdir(parents=True, exist_ok=True) clp_site_packages_dir = CONTAINER_CLP_HOME / "lib" / "python3" / "site-packages" @@ -729,7 +729,8 @@ def generic_start_worker( "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", - "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.archive_directory()}", + # need a way to remove this maybe? 
+ "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.get_directory()}", ] if worker_specific_env: for env_name, env_value in worker_specific_env.items(): @@ -755,8 +756,8 @@ def generic_start_worker( mounts.clp_home, mounts.data_dir, mounts.logs_dir, + mounts.archives_output_dir, mounts.input_logs_dir, - mounts.archives_output_dir ] if worker_specific_mount: necessary_mounts.extend(worker_specific_mount) @@ -1140,13 +1141,6 @@ def main(argv): ): validate_and_load_redis_credentials_file(clp_config, clp_home, True) - # Might be a good place to verification for s3 config. - # if target in ( - # ALL_TARGET_NAME, - # COMPRESSION_WORKER_COMPONENT_NAME - # ): - # validate_s3_config(clp_config.archive_output, clp_home) - clp_config.validate_data_dir() clp_config.validate_logs_dir() except: diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 56e3caf99..00d0b6dbb 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -4,7 +4,7 @@ from dotenv import dotenv_values from pydantic import BaseModel, PrivateAttr, validator -from strenum import KebabCaseStrEnum +from strenum import KebabCaseStrEnum, LowercaseStrEnum from .clp_logging import get_valid_logging_level, is_valid_logging_level from .core import ( @@ -48,6 +48,11 @@ class StorageEngine(KebabCaseStrEnum): CLP_S = auto() +class StorageType(LowercaseStrEnum): + FS = auto() + S3 = auto() + + VALID_STORAGE_ENGINES = [storage_engine.value for storage_engine in StorageEngine] @@ -341,7 +346,7 @@ def validate_key_prefix(cls, field): class FSStorage(BaseModel): - type: Literal["fs"] + type: Literal[StorageType.FS.value] directory: pathlib.Path = pathlib.Path("var") / "data" / "archives" @validator("directory") @@ -360,7 +365,7 @@ def dump_to_primitive_dict(self): class S3Storage(BaseModel): - type: Literal["s3"] + type: Literal[StorageType.S3.value] staging_directory: pathlib.Path = pathlib.Path("var") / "data" / "staged_archives" s3_config: S3Config @@ -423,16 +428,16 @@ def validate_target_segment_size(cls, field): raise ValueError("target_segment_size must be greater than 0") return field - def set_archive_directory(self, directory: pathlib.Path) -> None: + def set_directory(self, directory: pathlib.Path) -> None: storage_config = self.storage - if "fs" == storage_config.type: + if StorageType.FS == storage_config.type: storage_config.directory = directory else: storage_config.staging_directory = directory - def archive_directory(self) -> pathlib.Path: + def get_directory(self) -> pathlib.Path: storage_config = self.storage - if "fs" == storage_config.type: + if StorageType.FS == storage_config.type: return storage_config.directory return storage_config.staging_directory @@ -543,11 +548,18 @@ def validate_input_logs_dir(self): if not input_logs_dir.is_dir(): raise ValueError(f"input_logs_directory '{input_logs_dir}' is not a directory.") - def validate_archive_output_dir(self): + def validate_archive_output_config(self): + if ( + StorageType.S3 == self.archive_output.storage.type + and StorageEngine.CLP_S != self.package.storage_engine + ): + raise ValueError( + f"S3 storage is only supported with storage engine: {StorageEngine.CLP_S}" + ) try: - validate_path_could_be_dir(self.archive_output.archive_directory()) + validate_path_could_be_dir(self.archive_output.get_directory()) except ValueError as ex: - raise ValueError(f"archive_output.directory is invalid: {ex}") + raise ValueError(f"directory of 
storage config is invalid: {ex}") def validate_stream_output_dir(self): try: diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py new file mode 100644 index 000000000..905e30fac --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -0,0 +1,13 @@ +import boto3 +from botocore.exceptions import ClientError + +from clp_py_utils.clp_config import S3Config +from clp_py_utils.result import Result + + +def verify_s3_config_for_archive_output(s3_config: S3Config) -> Result: + # TODO: need to verify: + # 1. Have write priveldge so archive can be compressed + # 2. Have read priviledge so archive can be readed + # 3. bucket and region are the same, this should run into issue but not sure + return Result(success=True) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 7eb8e258d..87a079876 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -263,8 +263,8 @@ def run_clp( } while not last_line_decoded: - stats = None line = proc.stdout.readline() + stats: Dict[str, Any] = None if line: stats = json.loads(line.decode("ascii")) else: @@ -338,11 +338,8 @@ def compress( ): clp_home_str = os.getenv("CLP_HOME") data_dir_str = os.getenv("CLP_DATA_DIR") - logs_dir_str = os.getenv("CLP_LOGS_DIR") - archive_output_dir_str = os.getenv("CLP_ARCHIVE_OUTPUT_DIR") - if archive_output_dir_str is None: - archive_output_dir_str = "/tmp/archives" + logs_dir_str = os.getenv("CLP_LOGS_DIR") # Set logging level clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) From 9ba0a38f6a7134728ccd757274138e8bb338b223 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 11 Dec 2024 22:40:47 -0500 Subject: [PATCH 05/62] Another small refactor --- .../clp_package_utils/general.py | 1 + .../clp_package_utils/scripts/start_clp.py | 12 +++++++++--- .../clp-py-utils/clp_py_utils/clp_config.py | 15 +++++---------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index fbeb5de64..05a67d497 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -486,6 +486,7 @@ def validate_worker_config(clp_config: CLPConfig): clp_config.validate_input_logs_dir() clp_config.validate_archive_output_config() clp_config.validate_stream_output_dir() + storage_config = clp_config.archive_output.storage if StorageType.S3 == storage_config.type: result = verify_s3_config_for_archive_output(storage_config.s3_config) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index da609a23a..8351a8836 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -694,8 +694,6 @@ def generic_start_worker( if container_exists(container_name): return - validate_worker_config(clp_config) - logs_dir = clp_config.logs_directory / component_name logs_dir.mkdir(parents=True, exist_ok=True) container_logs_dir = container_clp_config.logs_directory / component_name @@ -729,7 
+727,7 @@ def generic_start_worker( "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", - # need a way to remove this maybe? + # need a way to remove this maybe "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.get_directory()}", ] if worker_specific_env: @@ -756,6 +754,8 @@ def generic_start_worker( mounts.clp_home, mounts.data_dir, mounts.logs_dir, + # need a way to remove this maybe, since reader doesn't need it if it is staged. + # one option is to move it to the worker_specific_mount mounts.archives_output_dir, mounts.input_logs_dir, ] @@ -1140,6 +1140,12 @@ def main(argv): QUERY_WORKER_COMPONENT_NAME, ): validate_and_load_redis_credentials_file(clp_config, clp_home, True) + if target in ( + ALL_TARGET_NAME, + COMPRESSION_WORKER_COMPONENT_NAME, + QUERY_WORKER_COMPONENT_NAME, + ): + validate_worker_config(clp_config) clp_config.validate_data_dir() clp_config.validate_logs_dir() diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 00d0b6dbb..e2398b0be 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -310,12 +310,8 @@ class Queue(BaseModel): class S3Config(BaseModel): - # TODO: need to think of a way to verify the account and - # keys. Maybe need to be outside of the config because every config - # can have different privilege - # Things to verify: - # 1. Can only be enabled if clp-s is used. - # 2. Does key_prefix need to end with '/'? maybe it doesn't but will cause some issue for list bucket. + # Todo: + # Does key_prefix need to end with '/'? maybe it doesn't. # Required fields region_name: str @@ -346,7 +342,7 @@ def validate_key_prefix(cls, field): class FSStorage(BaseModel): - type: Literal[StorageType.FS.value] + type: Literal[StorageType.FS.value] = StorageType.FS.value directory: pathlib.Path = pathlib.Path("var") / "data" / "archives" @validator("directory") @@ -365,7 +361,7 @@ def dump_to_primitive_dict(self): class S3Storage(BaseModel): - type: Literal[StorageType.S3.value] + type: Literal[StorageType.S3.value] = StorageType.S3.value staging_directory: pathlib.Path = pathlib.Path("var") / "data" / "staged_archives" s3_config: S3Config @@ -391,8 +387,7 @@ def dump_to_primitive_dict(self): class ArchiveOutput(BaseModel): - # TODO: For whatever weird reason, must first assign it to Null - storage: Union[FSStorage, S3Storage, None] + storage: Union[FSStorage, S3Storage] = FSStorage() target_archive_size: int = 256 * 1024 * 1024 # 256 MB target_dictionaries_size: int = 32 * 1024 * 1024 # 32 MB target_encoded_file_size: int = 256 * 1024 * 1024 # 256 MB From 58befefe2390ba4655274c851108e36ba4a284de Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 12:52:23 -0500 Subject: [PATCH 06/62] small refactor again --- .../executor/compress/fs_compression_task.py | 4 ++-- .../job_orchestration/executor/s3_utils.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 87a079876..756c3f9b3 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ 
b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -264,14 +264,14 @@ def run_clp( while not last_line_decoded: line = proc.stdout.readline() - stats: Dict[str, Any] = None + stats: Optional[Dict[str, Any]] = None if line: stats = json.loads(line.decode("ascii")) else: last_line_decoded = True if last_archive_stats is not None and ( - last_line_decoded or stats["id"] != last_archive_stats["id"] + None is stats or stats["id"] != last_archive_stats["id"] ): if s3_config is not None: result = upload_single_file_archive_to_s3( diff --git a/components/job-orchestration/job_orchestration/executor/s3_utils.py b/components/job-orchestration/job_orchestration/executor/s3_utils.py index 3a3be0ef0..dd6a215ba 100644 --- a/components/job-orchestration/job_orchestration/executor/s3_utils.py +++ b/components/job-orchestration/job_orchestration/executor/s3_utils.py @@ -1,12 +1,15 @@ from pathlib import Path import boto3 +from botocore.config import Config from botocore.exceptions import ClientError from clp_py_utils.clp_config import S3Config from clp_py_utils.result import Result -def s3_put(s3_config: S3Config, src_file: Path, dest_file_name: str) -> Result: +def s3_put( + s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 +) -> Result: if not src_file.exists(): return Result(success=False, error=f"{src_file} doesn't exist") @@ -19,10 +22,11 @@ def s3_put(s3_config: S3Config, src_file: Path, dest_file_name: str) -> Result: "aws_secret_access_key": s3_config.secret_access_key, } - my_s3_client = boto3.client("s3", **s3_client_args) + config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) + + my_s3_client = boto3.client("s3", **s3_client_args, config=config) with open(src_file, "rb") as file_data: - # TODO: Consider retry? try: my_s3_client.put_object( Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name From 35ec0c37569b6d558b994c3d4fb41c70eeeeba8c Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:47:17 -0500 Subject: [PATCH 07/62] Combine s3 utils --- .../clp-py-utils/clp_py_utils/s3_utils.py | 38 ++++++++++++++++- .../executor/compress/fs_compression_task.py | 2 +- .../job_orchestration/executor/s3_utils.py | 41 ------------------- 3 files changed, 38 insertions(+), 43 deletions(-) delete mode 100644 components/job-orchestration/job_orchestration/executor/s3_utils.py diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 905e30fac..83346e4de 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -1,6 +1,8 @@ +from pathlib import Path + import boto3 +from botocore.config import Config from botocore.exceptions import ClientError - from clp_py_utils.clp_config import S3Config from clp_py_utils.result import Result @@ -11,3 +13,37 @@ def verify_s3_config_for_archive_output(s3_config: S3Config) -> Result: # 2. Have read priviledge so archive can be readed # 3. 
bucket and region are the same, this should run into issue but not sure return Result(success=True) + + +def s3_put( + s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 +) -> Result: + + if not src_file.exists(): + return Result(success=False, error=f"{src_file} doesn't exist") + if not src_file.is_file(): + return Result(success=False, error=f"{src_file} is not a file") + + s3_client_args = { + "region_name": s3_config.region_name, + "aws_access_key_id": s3_config.access_key_id, + "aws_secret_access_key": s3_config.secret_access_key, + } + + config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) + + my_s3_client = boto3.client("s3", **s3_client_args, config=config) + + with open(src_file, "rb") as file_data: + try: + my_s3_client.put_object( + Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name + ) + except ClientError as e: + error_code = e.response["Error"]["Code"] + error_message = e.response["Error"]["Message"] + return Result(success=False, error=f"ClientError: {error_code} - {error_message}") + except Exception as e: + return Result(success=False, error=f"An unexpected error occurred: {e}") + + return Result(success=True) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 756c3f9b3..a042a3611 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -19,8 +19,8 @@ from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.result import Result from clp_py_utils.sql_adapter import SQL_Adapter +from clp_py_utils.s3_utils import s3_put from job_orchestration.executor.compress.celery import app -from job_orchestration.executor.s3_utils import s3_put from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress from job_orchestration.scheduler.scheduler_data import CompressionTaskResult diff --git a/components/job-orchestration/job_orchestration/executor/s3_utils.py b/components/job-orchestration/job_orchestration/executor/s3_utils.py deleted file mode 100644 index dd6a215ba..000000000 --- a/components/job-orchestration/job_orchestration/executor/s3_utils.py +++ /dev/null @@ -1,41 +0,0 @@ -from pathlib import Path - -import boto3 -from botocore.config import Config -from botocore.exceptions import ClientError -from clp_py_utils.clp_config import S3Config -from clp_py_utils.result import Result - - -def s3_put( - s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 -) -> Result: - - if not src_file.exists(): - return Result(success=False, error=f"{src_file} doesn't exist") - if not src_file.is_file(): - return Result(success=False, error=f"{src_file} is not a file") - - s3_client_args = { - "region_name": s3_config.region_name, - "aws_access_key_id": s3_config.access_key_id, - "aws_secret_access_key": s3_config.secret_access_key, - } - - config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) - - my_s3_client = boto3.client("s3", **s3_client_args, config=config) - - with open(src_file, "rb") as file_data: - try: - my_s3_client.put_object( - Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name - ) - except ClientError as 
e: - error_code = e.response["Error"]["Code"] - error_message = e.response["Error"]["Message"] - return Result(success=False, error=f"ClientError: {error_code} - {error_message}") - except Exception as e: - return Result(success=False, error=f"An unexpected error occurred: {e}") - - return Result(success=True) From 5d57b101217be05d54a9bde581a4021686f668d2 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:47:46 -0500 Subject: [PATCH 08/62] Support handling S3 error message --- .../initialize-orchestration-db.py | 2 +- .../compress/compression_scheduler.py | 22 ++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py index 1ed727367..2c8133e8a 100644 --- a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py +++ b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py @@ -52,7 +52,7 @@ def main(argv): CREATE TABLE IF NOT EXISTS `{COMPRESSION_JOBS_TABLE_NAME}` ( `id` INT NOT NULL AUTO_INCREMENT, `status` INT NOT NULL DEFAULT '{CompressionJobStatus.PENDING}', - `status_msg` VARCHAR(255) NOT NULL DEFAULT '', + `status_msg` VARCHAR(512) NOT NULL DEFAULT '', `creation_time` DATETIME(3) NOT NULL DEFAULT CURRENT_TIMESTAMP(3), `start_time` DATETIME(3) NULL DEFAULT NULL, `duration` FLOAT NULL DEFAULT NULL, diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index 62b7a27fc..2de3e835b 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -53,13 +53,14 @@ def update_compression_task_metadata(db_cursor, task_id, kv): logger.error("Must specify at least one field to update") raise ValueError - field_set_expressions = [f'{k}="{v}"' for k, v in kv.items()] + field_set_expressions = [f"{k} = %s" for k in kv.keys()] query = f""" - UPDATE {COMPRESSION_TASKS_TABLE_NAME} - SET {", ".join(field_set_expressions)} - WHERE id={task_id} + UPDATE {COMPRESSION_TASKS_TABLE_NAME} + SET {", ".join(field_set_expressions)} + WHERE id = %s """ - db_cursor.execute(query) + values = [v for v in kv.values()] + [task_id] + db_cursor.execute(query, values) def update_compression_job_metadata(db_cursor, job_id, kv): @@ -67,13 +68,14 @@ def update_compression_job_metadata(db_cursor, job_id, kv): logger.error("Must specify at least one field to update") raise ValueError - field_set_expressions = [f'{k}="{v}"' for k, v in kv.items()] + field_set_expressions = [f"{k} = %s" for k in kv.keys()] query = f""" - UPDATE {COMPRESSION_JOBS_TABLE_NAME} - SET {", ".join(field_set_expressions)} - WHERE id={job_id} + UPDATE {COMPRESSION_JOBS_TABLE_NAME} + SET {", ".join(field_set_expressions)} + WHERE id = %s """ - db_cursor.execute(query) + values = [v for v in kv.values()] + [job_id] + db_cursor.execute(query, values) def search_and_schedule_new_tasks(db_conn, db_cursor, clp_metadata_db_connection_config): From 999130766a457480a3b0f948ba6ff459715c3797 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:12:06 -0500 Subject: [PATCH 09/62] Slight logging modification --- .../executor/compress/fs_compression_task.py | 8 +++++--- 1 file changed, 5 
insertions(+), 3 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index a042a3611..f808a27ee 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -80,15 +80,16 @@ def upload_single_file_archive_to_s3( archive_dir: pathlib.Path, s3_config: S3Config, ) -> Result: - logger.info("Starting to upload to S3...") archive_id = archive_stats["id"] + + logger.info(f"Starting to upload archive {archive_id} to S3...") src_file = archive_dir / archive_id result = s3_put(s3_config, src_file, archive_id) if not result.success: - logger.error(f"Failed to upload to S3: {result.error}") + logger.error(f"Failed to upload archive {archive_id}: {result.error}") return result - logger.info("Finished uploading to S3...") + logger.info(f"Finished uploading archive {archive_id} to S3...") src_file.unlink() return Result(success=True) @@ -318,6 +319,7 @@ def run_clp( stderr_log_file.close() if s3_upload_failed: + logger.error(f"Failed to upload to S3.") return CompressionTaskStatus.FAILED, worker_output if compression_successful: return CompressionTaskStatus.SUCCEEDED, worker_output From 5d237907d0e6d679865f71e68bcb0a8d79339109 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:12:32 -0500 Subject: [PATCH 10/62] Linter --- components/clp-py-utils/clp_py_utils/s3_utils.py | 1 + .../job_orchestration/executor/compress/fs_compression_task.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 83346e4de..df2633214 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -3,6 +3,7 @@ import boto3 from botocore.config import Config from botocore.exceptions import ClientError + from clp_py_utils.clp_config import S3Config from clp_py_utils.result import Result diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index f808a27ee..120f9179a 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -18,8 +18,8 @@ ) from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.result import Result -from clp_py_utils.sql_adapter import SQL_Adapter from clp_py_utils.s3_utils import s3_put +from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.celery import app from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress From b4bb2af1ecb0002b23049986e90769c9f87e3268 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:15:52 -0500 Subject: [PATCH 11/62] Add extra verification --- components/clp-py-utils/clp_py_utils/clp_config.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index e2398b0be..205a65ec5 
100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -310,15 +310,10 @@ class Queue(BaseModel): class S3Config(BaseModel): - # Todo: - # Does key_prefix need to end with '/'? maybe it doesn't. - - # Required fields region_name: str bucket: str key_prefix: str - # Optional fields access_key_id: Optional[str] = None secret_access_key: Optional[str] = None @@ -338,6 +333,8 @@ def validate_bucket(cls, field): def validate_key_prefix(cls, field): if field == "": raise ValueError("key_prefix is not provided") + if not field.endswith("/"): + raise ValueError(r'key_prefix must end with "/"') return field From f41c5587755fc672430c33e4fcd5a190c12a4ed0 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:17:11 -0500 Subject: [PATCH 12/62] Update components/clp-py-utils/clp_py_utils/clp_config.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- components/clp-py-utils/clp_py_utils/clp_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 205a65ec5..3694c8312 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -497,7 +497,7 @@ def validate_port(cls, field): class CLPConfig(BaseModel): - execution_container: Optional[str] + execution_container: Optional[str] = None input_logs_directory: pathlib.Path = pathlib.Path("/") From ce5a667aa372c906aceca1c2880452f1f10807c7 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:22:27 -0500 Subject: [PATCH 13/62] do nothing for now --- components/clp-py-utils/clp_py_utils/s3_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index df2633214..0c4356407 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -9,10 +9,7 @@ def verify_s3_config_for_archive_output(s3_config: S3Config) -> Result: - # TODO: need to verify: - # 1. Have write priveldge so archive can be compressed - # 2. Have read priviledge so archive can be readed - # 3. 
bucket and region are the same, this should run into issue but not sure + # TODO: properly verify the s3 config return Result(success=True) From f05dc889237cc632f72865a13d3a111a09722aae Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:53:54 -0500 Subject: [PATCH 14/62] backup changes for worker config --- .../clp_package_utils/general.py | 11 ++++ .../clp_package_utils/scripts/start_clp.py | 44 ++++----------- .../clp-py-utils/clp_py_utils/clp_config.py | 17 ++++++ .../executor/compress/fs_compression_task.py | 55 ++++++++++--------- 4 files changed, 68 insertions(+), 59 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 05a67d497..507e1e434 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -14,6 +14,7 @@ from clp_py_utils.clp_config import ( CLP_DEFAULT_CREDENTIALS_FILE_PATH, CLPConfig, + WorkerConfig, DB_COMPONENT_NAME, LOG_VIEWER_WEBUI_COMPONENT_NAME, QUEUE_COMPONENT_NAME, @@ -270,6 +271,16 @@ def generate_container_config( return container_clp_config, docker_mounts +def generate_worker_config(clp_config: CLPConfig) -> WorkerConfig: + worker_config = WorkerConfig() + worker_config.archive_output = clp_config.archive_output.copy(deep=True) + worker_config.stream_output = clp_config.stream_output.copy(deep=True) + worker_config.package = clp_config.package.copy(deep=True) + worker_config.data_directory = clp_config.data_directory + + return worker_config + + def dump_container_config( container_clp_config: CLPConfig, clp_config: CLPConfig, container_name: str ) -> Tuple[pathlib.Path, pathlib.Path]: diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 8351a8836..f94fefb0b 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -42,6 +42,7 @@ DockerMount, DockerMountType, generate_container_config, + generate_worker_config, get_clp_home, is_container_exited, is_container_running, @@ -638,7 +639,6 @@ def start_compression_worker( num_cpus, mounts, None, - None, ) @@ -653,10 +653,6 @@ def start_query_worker( celery_route = f"{QueueName.QUERY}" query_worker_mount = [mounts.stream_output_dir] - query_worker_env = { - "CLP_STREAM_OUTPUT_DIR": container_clp_config.stream_output.directory, - "CLP_STREAM_COLLECTION_NAME": clp_config.results_cache.stream_collection_name, - } generic_start_worker( QUERY_WORKER_COMPONENT_NAME, @@ -669,7 +665,6 @@ def start_query_worker( clp_config.redis.query_backend_database, num_cpus, mounts, - query_worker_env, query_worker_mount, ) @@ -685,8 +680,7 @@ def generic_start_worker( redis_database: int, num_cpus: int, mounts: CLPDockerMounts, - worker_specific_env: Dict[str, Any], - worker_specific_mount: List[Optional[DockerMount]], + worker_specific_mount: Optional[List[Optional[DockerMount]]], ): logger.info(f"Starting {component_name}...") @@ -694,6 +688,12 @@ def generic_start_worker( if container_exists(container_name): return + container_config_filename = f"{container_name}.yml" + container_config_file_path = clp_config.logs_directory / container_config_filename + container_worker_config = generate_worker_config(container_clp_config) + with open(container_config_file_path, "w") as f: + 
yaml.safe_dump(container_worker_config.dump_to_primitive_dict(), f) + logs_dir = clp_config.logs_directory / component_name logs_dir.mkdir(parents=True, exist_ok=True) container_logs_dir = container_clp_config.logs_directory / component_name @@ -723,39 +723,19 @@ def generic_start_worker( f"{container_clp_config.redis.host}:{container_clp_config.redis.port}/{redis_database}" ), "-e", f"CLP_HOME={CONTAINER_CLP_HOME}", - "-e", f"CLP_DATA_DIR={container_clp_config.data_directory}", + "-e", f"WORKER_CONFIG_PATH={container_clp_config.logs_directory / container_config_filename}", + # Follow the other method. "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", - "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", - # need a way to remove this maybe - "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.get_directory()}", ] - if worker_specific_env: - for env_name, env_value in worker_specific_env.items(): - container_start_cmd.append("-e") - container_start_cmd.append(f"{env_name}={env_value}") - - if "s3" == clp_config.archive_output.storage.type: - s3_config = clp_config.archive_output.storage.s3_config - container_start_cmd += [ - "-e", f"ENABLE_S3_ARCHIVE=1", - "-e", f"REGION_NAME={s3_config.region_name}", - "-e", f"BUCKET={s3_config.bucket}", - "-e", f"KEY_PREFIX={s3_config.key_prefix}" - ] - if s3_config.secret_access_key is not None and s3_config.secret_access_key is not None: - container_start_cmd += [ - "-e", f"ACCESS_KEY_ID={s3_config.access_key_id}", - "-e", f"SECRET_ACCESS_KEY={s3_config.secret_access_key}" - ] # fmt: on necessary_mounts = [ mounts.clp_home, mounts.data_dir, mounts.logs_dir, - # need a way to remove this maybe, since reader doesn't need it if it is staged. - # one option is to move it to the worker_specific_mount + # TODO: need a way to remove this maybe, since reader doesn't need it if it + # is staged. one option is to move it to the worker_specific_mount mounts.archives_output_dir, mounts.input_logs_dir, ] diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 3694c8312..e33c29edf 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -633,3 +633,20 @@ def dump_to_primitive_dict(self): d["data_directory"] = str(self.data_directory) d["logs_directory"] = str(self.logs_directory) return d + + +class WorkerConfig(BaseModel): + package: Package = Package() + archive_output: ArchiveOutput = ArchiveOutput() + # Only needed by query worker. Consider inheriting at some point. 
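A minimal sketch, not part of the patch, of the inheritance the comment above alludes to: a shared base model for fields every worker needs, plus a query-worker subclass that adds the stream-related field. The class names are hypothetical and reuse the models already defined in this module.

import pathlib

from pydantic import BaseModel


class BaseWorkerConfigSketch(BaseModel):
    # Fields shared by compression and query workers
    package: Package = Package()
    archive_output: ArchiveOutput = ArchiveOutput()
    data_directory: pathlib.Path = pathlib.Path("var") / "data"


class QueryWorkerConfigSketch(BaseWorkerConfigSketch):
    # Query-worker-only addition
    stream_output: StreamOutput = StreamOutput()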
+ stream_output: StreamOutput = StreamOutput() + + data_directory: pathlib.Path = pathlib.Path("var") / "data" + + def dump_to_primitive_dict(self): + d = self.dict() + d["archive_output"]["storage"] = self.archive_output.storage.dump_to_primitive_dict() + d["stream_output"] = self.stream_output.dump_to_primitive_dict() + # Turn paths into primitive strings + d["data_directory"] = str(self.data_directory) + return d \ No newline at end of file diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 120f9179a..66a88ad18 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -15,8 +15,11 @@ Database, S3Config, StorageEngine, + StorageType, + WorkerConfig ) from clp_py_utils.clp_logging import set_logging_level +from clp_py_utils.core import read_yaml_config_file from clp_py_utils.result import Result from clp_py_utils.s3_utils import s3_put from clp_py_utils.sql_adapter import SQL_Adapter @@ -24,6 +27,7 @@ from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress from job_orchestration.scheduler.scheduler_data import CompressionTaskResult +from pydantic import ValidationError # Setup logging logger = get_task_logger(__name__) @@ -94,23 +98,6 @@ def upload_single_file_archive_to_s3( return Result(success=True) -def get_s3_config() -> Optional[S3Config]: - enable_s3_write = os.getenv("ENABLE_S3_ARCHIVE") - if enable_s3_write is None: - return None - - # TODO: this method is very errorprone since it doesn't check individual members - # Let's leave this for now before we properly implement the config file. 
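A minimal sketch, not part of the patch, contrasting the env-var lookup being removed here with the config-file approach that replaces it: parsing the dumped YAML through pydantic validates every member up front, whereas os.getenv silently returns None for anything unset. The config path below is hypothetical.

import os
import pathlib

import yaml
from pydantic import ValidationError

from clp_py_utils.clp_config import WorkerConfig

# Env-var style: a missing variable simply becomes None; nothing fails here.
region = os.getenv("REGION_NAME")

# Config-file style: every field is checked when the model is parsed.
config_path = pathlib.Path("/etc/clp/worker-config.yml")  # hypothetical path
try:
    with open(config_path) as f:
        worker_config = WorkerConfig.parse_obj(yaml.safe_load(f))
except ValidationError as err:
    # e.g. an empty bucket or a key_prefix without a trailing "/" is reported here
    print(f"Invalid worker config: {err}")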
- s3_config = S3Config( - region_name=os.getenv("REGION_NAME"), - bucket=os.getenv("BUCKET"), - key_prefix=os.getenv("KEY_PREFIX"), - access_key_id=os.getenv("ACCESS_KEY_ID"), - secret_access_key=os.getenv("SECRET_ACCESS_KEY"), - ) - return s3_config - - def make_clp_command( clp_home: pathlib.Path, archive_output_dir: pathlib.Path, @@ -169,6 +156,7 @@ def make_clp_s_command( def run_clp( + worker_config: WorkerConfig, clp_config: ClpIoConfig, clp_home: pathlib.Path, data_dir: pathlib.Path, @@ -184,6 +172,7 @@ def run_clp( """ Compresses files from an FS into archives on an FS + :param worker_config: WorkerConfig :param clp_config: ClpIoConfig :param clp_home: :param data_dir: @@ -197,7 +186,7 @@ def run_clp( :param clp_metadata_db_connection_config :return: tuple -- (whether compression was successful, output messages) """ - clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) + clp_storage_engine = worker_config.package.storage_engine instance_id_str = f"compression-job-{job_id}-task-{task_id}" @@ -208,7 +197,8 @@ def run_clp( db_config_file.close() # Get s3 config - s3_config = get_s3_config() + storage_config = worker_config.archive_output.storage + s3_config = storage_config.s3_config if StorageType.S3 == storage_config.type else None s3_upload_failed = False if StorageEngine.CLP == clp_storage_engine: @@ -338,10 +328,20 @@ def compress( paths_to_compress_json: str, clp_metadata_db_connection_config, ): - clp_home_str = os.getenv("CLP_HOME") - data_dir_str = os.getenv("CLP_DATA_DIR") - archive_output_dir_str = os.getenv("CLP_ARCHIVE_OUTPUT_DIR") - logs_dir_str = os.getenv("CLP_LOGS_DIR") + clp_home = pathlib.Path(os.getenv("CLP_HOME")) + logs_dir = pathlib.Path(os.getenv("CLP_LOGS_DIR")) + + # Load configuration + worker_config_path = pathlib.Path(os.getenv("WORKER_CONFIG_PATH")) + try: + worker_config = WorkerConfig.parse_obj(read_yaml_config_file(worker_config_path)) + except ValidationError as err: + logger.error(err) + return -1 + except Exception as ex: + logger.error(ex) + return -1 + # Set logging level clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) @@ -355,11 +355,12 @@ def compress( start_time = datetime.datetime.now() logger.info(f"[job_id={job_id} task_id={task_id}] COMPRESSION STARTED.") compression_task_status, worker_output = run_clp( + worker_config, clp_io_config, - pathlib.Path(clp_home_str), - pathlib.Path(data_dir_str), - pathlib.Path(archive_output_dir_str), - pathlib.Path(logs_dir_str), + clp_home, + worker_config.data_directory, + worker_config.archive_output.get_directory(), + logs_dir, job_id, task_id, tag_ids, From abf5dde5d337136b2acffc100808a92b79c06e88 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:18:12 -0500 Subject: [PATCH 15/62] More support --- .../clp_package_utils/general.py | 1 + .../clp-py-utils/clp_py_utils/clp_config.py | 7 +- .../executor/compress/fs_compression_task.py | 64 +++++++++++-------- .../executor/query/extract_stream_task.py | 46 ++++++++----- .../executor/query/fs_search_task.py | 38 +++++++---- .../job_orchestration/executor/query/utils.py | 5 +- .../job_orchestration/executor/utils.py | 18 ++++++ 7 files changed, 117 insertions(+), 62 deletions(-) create mode 100644 components/job-orchestration/job_orchestration/executor/utils.py diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 507e1e434..81bc238aa 100644 --- a/components/clp-package-utils/clp_package_utils/general.py 
+++ b/components/clp-package-utils/clp_package_utils/general.py @@ -276,6 +276,7 @@ def generate_worker_config(clp_config: CLPConfig) -> WorkerConfig: worker_config.archive_output = clp_config.archive_output.copy(deep=True) worker_config.stream_output = clp_config.stream_output.copy(deep=True) worker_config.package = clp_config.package.copy(deep=True) + worker_config.results_cache = clp_config.results_cache.copy(deep=True) worker_config.data_directory = clp_config.data_directory return worker_config diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index e33c29edf..a4ca08a7d 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -638,11 +638,12 @@ def dump_to_primitive_dict(self): class WorkerConfig(BaseModel): package: Package = Package() archive_output: ArchiveOutput = ArchiveOutput() - # Only needed by query worker. Consider inheriting at some point. - stream_output: StreamOutput = StreamOutput() - data_directory: pathlib.Path = pathlib.Path("var") / "data" + # Only needed by query worker. Consider inheriting at some point. + stream_output = StreamOutput() + results_cache: ResultsCache = ResultsCache() + def dump_to_primitive_dict(self): d = self.dict() d["archive_output"]["storage"] = self.archive_output.storage.dump_to_primitive_dict() diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 66a88ad18..244b77ebe 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -135,12 +135,12 @@ def make_clp_s_command( archive_output_dir: pathlib.Path, clp_config: ClpIoConfig, db_config_file_path: pathlib.Path, + enable_s3_write: bool, ): # fmt: off compression_cmd = [ str(clp_home / "bin" / "clp-s"), "c", str(archive_output_dir), - "--single-file-archive", "--print-archive-stats", "--target-encoded-size", str(clp_config.output.target_segment_size + clp_config.output.target_dictionaries_size), @@ -148,6 +148,9 @@ def make_clp_s_command( ] # fmt: on + if enable_s3_write: + compression_cmd.append("--single-file-archive") + if clp_config.input.timestamp_key is not None: compression_cmd.append("--timestamp-key") compression_cmd.append(clp_config.input.timestamp_key) @@ -159,8 +162,6 @@ def run_clp( worker_config: WorkerConfig, clp_config: ClpIoConfig, clp_home: pathlib.Path, - data_dir: pathlib.Path, - archive_output_dir: pathlib.Path, logs_dir: pathlib.Path, job_id: int, task_id: int, @@ -175,8 +176,6 @@ def run_clp( :param worker_config: WorkerConfig :param clp_config: ClpIoConfig :param clp_home: - :param data_dir: - :param archive_output_dir: :param logs_dir: :param job_id: :param task_id: @@ -186,10 +185,12 @@ def run_clp( :param clp_metadata_db_connection_config :return: tuple -- (whether compression was successful, output messages) """ - clp_storage_engine = worker_config.package.storage_engine - instance_id_str = f"compression-job-{job_id}-task-{task_id}" + clp_storage_engine = worker_config.package.storage_engine + data_dir = worker_config.data_directory + archive_output_dir = worker_config.archive_output.get_directory() + # Generate database config file for clp db_config_file_path = data_dir / f"{instance_id_str}-db-config.yml" db_config_file = open(db_config_file_path, "w") @@ 
-197,9 +198,18 @@ def run_clp( db_config_file.close() # Get s3 config - storage_config = worker_config.archive_output.storage - s3_config = storage_config.s3_config if StorageType.S3 == storage_config.type else None - s3_upload_failed = False + s3_config = None + enable_s3_write = False + s3_write_failed = False + storage_type = worker_config.archive_output.storage.type + if StorageType.S3 == storage_type: + # This should be caught by start-clp and could be redundant, but let's be safe for now. + if StorageEngine.CLP == clp_storage_engine: + logger.error(f"S3 is not supported for {clp_storage_engine}") + return False, {"error_message": f"S3 is not supported for {clp_storage_engine}"} + + s3_config = worker_config.archive_output.storage.s3_config + enable_s3_write = True if StorageEngine.CLP == clp_storage_engine: compression_cmd = make_clp_command( @@ -214,6 +224,7 @@ def run_clp( archive_output_dir=archive_output_dir, clp_config=clp_config, db_config_file_path=db_config_file_path, + enable_s3_write=enable_s3_write ) else: logger.error(f"Unsupported storage engine {clp_storage_engine}") @@ -264,13 +275,13 @@ def run_clp( if last_archive_stats is not None and ( None is stats or stats["id"] != last_archive_stats["id"] ): - if s3_config is not None: + if enable_s3_write: result = upload_single_file_archive_to_s3( last_archive_stats, archive_output_dir, s3_config ) if not result.success: worker_output["error_message"] = result.error - s3_upload_failed = True + s3_write_failed = True # Upon failure, skip updating the archive tags and job metadata. break @@ -308,7 +319,7 @@ def run_clp( # Close stderr log file stderr_log_file.close() - if s3_upload_failed: + if s3_write_failed: logger.error(f"Failed to upload to S3.") return CompressionTaskStatus.FAILED, worker_output if compression_successful: @@ -329,23 +340,24 @@ def compress( clp_metadata_db_connection_config, ): clp_home = pathlib.Path(os.getenv("CLP_HOME")) + + # Set logging level logs_dir = pathlib.Path(os.getenv("CLP_LOGS_DIR")) + clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) + set_logging_level(logger, clp_logging_level) # Load configuration - worker_config_path = pathlib.Path(os.getenv("WORKER_CONFIG_PATH")) try: - worker_config = WorkerConfig.parse_obj(read_yaml_config_file(worker_config_path)) - except ValidationError as err: - logger.error(err) - return -1 + worker_config = WorkerConfig.parse_obj(read_yaml_config_file(pathlib.Path(os.getenv("WORKER_CONFIG_PATH")))) except Exception as ex: - logger.error(ex) - return -1 - - - # Set logging level - clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) - set_logging_level(logger, clp_logging_level) + error_msg = "Failed to load worker config" + logger.exception(error_msg) + return CompressionTaskResult( + task_id=task_id, + status=CompressionTaskStatus.FAILED, + duration=0, + error_message=error_msg + ) clp_io_config = ClpIoConfig.parse_raw(clp_io_config_json) paths_to_compress = PathsToCompress.parse_raw(paths_to_compress_json) @@ -358,8 +370,6 @@ def compress( worker_config, clp_io_config, clp_home, - worker_config.data_directory, - worker_config.archive_output.get_directory(), logs_dir, job_id, task_id, diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index 423ebb757..10c98b457 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ 
b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -5,12 +5,13 @@ from celery.app.task import Task from celery.utils.log import get_task_logger -from clp_py_utils.clp_config import Database, StorageEngine +from clp_py_utils.clp_config import Database, StorageEngine, StorageType, WorkerConfig from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter +from job_orchestration.executor.utils import try_load_worker_config from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( - report_command_creation_failure, + report_task_failure, run_query_task, ) from job_orchestration.scheduler.job_config import ExtractIrJobConfig, ExtractJsonJobConfig @@ -21,15 +22,17 @@ def make_command( - storage_engine: str, clp_home: Path, - archives_dir: Path, + worker_config: WorkerConfig, archive_id: str, - stream_output_dir: Path, job_config: dict, results_cache_uri: str, - stream_collection_name: str, ) -> Optional[List[str]]: + storage_engine = worker_config.package.storage_engine + archives_dir = worker_config.archive_output.get_directory() + stream_output_dir = worker_config.stream_output.directory + stream_collection_name = worker_config.results_cache.stream_collection_name + if StorageEngine.CLP == storage_engine: logger.info("Starting IR extraction") extract_ir_config = ExtractIrJobConfig.parse_obj(job_config) @@ -97,28 +100,37 @@ def extract_stream( task_status: QueryTaskStatus sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) + # Load configuration + worker_config = try_load_worker_config(os.getenv("WORKER_CONFIG_PATH"), logger) + if worker_config is None: + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + + if worker_config.archive_output.storage.type == StorageType.S3: + logger.error(f"Extraction is not supported for S3 storage type") + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + # Make task_command clp_home = Path(os.getenv("CLP_HOME")) - archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) - clp_storage_engine = os.getenv("CLP_STORAGE_ENGINE") - stream_output_dir = Path(os.getenv("CLP_STREAM_OUTPUT_DIR")) - stream_collection_name = os.getenv("CLP_STREAM_COLLECTION_NAME") task_command = make_command( - storage_engine=clp_storage_engine, clp_home=clp_home, - archives_dir=archive_directory, + worker_config=worker_config, archive_id=archive_id, - stream_output_dir=stream_output_dir, job_config=job_config, results_cache_uri=results_cache_uri, - stream_collection_name=stream_collection_name, ) if not task_command: - return report_command_creation_failure( + logger.error(f"Error creating {task_name} command") + return report_task_failure( sql_adapter=sql_adapter, - logger=logger, - task_name=task_name, task_id=task_id, start_time=start_time, ) diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 598bfdcfc..6f0337b08 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -5,14 +5,15 @@ from celery.app.task import Task from celery.utils.log import get_task_logger -from clp_py_utils.clp_config import Database, StorageEngine +from clp_py_utils.clp_config import Database, 
StorageEngine, StorageType, WorkerConfig from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( - report_command_creation_failure, + report_task_failure, run_query_task, ) +from job_orchestration.executor.utils import try_load_worker_config from job_orchestration.scheduler.job_config import SearchJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -21,14 +22,16 @@ def make_command( - storage_engine: str, clp_home: Path, - archives_dir: Path, + worker_config: WorkerConfig, archive_id: str, search_config: SearchJobConfig, results_cache_uri: str, results_collection: str, ) -> Optional[List[str]]: + storage_engine = worker_config.package.storage_engine + archives_dir = worker_config.archive_output.get_directory() + if StorageEngine.CLP == storage_engine: command = [str(clp_home / "bin" / "clo"), "s", str(archives_dir / archive_id)] if search_config.path_filter is not None: @@ -116,26 +119,39 @@ def search( task_status: QueryTaskStatus sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) + # Load configuration + worker_config = try_load_worker_config(os.getenv("WORKER_CONFIG_PATH"), logger) + if worker_config is None: + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + + if worker_config.archive_output.storage.type == StorageType.S3: + logger.error(f"Search is not supported for S3 storage type") + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + # Make task_command clp_home = Path(os.getenv("CLP_HOME")) - archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) - clp_storage_engine = os.getenv("CLP_STORAGE_ENGINE") search_config = SearchJobConfig.parse_obj(job_config) task_command = make_command( - storage_engine=clp_storage_engine, clp_home=clp_home, - archives_dir=archive_directory, + worker_config=worker_config, archive_id=archive_id, search_config=search_config, results_cache_uri=results_cache_uri, results_collection=job_id, ) if not task_command: - return report_command_creation_failure( + logger.error(f"Error creating {task_name} command") + return report_task_failure( sql_adapter=sql_adapter, - logger=logger, - task_name=task_name, task_id=task_id, start_time=start_time, ) diff --git a/components/job-orchestration/job_orchestration/executor/query/utils.py b/components/job-orchestration/job_orchestration/executor/query/utils.py index 69d22398e..523abbe00 100644 --- a/components/job-orchestration/job_orchestration/executor/query/utils.py +++ b/components/job-orchestration/job_orchestration/executor/query/utils.py @@ -19,14 +19,11 @@ def get_task_log_file_path(clp_logs_dir: Path, job_id: str, task_id: int) -> Pat return worker_logs_dir / f"{task_id}-clo.log" -def report_command_creation_failure( +def report_task_failure( sql_adapter: SQL_Adapter, - logger: Logger, - task_name: str, task_id: int, start_time: datetime.datetime, ): - logger.error(f"Error creating {task_name} command") task_status = QueryTaskStatus.FAILED update_query_task_metadata( sql_adapter, diff --git a/components/job-orchestration/job_orchestration/executor/utils.py b/components/job-orchestration/job_orchestration/executor/utils.py new file mode 100644 index 000000000..fd5cc27ed --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/utils.py @@ -0,0 +1,18 @@ +from typing import 
Optional +from logging import Logger +from clp_py_utils.clp_config import WorkerConfig +from clp_py_utils.core import read_yaml_config_file +from pathlib import Path +def try_load_worker_config( + config_path_str: str, + logger: Logger, +) -> Optional[WorkerConfig]: + if config_path_str is None: + logger.error("config_path_str can't be empty") + return None + + try: + return WorkerConfig.parse_obj(read_yaml_config_file(Path(config_path_str))) + except Exception: + logger.exception("Failed to load worker config") + return None \ No newline at end of file From 7d344561054f4f306ce56270f4796b7fe34a7a97 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:22:44 -0500 Subject: [PATCH 16/62] Remove unnecssary change --- .../clp-package-utils/clp_package_utils/scripts/start_clp.py | 2 +- .../job_orchestration/executor/compress/fs_compression_task.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index f94fefb0b..0b665c717 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -724,9 +724,9 @@ def generic_start_worker( ), "-e", f"CLP_HOME={CONTAINER_CLP_HOME}", "-e", f"WORKER_CONFIG_PATH={container_clp_config.logs_directory / container_config_filename}", - # Follow the other method. "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", + "-u", f"{os.getuid()}:{os.getgid()}", ] # fmt: on diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 244b77ebe..b13a2257b 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -27,7 +27,6 @@ from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress from job_orchestration.scheduler.scheduler_data import CompressionTaskResult -from pydantic import ValidationError # Setup logging logger = get_task_logger(__name__) From a7afd0d87805e141e0ec837de533ca54b59fdb10 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:24:25 -0500 Subject: [PATCH 17/62] Linter --- .../clp-package-utils/clp_package_utils/general.py | 2 +- components/clp-py-utils/clp_py_utils/clp_config.py | 2 +- .../executor/compress/fs_compression_task.py | 10 ++++++---- .../executor/query/extract_stream_task.py | 2 +- .../job_orchestration/executor/utils.py | 9 ++++++--- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 81bc238aa..e1ffee795 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -14,7 +14,6 @@ from clp_py_utils.clp_config import ( CLP_DEFAULT_CREDENTIALS_FILE_PATH, CLPConfig, - WorkerConfig, DB_COMPONENT_NAME, LOG_VIEWER_WEBUI_COMPONENT_NAME, QUEUE_COMPONENT_NAME, @@ -23,6 +22,7 @@ RESULTS_CACHE_COMPONENT_NAME, StorageType, WEBUI_COMPONENT_NAME, + WorkerConfig, ) from 
clp_py_utils.core import ( get_config_value, diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index a4ca08a7d..226d49d2d 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -650,4 +650,4 @@ def dump_to_primitive_dict(self): d["stream_output"] = self.stream_output.dump_to_primitive_dict() # Turn paths into primitive strings d["data_directory"] = str(self.data_directory) - return d \ No newline at end of file + return d diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index b13a2257b..f96ba72f1 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -16,7 +16,7 @@ S3Config, StorageEngine, StorageType, - WorkerConfig + WorkerConfig, ) from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.core import read_yaml_config_file @@ -223,7 +223,7 @@ def run_clp( archive_output_dir=archive_output_dir, clp_config=clp_config, db_config_file_path=db_config_file_path, - enable_s3_write=enable_s3_write + enable_s3_write=enable_s3_write, ) else: logger.error(f"Unsupported storage engine {clp_storage_engine}") @@ -347,7 +347,9 @@ def compress( # Load configuration try: - worker_config = WorkerConfig.parse_obj(read_yaml_config_file(pathlib.Path(os.getenv("WORKER_CONFIG_PATH")))) + worker_config = WorkerConfig.parse_obj( + read_yaml_config_file(pathlib.Path(os.getenv("WORKER_CONFIG_PATH"))) + ) except Exception as ex: error_msg = "Failed to load worker config" logger.exception(error_msg) @@ -355,7 +357,7 @@ def compress( task_id=task_id, status=CompressionTaskStatus.FAILED, duration=0, - error_message=error_msg + error_message=error_msg, ) clp_io_config = ClpIoConfig.parse_raw(clp_io_config_json) diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index 10c98b457..a1df2d5bc 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -8,12 +8,12 @@ from clp_py_utils.clp_config import Database, StorageEngine, StorageType, WorkerConfig from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter -from job_orchestration.executor.utils import try_load_worker_config from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( report_task_failure, run_query_task, ) +from job_orchestration.executor.utils import try_load_worker_config from job_orchestration.scheduler.job_config import ExtractIrJobConfig, ExtractJsonJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus diff --git a/components/job-orchestration/job_orchestration/executor/utils.py b/components/job-orchestration/job_orchestration/executor/utils.py index fd5cc27ed..e00e7ca21 100644 --- a/components/job-orchestration/job_orchestration/executor/utils.py +++ b/components/job-orchestration/job_orchestration/executor/utils.py @@ -1,8 +1,11 @@ -from typing import Optional from logging import Logger +from pathlib import Path +from typing import Optional + 
from clp_py_utils.clp_config import WorkerConfig from clp_py_utils.core import read_yaml_config_file -from pathlib import Path + + def try_load_worker_config( config_path_str: str, logger: Logger, @@ -15,4 +18,4 @@ def try_load_worker_config( return WorkerConfig.parse_obj(read_yaml_config_file(Path(config_path_str))) except Exception: logger.exception("Failed to load worker config") - return None \ No newline at end of file + return None From 99d30940391b0a603221d89d7133990070efd2d2 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:41:48 -0500 Subject: [PATCH 18/62] Handle mount for fs & S3 --- .../clp_package_utils/scripts/start_clp.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 0b665c717..7e33f9f55 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -29,6 +29,7 @@ REDIS_COMPONENT_NAME, REDUCER_COMPONENT_NAME, RESULTS_CACHE_COMPONENT_NAME, + StorageType, WEBUI_COMPONENT_NAME, ) from job_orchestration.scheduler.constants import QueueName @@ -627,6 +628,9 @@ def start_compression_worker( ): celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" + + compression_worker_mount = [mounts.archives_output_dir] + generic_start_worker( COMPRESSION_WORKER_COMPONENT_NAME, instance_id, @@ -638,7 +642,7 @@ def start_compression_worker( clp_config.redis.compression_backend_database, num_cpus, mounts, - None, + compression_worker_mount, ) @@ -653,6 +657,9 @@ def start_query_worker( celery_route = f"{QueueName.QUERY}" query_worker_mount = [mounts.stream_output_dir] + if clp_config.archive_output.storage.type == StorageType.FS: + query_worker_mount.append(mounts.archives_output_dir) + generic_start_worker( QUERY_WORKER_COMPONENT_NAME, @@ -734,9 +741,6 @@ def generic_start_worker( mounts.clp_home, mounts.data_dir, mounts.logs_dir, - # TODO: need a way to remove this maybe, since reader doesn't need it if it - # is staged. 
one option is to move it to the worker_specific_mount - mounts.archives_output_dir, mounts.input_logs_dir, ] if worker_specific_mount: From 1afed1a1a35556dd3db35b44398145cba5559335 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:44:48 -0500 Subject: [PATCH 19/62] Linter --- .../clp-package-utils/clp_package_utils/scripts/start_clp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 7e33f9f55..e708b45ec 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -660,7 +660,6 @@ def start_query_worker( if clp_config.archive_output.storage.type == StorageType.FS: query_worker_mount.append(mounts.archives_output_dir) - generic_start_worker( QUERY_WORKER_COMPONENT_NAME, instance_id, From 1de661a2576bd8cde4f59d0437439c91c0253ec8 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:21:01 -0500 Subject: [PATCH 20/62] Remove unused functions --- components/clp-package-utils/clp_package_utils/general.py | 7 ------- components/clp-py-utils/clp_py_utils/s3_utils.py | 5 ----- 2 files changed, 12 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index e1ffee795..7cfa396f3 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -30,7 +30,6 @@ read_yaml_config_file, validate_path_could_be_dir, ) -from clp_py_utils.s3_utils import verify_s3_config_for_archive_output from strenum import KebabCaseStrEnum # CONSTANTS @@ -499,12 +498,6 @@ def validate_worker_config(clp_config: CLPConfig): clp_config.validate_archive_output_config() clp_config.validate_stream_output_dir() - storage_config = clp_config.archive_output.storage - if StorageType.S3 == storage_config.type: - result = verify_s3_config_for_archive_output(storage_config.s3_config) - if not result.success: - raise ValueError(f"S3 config verification failed: {result.error}") - def validate_webui_config( clp_config: CLPConfig, logs_dir: pathlib.Path, settings_json_path: pathlib.Path diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 0c4356407..8003beb8c 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -8,11 +8,6 @@ from clp_py_utils.result import Result -def verify_s3_config_for_archive_output(s3_config: S3Config) -> Result: - # TODO: properly verify the s3 config - return Result(success=True) - - def s3_put( s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 ) -> Result: From ce3de982fa5e98df5dcd87ea9a1629f7159281cb Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:23:56 -0500 Subject: [PATCH 21/62] Update components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py --- .../job_orchestration/executor/compress/fs_compression_task.py | 1 - 1 file changed, 1 deletion(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 
f96ba72f1..1057527e1 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -281,7 +281,6 @@ def run_clp( if not result.success: worker_output["error_message"] = result.error s3_write_failed = True - # Upon failure, skip updating the archive tags and job metadata. break # We've started a new archive so add the previous archive's last From f49664fcba7d046693a76b460d80173bdda83141 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:33:24 -0500 Subject: [PATCH 22/62] simplify worker config --- .../clp-package-utils/clp_package_utils/general.py | 7 ++++--- components/clp-py-utils/clp_py_utils/clp_config.py | 10 +++++----- .../executor/query/extract_stream_task.py | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 7cfa396f3..60f1053f8 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -272,12 +272,13 @@ def generate_container_config( def generate_worker_config(clp_config: CLPConfig) -> WorkerConfig: worker_config = WorkerConfig() - worker_config.archive_output = clp_config.archive_output.copy(deep=True) - worker_config.stream_output = clp_config.stream_output.copy(deep=True) worker_config.package = clp_config.package.copy(deep=True) - worker_config.results_cache = clp_config.results_cache.copy(deep=True) + worker_config.archive_output = clp_config.archive_output.copy(deep=True) worker_config.data_directory = clp_config.data_directory + worker_config.stream_output_dir = clp_config.stream_output.directory + worker_config.stream_collection_name = clp_config.results_cache.stream_collection_name + return worker_config diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 226d49d2d..cb2c07f18 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -638,16 +638,16 @@ def dump_to_primitive_dict(self): class WorkerConfig(BaseModel): package: Package = Package() archive_output: ArchiveOutput = ArchiveOutput() - data_directory: pathlib.Path = pathlib.Path("var") / "data" + data_directory: pathlib.Path = CLPConfig().data_directory - # Only needed by query worker. Consider inheriting at some point. - stream_output = StreamOutput() - results_cache: ResultsCache = ResultsCache() + # Only needed by query workers. 
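A minimal sketch, not part of the patch, of why dump_to_primitive_dict below converts pathlib.Path fields to plain strings: PyYAML's safe dumper has no representer for Path objects, so handing them to yaml.safe_dump raises a RepresenterError.

import pathlib

import yaml

raw = {"data_directory": pathlib.Path("var") / "data"}
try:
    yaml.safe_dump(raw)
except yaml.representer.RepresenterError:
    # safe_dump cannot serialize PosixPath/WindowsPath values
    pass

# Converting to a primitive string first makes the dict safe to dump
primitive = {"data_directory": str(raw["data_directory"])}
print(yaml.safe_dump(primitive))  # data_directory: var/data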
+ stream_output_dir: pathlib.Path = StreamOutput().directory + stream_collection_name: str = ResultsCache().stream_collection_name def dump_to_primitive_dict(self): d = self.dict() d["archive_output"]["storage"] = self.archive_output.storage.dump_to_primitive_dict() - d["stream_output"] = self.stream_output.dump_to_primitive_dict() # Turn paths into primitive strings d["data_directory"] = str(self.data_directory) + d["stream_output_dir"] = str(self.stream_output_dir) return d diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index a1df2d5bc..bd3c720d3 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -30,8 +30,8 @@ def make_command( ) -> Optional[List[str]]: storage_engine = worker_config.package.storage_engine archives_dir = worker_config.archive_output.get_directory() - stream_output_dir = worker_config.stream_output.directory - stream_collection_name = worker_config.results_cache.stream_collection_name + stream_output_dir = worker_config.stream_output_dir + stream_collection_name = worker_config.stream_collection_name if StorageEngine.CLP == storage_engine: logger.info("Starting IR extraction") From 046cdcb2cf0c1ca8bdc633cfbaac31b538a5fdba Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:37:27 -0500 Subject: [PATCH 23/62] polishing --- components/clp-py-utils/clp_py_utils/clp_config.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index cb2c07f18..ed607c75c 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -433,6 +433,11 @@ def get_directory(self) -> pathlib.Path: return storage_config.directory return storage_config.staging_directory + def dump_to_primitive_dict(self): + d = self.dict() + # Turn directory (pathlib.Path) into a primitive string + d["storage"] = self.storage.dump_to_primitive_dict() + return d class StreamOutput(BaseModel): directory: pathlib.Path = pathlib.Path("var") / "data" / "streams" @@ -625,7 +630,7 @@ def load_redis_credentials_from_file(self): def dump_to_primitive_dict(self): d = self.dict() - d["archive_output"]["storage"] = self.archive_output.storage.dump_to_primitive_dict() + d["archive_output"] = self.archive_output.dump_to_primitive_dict() d["stream_output"] = self.stream_output.dump_to_primitive_dict() # Turn paths into primitive strings d["input_logs_directory"] = str(self.input_logs_directory) @@ -646,7 +651,7 @@ class WorkerConfig(BaseModel): def dump_to_primitive_dict(self): d = self.dict() - d["archive_output"]["storage"] = self.archive_output.storage.dump_to_primitive_dict() + d["archive_output"] = self.archive_output.dump_to_primitive_dict() # Turn paths into primitive strings d["data_directory"] = str(self.data_directory) d["stream_output_dir"] = str(self.stream_output_dir) From 242dec2d9676a9a87002166b74d0a51220497bb7 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Sat, 14 Dec 2024 12:42:09 -0500 Subject: [PATCH 24/62] linter --- components/clp-py-utils/clp_py_utils/clp_config.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index ed607c75c..2d9be98f0 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -439,6 +439,7 @@ def dump_to_primitive_dict(self): d["storage"] = self.storage.dump_to_primitive_dict() return d + class StreamOutput(BaseModel): directory: pathlib.Path = pathlib.Path("var") / "data" / "streams" target_uncompressed_size: int = 128 * 1024 * 1024 From ed280cb2a9b7a3d89f2e491c1418a33e467939fb Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 12:33:07 -0500 Subject: [PATCH 25/62] Apply suggestions from code review Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../clp_package_utils/scripts/start_clp.py | 2 -- .../clp-py-utils/clp_py_utils/clp_config.py | 15 +++++++++------ .../executor/compress/fs_compression_task.py | 13 ++++++------- .../executor/query/extract_stream_task.py | 2 +- .../executor/query/fs_search_task.py | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index e708b45ec..5f12659d8 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -628,9 +628,7 @@ def start_compression_worker( ): celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" - compression_worker_mount = [mounts.archives_output_dir] - generic_start_worker( COMPRESSION_WORKER_COMPONENT_NAME, instance_id, diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 2d9be98f0..79d2a9588 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -334,7 +334,7 @@ def validate_key_prefix(cls, field): if field == "": raise ValueError("key_prefix is not provided") if not field.endswith("/"): - raise ValueError(r'key_prefix must end with "/"') + raise ValueError('key_prefix must end with "/"') return field @@ -345,7 +345,7 @@ class FSStorage(BaseModel): @validator("directory") def validate_directory(cls, field): if "" == field: - raise ValueError("directory can not be empty") + raise ValueError("directory cannot be empty") return field def make_config_path_absolute(self, clp_home: pathlib.Path): @@ -365,7 +365,7 @@ class S3Storage(BaseModel): @validator("staging_directory") def validate_staging_directory(cls, field): if "" == field: - raise ValueError("staging_directory can not be empty") + raise ValueError("staging_directory cannot be empty") return field @validator("s3_config") @@ -420,7 +420,7 @@ def validate_target_segment_size(cls, field): raise ValueError("target_segment_size must be greater than 0") return field - def set_directory(self, directory: pathlib.Path) -> None: + def set_directory(self, directory: pathlib.Path): storage_config = self.storage if StorageType.FS == storage_config.type: storage_config.directory = directory @@ -552,12 +552,13 @@ def validate_archive_output_config(self): and StorageEngine.CLP_S != self.package.storage_engine ): raise ValueError( - f"S3 storage is only supported with storage engine: {StorageEngine.CLP_S}" + f"archive_output.storage.type = 's3' is only supported with package.storage_engine" + f" = 
'{StorageEngine.CLP_S}'" ) try: validate_path_could_be_dir(self.archive_output.get_directory()) except ValueError as ex: - raise ValueError(f"directory of storage config is invalid: {ex}") + raise ValueError(f"archive_output.storage's directory is invalid: {ex}") def validate_stream_output_dir(self): try: @@ -653,7 +654,9 @@ class WorkerConfig(BaseModel): def dump_to_primitive_dict(self): d = self.dict() d["archive_output"] = self.archive_output.dump_to_primitive_dict() + # Turn paths into primitive strings d["data_directory"] = str(self.data_directory) d["stream_output_dir"] = str(self.stream_output_dir) + return d diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 1057527e1..d3a97223b 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -85,14 +85,14 @@ def upload_single_file_archive_to_s3( ) -> Result: archive_id = archive_stats["id"] - logger.info(f"Starting to upload archive {archive_id} to S3...") + logger.info(f"Uploading archive {archive_id} to S3...") src_file = archive_dir / archive_id result = s3_put(s3_config, src_file, archive_id) if not result.success: logger.error(f"Failed to upload archive {archive_id}: {result.error}") return result - logger.info(f"Finished uploading archive {archive_id} to S3...") + logger.info(f"Finished uploading archive {archive_id} to S3.") src_file.unlink() return Result(success=True) @@ -197,15 +197,15 @@ def run_clp( db_config_file.close() # Get s3 config - s3_config = None + s3_config: S3Config enable_s3_write = False s3_write_failed = False storage_type = worker_config.archive_output.storage.type if StorageType.S3 == storage_type: - # This should be caught by start-clp and could be redundant, but let's be safe for now. if StorageEngine.CLP == clp_storage_engine: - logger.error(f"S3 is not supported for {clp_storage_engine}") - return False, {"error_message": f"S3 is not supported for {clp_storage_engine}"} + error_msg = f"S3 storage is not supported for the {clp_storage_engine} storage engine." 
+ logger.error(error_msg) + return False, {"error_message": error_msg} s3_config = worker_config.archive_output.storage.s3_config enable_s3_write = True @@ -262,7 +262,6 @@ def run_clp( "total_uncompressed_size": 0, "total_compressed_size": 0, } - while not last_line_decoded: line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index bd3c720d3..7b88d796d 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -110,7 +110,7 @@ def extract_stream( ) if worker_config.archive_output.storage.type == StorageType.S3: - logger.error(f"Extraction is not supported for S3 storage type") + logger.error(f"Stream extraction is not supported for the S3 storage type") return report_task_failure( sql_adapter=sql_adapter, task_id=task_id, diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 6f0337b08..b4c2f269b 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -129,7 +129,7 @@ def search( ) if worker_config.archive_output.storage.type == StorageType.S3: - logger.error(f"Search is not supported for S3 storage type") + logger.error(f"Search is not supported for the S3 storage type") return report_task_failure( sql_adapter=sql_adapter, task_id=task_id, From 0788e5953d458fadf39a01ed1a1e251655f78052 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 13:40:58 -0500 Subject: [PATCH 26/62] Fix easier ones --- .../scripts/native/decompress.py | 2 +- .../scripts/native/del_archives.py | 2 +- .../clp_package_utils/scripts/start_clp.py | 14 +++--- .../clp-py-utils/clp_py_utils/clp_config.py | 47 ++++++++---------- .../clp-py-utils/clp_py_utils/s3_utils.py | 14 +++--- .../executor/compress/fs_compression_task.py | 48 ++++++++++--------- .../executor/query/extract_stream_task.py | 5 +- .../executor/query/fs_search_task.py | 5 +- .../job_orchestration/executor/utils.py | 10 ++-- .../compress/compression_scheduler.py | 4 +- 10 files changed, 74 insertions(+), 77 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py index d16cdcb6f..f1183e10e 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py @@ -207,7 +207,7 @@ def handle_extract_file_cmd( list_path = parsed_args.files_from logs_dir = clp_config.logs_directory - archives_dir = clp_config.archive_output.directory + archives_dir = clp_config.archive_output.get_directory() # Generate database config file for clp db_config_file_path = logs_dir / f".decompress-db-config-{uuid.uuid4()}.yml" diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py index 735bf299d..c489c3806 100644 --- a/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py +++ 
b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py @@ -54,7 +54,7 @@ def main(argv): return -1 database_config = clp_config.database - archives_dir = clp_config.archive_output.directory + archives_dir = clp_config.archive_output.get_directory() if not archives_dir.exists(): logger.error("`archive_output.directory` doesn't exist.") return -1 diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 5f12659d8..f4da04090 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -628,7 +628,7 @@ def start_compression_worker( ): celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" - compression_worker_mount = [mounts.archives_output_dir] + compression_worker_mounts = [mounts.archives_output_dir] generic_start_worker( COMPRESSION_WORKER_COMPONENT_NAME, instance_id, @@ -640,7 +640,7 @@ def start_compression_worker( clp_config.redis.compression_backend_database, num_cpus, mounts, - compression_worker_mount, + compression_worker_mounts, ) @@ -654,9 +654,9 @@ def start_query_worker( celery_method = "job_orchestration.executor.query" celery_route = f"{QueueName.QUERY}" - query_worker_mount = [mounts.stream_output_dir] + query_worker_mounts = [mounts.stream_output_dir] if clp_config.archive_output.storage.type == StorageType.FS: - query_worker_mount.append(mounts.archives_output_dir) + query_worker_mounts.append(mounts.archives_output_dir) generic_start_worker( QUERY_WORKER_COMPONENT_NAME, @@ -669,7 +669,7 @@ def start_query_worker( clp_config.redis.query_backend_database, num_cpus, mounts, - query_worker_mount, + query_worker_mounts, ) @@ -727,13 +727,13 @@ def generic_start_worker( f"{container_clp_config.redis.host}:{container_clp_config.redis.port}/{redis_database}" ), "-e", f"CLP_HOME={CONTAINER_CLP_HOME}", - "-e", f"WORKER_CONFIG_PATH={container_clp_config.logs_directory / container_config_filename}", + "-e", f"CLP_CONFIG_PATH={container_clp_config.logs_directory / container_config_filename}", "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", "-u", f"{os.getuid()}:{os.getgid()}", ] - # fmt: on + necessary_mounts = [ mounts.clp_home, mounts.data_dir, diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 79d2a9588..4d5ba7a47 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -310,35 +310,35 @@ class Queue(BaseModel): class S3Config(BaseModel): - region_name: str + region_code: str bucket: str key_prefix: str access_key_id: Optional[str] = None secret_access_key: Optional[str] = None - @validator("region_name") - def validate_region_name(cls, field): + @validator("region_code") + def validate_region_code(cls, field): if field == "": - raise ValueError("region_name is not provided") + raise ValueError("region_code can not be empty") return field @validator("bucket") def validate_bucket(cls, field): if field == "": - raise ValueError("bucket is not provided") + raise ValueError("bucket can not be empty") return field @validator("key_prefix") def validate_key_prefix(cls, field): if field == "": - raise ValueError("key_prefix is not provided") + raise ValueError("key_prefix can not be empty") if not field.endswith("/"): raise 
ValueError('key_prefix must end with "/"') return field -class FSStorage(BaseModel): +class FsStorage(BaseModel): type: Literal[StorageType.FS.value] = StorageType.FS.value directory: pathlib.Path = pathlib.Path("var") / "data" / "archives" @@ -348,7 +348,7 @@ def validate_directory(cls, field): raise ValueError("directory cannot be empty") return field - def make_config_path_absolute(self, clp_home: pathlib.Path): + def make_config_paths_absolute(self, clp_home: pathlib.Path): self.directory = make_config_path_absolute(clp_home, self.directory) def dump_to_primitive_dict(self): @@ -368,13 +368,7 @@ def validate_staging_directory(cls, field): raise ValueError("staging_directory cannot be empty") return field - @validator("s3_config") - def validate_s3_config(cls, field): - if None == field: - raise ValueError("s3_config must be provided") - return field - - def make_config_path_absolute(self, clp_home: pathlib.Path): + def make_config_paths_absolute(self, clp_home: pathlib.Path): self.staging_directory = make_config_path_absolute(clp_home, self.staging_directory) def dump_to_primitive_dict(self): @@ -384,18 +378,12 @@ def dump_to_primitive_dict(self): class ArchiveOutput(BaseModel): - storage: Union[FSStorage, S3Storage] = FSStorage() + storage: Union[FsStorage, S3Storage] = FsStorage() target_archive_size: int = 256 * 1024 * 1024 # 256 MB target_dictionaries_size: int = 32 * 1024 * 1024 # 32 MB target_encoded_file_size: int = 256 * 1024 * 1024 # 256 MB target_segment_size: int = 256 * 1024 * 1024 # 256 MB - @validator("storage") - def validate_storage(cls, field) -> bool: - if None == field: - raise ValueError("storage must be provided") - return field - @validator("target_archive_size") def validate_target_archive_size(cls, field): if field <= 0: @@ -422,16 +410,23 @@ def validate_target_segment_size(cls, field): def set_directory(self, directory: pathlib.Path): storage_config = self.storage - if StorageType.FS == storage_config.type: + storage_type = storage_config.type + if StorageType.FS == storage_type: storage_config.directory = directory - else: + elif StorageType.S3 == storage_type: storage_config.staging_directory = directory + else: + raise NotImplementedError(f"storage.type {storage_type} is not supported") def get_directory(self) -> pathlib.Path: storage_config = self.storage + storage_type = storage_config.type if StorageType.FS == storage_config.type: return storage_config.directory - return storage_config.staging_directory + elif StorageType.S3 == storage_type: + return storage_config.staging_directory + else: + raise NotImplementedError(f"storage.type {storage_type} is not supported") def dump_to_primitive_dict(self): d = self.dict() @@ -531,7 +526,7 @@ class CLPConfig(BaseModel): def make_config_paths_absolute(self, clp_home: pathlib.Path): self.input_logs_directory = make_config_path_absolute(clp_home, self.input_logs_directory) self.credentials_file_path = make_config_path_absolute(clp_home, self.credentials_file_path) - self.archive_output.storage.make_config_path_absolute(clp_home) + self.archive_output.storage.make_config_paths_absolute(clp_home) self.stream_output.make_config_paths_absolute(clp_home) self.data_directory = make_config_path_absolute(clp_home, self.data_directory) self.logs_directory = make_config_path_absolute(clp_home, self.logs_directory) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 8003beb8c..3271cf985 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ 
b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -17,15 +17,15 @@ def s3_put( if not src_file.is_file(): return Result(success=False, error=f"{src_file} is not a file") - s3_client_args = { - "region_name": s3_config.region_name, - "aws_access_key_id": s3_config.access_key_id, - "aws_secret_access_key": s3_config.secret_access_key, - } - config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) - my_s3_client = boto3.client("s3", **s3_client_args, config=config) + my_s3_client = boto3.client( + "s3", + region_name=s3_config.region_code, + aws_access_key_id=s3_config.access_key_id, + aws_secret_access_key=s3_config.secret_access_key, + config=config, + ) with open(src_file, "rb") as file_data: try: diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index d3a97223b..3a16c33ed 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -79,21 +79,17 @@ def update_job_metadata_and_tags(db_cursor, job_id, table_prefix, tag_ids, archi def upload_single_file_archive_to_s3( - archive_stats: Dict[str, Any], - archive_dir: pathlib.Path, + archive_id: str, + src_archive_file: pathlib.Path, s3_config: S3Config, ) -> Result: - archive_id = archive_stats["id"] - logger.info(f"Uploading archive {archive_id} to S3...") - src_file = archive_dir / archive_id - result = s3_put(s3_config, src_file, archive_id) + result = s3_put(s3_config, src_archive_file, archive_id) if not result.success: logger.error(f"Failed to upload archive {archive_id}: {result.error}") return result logger.info(f"Finished uploading archive {archive_id} to S3.") - src_file.unlink() return Result(success=True) @@ -199,7 +195,7 @@ def run_clp( # Get s3 config s3_config: S3Config enable_s3_write = False - s3_write_failed = False + storage_type = worker_config.archive_output.storage.type if StorageType.S3 == storage_type: if StorageEngine.CLP == clp_storage_engine: @@ -258,10 +254,11 @@ def run_clp( # Compute the total amount of data compressed last_archive_stats = None last_line_decoded = False - worker_output = { - "total_uncompressed_size": 0, - "total_compressed_size": 0, - } + total_uncompressed_size = 0 + total_compressed_size = 0 + + # Handle job metadata update and s3 write if enabled + s3_error_msg = None while not last_line_decoded: line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None @@ -274,18 +271,19 @@ def run_clp( None is stats or stats["id"] != last_archive_stats["id"] ): if enable_s3_write: - result = upload_single_file_archive_to_s3( - last_archive_stats, archive_output_dir, s3_config - ) + archive_id = last_archive_stats["id"] + src_archive_file = archive_output_dir / archive_id + + result = upload_single_file_archive_to_s3(archive_id, src_archive_file, s3_config) if not result.success: - worker_output["error_message"] = result.error - s3_write_failed = True + s3_error_msg = result.error break + src_archive_file.unlink() # We've started a new archive so add the previous archive's last # reported size to the total - worker_output["total_uncompressed_size"] += last_archive_stats["uncompressed_size"] - worker_output["total_compressed_size"] += last_archive_stats["size"] + total_uncompressed_size += last_archive_stats["uncompressed_size"] + total_compressed_size += last_archive_stats["size"] with 
closing(sql_adapter.create_connection(True)) as db_conn, closing( db_conn.cursor(dictionary=True) ) as db_cursor: @@ -316,8 +314,14 @@ def run_clp( # Close stderr log file stderr_log_file.close() - if s3_write_failed: - logger.error(f"Failed to upload to S3.") + worker_output = { + "total_uncompressed_size": total_uncompressed_size, + "total_compressed_size": total_compressed_size, + } + + # TODO: think about how to deal with error messages + if s3_error_msg is not None: + worker_output["error_message"] = s3_error_msg return CompressionTaskStatus.FAILED, worker_output if compression_successful: return CompressionTaskStatus.SUCCEEDED, worker_output @@ -346,7 +350,7 @@ def compress( # Load configuration try: worker_config = WorkerConfig.parse_obj( - read_yaml_config_file(pathlib.Path(os.getenv("WORKER_CONFIG_PATH"))) + read_yaml_config_file(pathlib.Path(os.getenv("CLP_CONFIG_PATH"))) ) except Exception as ex: error_msg = "Failed to load worker config" diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index 7b88d796d..58ae43450 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -13,7 +13,7 @@ report_task_failure, run_query_task, ) -from job_orchestration.executor.utils import try_load_worker_config +from job_orchestration.executor.utils import load_worker_config from job_orchestration.scheduler.job_config import ExtractIrJobConfig, ExtractJsonJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -101,7 +101,8 @@ def extract_stream( sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) # Load configuration - worker_config = try_load_worker_config(os.getenv("WORKER_CONFIG_PATH"), logger) + clp_config_path = Path(os.getenv("CLP_CONFIG_PATH")) + worker_config = load_worker_config(clp_config_path, logger) if worker_config is None: return report_task_failure( sql_adapter=sql_adapter, diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index b4c2f269b..7cf7b330f 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -13,7 +13,7 @@ report_task_failure, run_query_task, ) -from job_orchestration.executor.utils import try_load_worker_config +from job_orchestration.executor.utils import load_worker_config from job_orchestration.scheduler.job_config import SearchJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -120,7 +120,8 @@ def search( sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) # Load configuration - worker_config = try_load_worker_config(os.getenv("WORKER_CONFIG_PATH"), logger) + clp_config_path = Path(os.getenv("CLP_CONFIG_PATH")) + worker_config = load_worker_config(clp_config_path, logger) if worker_config is None: return report_task_failure( sql_adapter=sql_adapter, diff --git a/components/job-orchestration/job_orchestration/executor/utils.py b/components/job-orchestration/job_orchestration/executor/utils.py index e00e7ca21..c9a0beaac 100644 --- a/components/job-orchestration/job_orchestration/executor/utils.py +++ 
b/components/job-orchestration/job_orchestration/executor/utils.py @@ -6,16 +6,12 @@ from clp_py_utils.core import read_yaml_config_file -def try_load_worker_config( - config_path_str: str, +def load_worker_config( + config_path: Path, logger: Logger, ) -> Optional[WorkerConfig]: - if config_path_str is None: - logger.error("config_path_str can't be empty") - return None - try: - return WorkerConfig.parse_obj(read_yaml_config_file(Path(config_path_str))) + return WorkerConfig.parse_obj(read_yaml_config_file(config_path)) except Exception: logger.exception("Failed to load worker config") return None diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index 2de3e835b..bd793686b 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -59,7 +59,7 @@ def update_compression_task_metadata(db_cursor, task_id, kv): SET {", ".join(field_set_expressions)} WHERE id = %s """ - values = [v for v in kv.values()] + [task_id] + values = list(kv.values()) + [task_id] db_cursor.execute(query, values) @@ -74,7 +74,7 @@ def update_compression_job_metadata(db_cursor, job_id, kv): SET {", ".join(field_set_expressions)} WHERE id = %s """ - values = [v for v in kv.values()] + [job_id] + values = list(kv.values()) + [job_id] db_cursor.execute(query, values) From c198f27899b0deb73993a58548d959567b759284 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 15:49:00 -0500 Subject: [PATCH 27/62] Backup changes --- .../clp-py-utils/clp_py_utils/result.py | 10 --------- .../clp-py-utils/clp_py_utils/s3_utils.py | 14 ++++++------ components/clp-py-utils/pyproject.toml | 2 ++ .../executor/compress/fs_compression_task.py | 22 +++++++++++-------- components/job-orchestration/pyproject.toml | 1 - 5 files changed, 22 insertions(+), 27 deletions(-) delete mode 100644 components/clp-py-utils/clp_py_utils/result.py diff --git a/components/clp-py-utils/clp_py_utils/result.py b/components/clp-py-utils/clp_py_utils/result.py deleted file mode 100644 index 9c72cebf9..000000000 --- a/components/clp-py-utils/clp_py_utils/result.py +++ /dev/null @@ -1,10 +0,0 @@ -from typing import Optional - - -class Result: - def __init__(self, success: bool, error: Optional[str] = None): - self.success = success - self.error = error - - def __repr__(self): - return f"Result(success={self.success}, error={self.error})" diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 3271cf985..ed77d9a4a 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -5,17 +5,17 @@ from botocore.exceptions import ClientError from clp_py_utils.clp_config import S3Config -from clp_py_utils.result import Result +from result import Ok, Err, Result def s3_put( s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 -) -> Result: +) -> Result[bool, str]: if not src_file.exists(): - return Result(success=False, error=f"{src_file} doesn't exist") + return Err(f"{src_file} doesn't exist") if not src_file.is_file(): - return Result(success=False, error=f"{src_file} is not a file") + return Err(f"{src_file} is not a file") config = 
Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) @@ -35,8 +35,8 @@ def s3_put( except ClientError as e: error_code = e.response["Error"]["Code"] error_message = e.response["Error"]["Message"] - return Result(success=False, error=f"ClientError: {error_code} - {error_message}") + return Err(f"ClientError: {error_code} - {error_message}") except Exception as e: - return Result(success=False, error=f"An unexpected error occurred: {e}") + return Err(f"An unexpected error occurred: {e}") - return Result(success=True) + return Ok(True) diff --git a/components/clp-py-utils/pyproject.toml b/components/clp-py-utils/pyproject.toml index 4e827b926..34a97e7bb 100644 --- a/components/clp-py-utils/pyproject.toml +++ b/components/clp-py-utils/pyproject.toml @@ -10,6 +10,7 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.8 || ^3.10" +boto3 = "^1.35.81" # mariadb version must be compatible with libmariadev installed in runtime env. # See https://mariadb.com/docs/server/connect/programming-languages/python/install/#Dependencies mariadb = "~1.0.11" @@ -19,6 +20,7 @@ python-dotenv = "^1.0.1" python-Levenshtein = "~0.22" sqlalchemy = "~2.0" PyYAML = "^6.0.1" +result = "~0.17.0" StrEnum = "^0.4.15" [build-system] diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 3a16c33ed..7c18ea287 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -2,6 +2,7 @@ import json import os import pathlib +from result import Result, is_err import subprocess from contextlib import closing from typing import Any, Dict, Optional @@ -20,7 +21,6 @@ ) from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.core import read_yaml_config_file -from clp_py_utils.result import Result from clp_py_utils.s3_utils import s3_put from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.celery import app @@ -82,15 +82,15 @@ def upload_single_file_archive_to_s3( archive_id: str, src_archive_file: pathlib.Path, s3_config: S3Config, -) -> Result: +) -> Result[bool, str]: logger.info(f"Uploading archive {archive_id} to S3...") result = s3_put(s3_config, src_archive_file, archive_id) - if not result.success: - logger.error(f"Failed to upload archive {archive_id}: {result.error}") + if result.is_err(): + logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") return result logger.info(f"Finished uploading archive {archive_id} to S3.") - return Result(success=True) + return result def make_clp_command( @@ -257,8 +257,11 @@ def run_clp( total_uncompressed_size = 0 total_compressed_size = 0 - # Handle job metadata update and s3 write if enabled + # Variables to track s3 write status. 
s3_error_msg = None + s3_write_failed = False + + # Handle job metadata update and s3 write if enabled while not last_line_decoded: line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None @@ -275,8 +278,9 @@ def run_clp( src_archive_file = archive_output_dir / archive_id result = upload_single_file_archive_to_s3(archive_id, src_archive_file, s3_config) - if not result.success: - s3_error_msg = result.error + if result.is_err(): + s3_write_failed = False + s3_error_msg = result.err_value break src_archive_file.unlink() @@ -320,7 +324,7 @@ def run_clp( } # TODO: think about how to deal with error messages - if s3_error_msg is not None: + if s3_write_failed: worker_output["error_message"] = s3_error_msg return CompressionTaskStatus.FAILED, worker_output if compression_successful: diff --git a/components/job-orchestration/pyproject.toml b/components/job-orchestration/pyproject.toml index 2db22cab0..c89d84dec 100644 --- a/components/job-orchestration/pyproject.toml +++ b/components/job-orchestration/pyproject.toml @@ -10,7 +10,6 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.8 || ^3.10" -boto3 = "^1.35.76" Brotli = "^1.1.0" celery = "^5.3.6" # mariadb version must be compatible with libmariadev installed in runtime env. From 4819f7666ad037b7ae0606a524653ac03f92d70f Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 16:55:43 -0500 Subject: [PATCH 28/62] Small fixes --- .../clp-package-utils/clp_package_utils/scripts/start_clp.py | 1 - .../executor/compress/fs_compression_task.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index f4da04090..6de3174ff 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -715,7 +715,6 @@ def generic_start_worker( "-w", str(CONTAINER_CLP_HOME), "--name", container_name, "--log-driver", "local", - "-u", f"{os.getuid()}:{os.getgid()}", "-e", f"PYTHONPATH={clp_site_packages_dir}", "-e", ( f"BROKER_URL=amqp://" diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 7c18ea287..0678d05ea 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -2,7 +2,6 @@ import json import os import pathlib -from result import Result, is_err import subprocess from contextlib import closing from typing import Any, Dict, Optional @@ -27,6 +26,7 @@ from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress from job_orchestration.scheduler.scheduler_data import CompressionTaskResult +from result import is_err, Result # Setup logging logger = get_task_logger(__name__) @@ -279,7 +279,7 @@ def run_clp( result = upload_single_file_archive_to_s3(archive_id, src_archive_file, s3_config) if result.is_err(): - s3_write_failed = False + s3_write_failed = True s3_error_msg = result.err_value break src_archive_file.unlink() From e5f43fbf58ca5a42513287ce0988d0d318654a1d Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:04:39 
-0500 Subject: [PATCH 29/62] fixes --- .../clp-py-utils/clp_py_utils/s3_utils.py | 2 +- .../executor/compress/fs_compression_task.py | 34 ++++++------------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index ed77d9a4a..cb98d24ab 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -3,9 +3,9 @@ import boto3 from botocore.config import Config from botocore.exceptions import ClientError +from result import Err, Ok, Result from clp_py_utils.clp_config import S3Config -from result import Ok, Err, Result def s3_put( diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 0678d05ea..f2b4a2268 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -78,21 +78,6 @@ def update_job_metadata_and_tags(db_cursor, job_id, table_prefix, tag_ids, archi ) -def upload_single_file_archive_to_s3( - archive_id: str, - src_archive_file: pathlib.Path, - s3_config: S3Config, -) -> Result[bool, str]: - logger.info(f"Uploading archive {archive_id} to S3...") - result = s3_put(s3_config, src_archive_file, archive_id) - if result.is_err(): - logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") - return result - - logger.info(f"Finished uploading archive {archive_id} to S3.") - return result - - def make_clp_command( clp_home: pathlib.Path, archive_output_dir: pathlib.Path, @@ -257,11 +242,8 @@ def run_clp( total_uncompressed_size = 0 total_compressed_size = 0 - # Variables to track s3 write status. 
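
A minimal sketch, separate from the patch itself, of how a caller consumes the rust-style `result` values that s3_put now returns (using the `result` package added to clp-py-utils above); the helper name upload_archive and the sample path are illustrative only:

    from pathlib import Path

    from result import Err, Ok, Result


    def upload_archive(src_file: Path) -> Result[bool, str]:
        # Mirrors s3_put's contract: Err carries an error message, Ok(True) signals success.
        if not src_file.is_file():
            return Err(f"{src_file} is not a file")
        return Ok(True)


    res = upload_archive(Path("/tmp/archive-1234"))
    if res.is_err():
        print(f"Upload failed: {res.err_value}")
    else:
        print("Upload succeeded")
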
- s3_error_msg = None - s3_write_failed = False - # Handle job metadata update and s3 write if enabled + s3_error = None while not last_line_decoded: line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None @@ -277,11 +259,15 @@ def run_clp( archive_id = last_archive_stats["id"] src_archive_file = archive_output_dir / archive_id - result = upload_single_file_archive_to_s3(archive_id, src_archive_file, s3_config) + logger.info(f"Uploading archive {archive_id} to S3...") + result = s3_put(s3_config, src_archive_file, archive_id) + if result.is_err(): - s3_write_failed = True - s3_error_msg = result.err_value + logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") + s3_error = result.err_value break + + logger.info(f"Finished uploading archive {archive_id} to S3.") src_archive_file.unlink() # We've started a new archive so add the previous archive's last @@ -324,8 +310,8 @@ def run_clp( } # TODO: think about how to deal with error messages - if s3_write_failed: - worker_output["error_message"] = s3_error_msg + if s3_error is not None: + worker_output["error_message"] = s3_error return CompressionTaskStatus.FAILED, worker_output if compression_successful: return CompressionTaskStatus.SUCCEEDED, worker_output From 12460629816c4ce070c430bfb6759b10bdfa9251 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 20:08:34 -0500 Subject: [PATCH 30/62] add safeguard for archive update failure --- .../executor/compress/fs_compression_task.py | 65 ++++++++++--------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index f2b4a2268..4fc949ab9 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -259,32 +259,37 @@ def run_clp( archive_id = last_archive_stats["id"] src_archive_file = archive_output_dir / archive_id - logger.info(f"Uploading archive {archive_id} to S3...") - result = s3_put(s3_config, src_archive_file, archive_id) + if s3_error is None: + logger.info(f"Uploading archive {archive_id} to S3...") + result = s3_put(s3_config, src_archive_file, archive_id) + + if result.is_err(): + logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") + s3_error = result.err_value + # Note: it's possible proc finishes before we call terminate() on it. In + # Which case the process will still return success. 
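
A standalone sketch, not part of the patch, of the race the note above describes: if the child exits on its own before terminate() is called, its return code can still be 0, so the upload failure has to be tracked separately rather than inferred from the exit status. The trivial child command is only a stand-in for the compressor:

    import subprocess
    import sys

    # Stand-in for the compressor: a child process that exits almost immediately.
    proc = subprocess.Popen([sys.executable, "-c", "pass"], stdout=subprocess.PIPE)

    upload_failed = True  # pretend the S3 upload of the latest archive just failed
    if upload_failed:
        # terminate() is safe to call even if the child has already exited on its own.
        proc.terminate()

    proc.wait()
    # returncode may still be 0 here, so overall success must also consider upload_failed.
    job_succeeded = (0 == proc.returncode) and not upload_failed
    print(job_succeeded)
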
+ proc.terminate() + else: + logger.info(f"Finished uploading archive {archive_id} to S3.") - if result.is_err(): - logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") - s3_error = result.err_value - break - - logger.info(f"Finished uploading archive {archive_id} to S3.") src_archive_file.unlink() - # We've started a new archive so add the previous archive's last - # reported size to the total - total_uncompressed_size += last_archive_stats["uncompressed_size"] - total_compressed_size += last_archive_stats["size"] - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - update_job_metadata_and_tags( - db_cursor, - job_id, - clp_metadata_db_connection_config["table_prefix"], - tag_ids, - last_archive_stats, - ) - db_conn.commit() + if s3_error is None: + # We've started a new archive so add the previous archive's last + # reported size to the total + total_uncompressed_size += last_archive_stats["uncompressed_size"] + total_compressed_size += last_archive_stats["size"] + with closing(sql_adapter.create_connection(True)) as db_conn, closing( + db_conn.cursor(dictionary=True) + ) as db_cursor: + update_job_metadata_and_tags( + db_cursor, + job_id, + clp_metadata_db_connection_config["table_prefix"], + tag_ids, + last_archive_stats, + ) + db_conn.commit() last_archive_stats = stats @@ -309,14 +314,16 @@ def run_clp( "total_compressed_size": total_compressed_size, } - # TODO: think about how to deal with error messages - if s3_error is not None: - worker_output["error_message"] = s3_error - return CompressionTaskStatus.FAILED, worker_output - if compression_successful: + if compression_successful and s3_error is None: return CompressionTaskStatus.SUCCEEDED, worker_output else: - worker_output["error_message"] = f"See logs {stderr_log_path}" + worker_output["error_message"] = "" + if compression_successful is False: + worker_output["error_message"] += f"See logs {stderr_log_path}" + if s3_error is not None: + if worker_output["error_message"]: + worker_output["error_message"] += "\n" + worker_output["error_message"] += s3_error return CompressionTaskStatus.FAILED, worker_output From 3b870a44438a191b02dbe5bb832a51d67a430718 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Mon, 16 Dec 2024 20:24:32 -0500 Subject: [PATCH 31/62] Add docstrings --- components/clp-py-utils/clp_py_utils/s3_utils.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index cb98d24ab..ddf386268 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -11,11 +11,21 @@ def s3_put( s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 ) -> Result[bool, str]: - + """ + Upload a local file to S3 bucket using AWS's PutObject operation. + :param s3_config: S3 configuration specifies the upload destination and credentials. + :param src_file: local file to upload. + :param dest_file_name: The name for uploaded file in the S3 bucket. + :param total_max_attempts: Maximum number of retry attempts for the upload. Defaults to 3. + :return: Result.OK(bool) on success. + Result.Err(str) with the error message if put operation fails. 
+ """ if not src_file.exists(): return Err(f"{src_file} doesn't exist") if not src_file.is_file(): return Err(f"{src_file} is not a file") + if src_file.stat().st_size > 5 * 1024 * 1024 * 1024: + return Err("File size exceeds 5GB limit for single put_object operation") config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) From 214ae3f94523f5e4cfc842c3308c140053afc4fd Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:06:57 -0500 Subject: [PATCH 32/62] Apply suggestions from code review Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- components/clp-py-utils/clp_py_utils/s3_utils.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index ddf386268..03717a445 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -12,20 +12,19 @@ def s3_put( s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 ) -> Result[bool, str]: """ - Upload a local file to S3 bucket using AWS's PutObject operation. - :param s3_config: S3 configuration specifies the upload destination and credentials. - :param src_file: local file to upload. - :param dest_file_name: The name for uploaded file in the S3 bucket. - :param total_max_attempts: Maximum number of retry attempts for the upload. Defaults to 3. - :return: Result.OK(bool) on success. - Result.Err(str) with the error message if put operation fails. + Uploads a local file to an S3 bucket using AWS's PutObject operation. + :param s3_config: S3 configuration specifying the upload destination and credentials. + :param src_file: Local file to upload. + :param dest_file_name: The name for the uploaded file in the S3 bucket. + :param total_max_attempts: Maximum number of retry attempts for the upload. + :return: Result.OK(bool) on success, or Result.Err(str) with the error message otherwise. 
""" if not src_file.exists(): return Err(f"{src_file} doesn't exist") if not src_file.is_file(): return Err(f"{src_file} is not a file") if src_file.stat().st_size > 5 * 1024 * 1024 * 1024: - return Err("File size exceeds 5GB limit for single put_object operation") + return Err(f"{src_file} is larger than the limit (5GiB) for a single PutObject operation.") config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) From 6ff92fc312476a0a12f2e48b1a7e3449aaca7a4e Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:07:34 -0500 Subject: [PATCH 33/62] Clean up --- components/clp-py-utils/clp_py_utils/clp_config.py | 8 ++++---- .../executor/compress/fs_compression_task.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 4d5ba7a47..f59de7647 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -320,19 +320,19 @@ class S3Config(BaseModel): @validator("region_code") def validate_region_code(cls, field): if field == "": - raise ValueError("region_code can not be empty") + raise ValueError("region_code cannot be empty") return field @validator("bucket") def validate_bucket(cls, field): if field == "": - raise ValueError("bucket can not be empty") + raise ValueError("bucket cannot be empty") return field @validator("key_prefix") def validate_key_prefix(cls, field): if field == "": - raise ValueError("key_prefix can not be empty") + raise ValueError("key_prefix cannot be empty") if not field.endswith("/"): raise ValueError('key_prefix must end with "/"') return field @@ -442,7 +442,7 @@ class StreamOutput(BaseModel): @validator("directory") def validate_directory(cls, field): if "" == field: - raise ValueError("directory can not be empty") + raise ValueError("directory cannot be empty") return field @validator("target_uncompressed_size") diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 4fc949ab9..57b3ca3d5 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -26,7 +26,6 @@ from job_orchestration.scheduler.constants import CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress from job_orchestration.scheduler.scheduler_data import CompressionTaskResult -from result import is_err, Result # Setup logging logger = get_task_logger(__name__) From 9e07d37e355551f0c20596442217e7cc7590faba Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:08:06 -0500 Subject: [PATCH 34/62] update pyproject.toml --- components/clp-py-utils/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/clp-py-utils/pyproject.toml b/components/clp-py-utils/pyproject.toml index 34a97e7bb..6d68ceebe 100644 --- a/components/clp-py-utils/pyproject.toml +++ b/components/clp-py-utils/pyproject.toml @@ -20,7 +20,7 @@ python-dotenv = "^1.0.1" python-Levenshtein = "~0.22" sqlalchemy = "~2.0" PyYAML = "^6.0.1" -result = "~0.17.0" +result = "^0.17.0" StrEnum = "^0.4.15" [build-system] From 915b49d02610ea94a76e62633f2b7564e2622a29 Mon Sep 17 
00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:21:27 -0500 Subject: [PATCH 35/62] Add docstrings --- .../job-orchestration/job_orchestration/executor/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/components/job-orchestration/job_orchestration/executor/utils.py b/components/job-orchestration/job_orchestration/executor/utils.py index c9a0beaac..e558cc704 100644 --- a/components/job-orchestration/job_orchestration/executor/utils.py +++ b/components/job-orchestration/job_orchestration/executor/utils.py @@ -10,6 +10,12 @@ def load_worker_config( config_path: Path, logger: Logger, ) -> Optional[WorkerConfig]: + """ + Loads a WorkerConfig object from the specified configuration file. + :param config_path: Path to the configuration file. + :param logger: Logger instance for reporting error if loading fails. + :return: The loaded WorkerConfig object on success, None otherwise. + """ try: return WorkerConfig.parse_obj(read_yaml_config_file(config_path)) except Exception: From a061a29abbfc7958c326e2d9df1c15564ebb3a5f Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:20:13 -0500 Subject: [PATCH 36/62] Apply suggestions from code review Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../executor/compress/fs_compression_task.py | 18 ++++++++---------- .../job_orchestration/executor/utils.py | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 57b3ca3d5..4e0f70d19 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -179,7 +179,6 @@ def run_clp( # Get s3 config s3_config: S3Config enable_s3_write = False - storage_type = worker_config.archive_output.storage.type if StorageType.S3 == storage_type: if StorageEngine.CLP == clp_storage_engine: @@ -265,8 +264,8 @@ def run_clp( if result.is_err(): logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") s3_error = result.err_value - # Note: it's possible proc finishes before we call terminate() on it. In - # Which case the process will still return success. + # NOTE: It's possible `proc` finishes before we call `terminate` on it, in + # which case the process will still return success. 
proc.terminate() else: logger.info(f"Finished uploading archive {archive_id} to S3.") @@ -274,8 +273,8 @@ def run_clp( src_archive_file.unlink() if s3_error is None: - # We've started a new archive so add the previous archive's last - # reported size to the total + # We've started a new archive so add the previous archive's last reported size to + # the total total_uncompressed_size += last_archive_stats["uncompressed_size"] total_compressed_size += last_archive_stats["size"] with closing(sql_adapter.create_connection(True)) as db_conn, closing( @@ -316,13 +315,12 @@ def run_clp( if compression_successful and s3_error is None: return CompressionTaskStatus.SUCCEEDED, worker_output else: - worker_output["error_message"] = "" + error_msgs = [] if compression_successful is False: - worker_output["error_message"] += f"See logs {stderr_log_path}" + error_msgs.append(f"See logs {stderr_log_path}") if s3_error is not None: - if worker_output["error_message"]: - worker_output["error_message"] += "\n" - worker_output["error_message"] += s3_error + error_msgs.append(s3_error) + worker_output["error_message"] = "\n".join(error_msgs) return CompressionTaskStatus.FAILED, worker_output diff --git a/components/job-orchestration/job_orchestration/executor/utils.py b/components/job-orchestration/job_orchestration/executor/utils.py index e558cc704..47ea702ae 100644 --- a/components/job-orchestration/job_orchestration/executor/utils.py +++ b/components/job-orchestration/job_orchestration/executor/utils.py @@ -13,7 +13,7 @@ def load_worker_config( """ Loads a WorkerConfig object from the specified configuration file. :param config_path: Path to the configuration file. - :param logger: Logger instance for reporting error if loading fails. + :param logger: Logger instance for reporting errors if loading fails. :return: The loaded WorkerConfig object on success, None otherwise. 
""" try: From 83017480290c676a319eb5112389c9c2996ea28b Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:24:11 -0500 Subject: [PATCH 37/62] Update name as suggested by the code review --- .../executor/compress/fs_compression_task.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 4e0f70d19..d140d3777 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -255,11 +255,11 @@ def run_clp( ): if enable_s3_write: archive_id = last_archive_stats["id"] - src_archive_file = archive_output_dir / archive_id + archive_path = archive_output_dir / archive_id if s3_error is None: logger.info(f"Uploading archive {archive_id} to S3...") - result = s3_put(s3_config, src_archive_file, archive_id) + result = s3_put(s3_config, archive_path, archive_id) if result.is_err(): logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") @@ -270,7 +270,7 @@ def run_clp( else: logger.info(f"Finished uploading archive {archive_id} to S3.") - src_archive_file.unlink() + archive_path.unlink() if s3_error is None: # We've started a new archive so add the previous archive's last reported size to From 2ada4646f7db277d3adab585f687a394c82e62c8 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:54:55 -0500 Subject: [PATCH 38/62] a few small fixes to ensure other scripts still work --- .../clp_package_utils/scripts/decompress.py | 12 +++++++++++- .../clp_package_utils/scripts/del_archives.py | 7 +++++++ .../clp_package_utils/scripts/native/decompress.py | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/decompress.py index 325f2add6..9fdbeee34 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/decompress.py @@ -5,7 +5,7 @@ import sys from typing import Optional -from clp_py_utils.clp_config import CLPConfig +from clp_py_utils.clp_config import CLPConfig, StorageType from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -81,6 +81,11 @@ def handle_extract_file_cmd( if clp_config is None: return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"File extraction is not supported for storage type: {storage_type}.") + return -1 + container_name = generate_container_name(str(JobType.FILE_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( @@ -156,6 +161,11 @@ def handle_extract_stream_cmd( if clp_config is None: return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"Stream extraction is not supported for storage type: {storage_type}.") + return -1 + container_name = generate_container_name(str(JobType.IR_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, 
generated_config_path_on_host = dump_container_config( diff --git a/components/clp-package-utils/clp_package_utils/scripts/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py index 54d959771..5b9bc6d97 100644 --- a/components/clp-package-utils/clp_package_utils/scripts/del_archives.py +++ b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py @@ -4,6 +4,8 @@ import sys from pathlib import Path +from clp_py_utils.clp_config import StorageType + from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, dump_container_config, @@ -57,6 +59,11 @@ def main(argv): logger.exception("Failed to load config.") return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"Archive deletion is not supported for storage type: {storage_type}.") + return -1 + # Validate the input timestamp begin_ts = parsed_args.begin_ts end_ts = parsed_args.end_ts diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py index f1183e10e..7e3c7da6e 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py @@ -167,7 +167,7 @@ def validate_and_load_config_file( """ try: clp_config = load_config_file(config_file_path, default_config_file_path, clp_home) - clp_config.validate_archive_output_dir() + clp_config.validate_archive_output_config() clp_config.validate_logs_dir() return clp_config except Exception: From 6e5aad5ed4164528fb3e6862539b294fdc21471a Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 12:52:11 -0500 Subject: [PATCH 39/62] adding safeguard for empty stdout line from clp. 
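
The hunk below skips blank stdout lines before JSON-decoding them. A self-contained sketch of the intended loop behaviour, assuming the stats pipe is binary as elsewhere in this task, so readline() yields bytes, returns b"" only at EOF, and returns b"\n" for a blank line (guards therefore need to compare against bytes rather than the str literal ""); BytesIO stands in for proc.stdout:

    import io
    import json

    # Stand-in for proc.stdout: two archive-stats records separated by a blank line.
    stdout = io.BytesIO(b'{"id": "a", "size": 1}\n\n{"id": "b", "size": 2}\n')

    while True:
        line = stdout.readline()
        if b"" == line:
            break  # EOF: readline() on a binary stream returns empty bytes
        if b"" == line.strip():
            continue  # blank line: skip it instead of JSON-decoding it
        stats = json.loads(line.decode("ascii"))
        print(stats["id"])
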
--- .../executor/compress/fs_compression_task.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index d140d3777..fdd05263b 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -245,7 +245,11 @@ def run_clp( while not last_line_decoded: line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None - if line: + if "" == line: + # Skip empty lines that could be caused by potential errors in printing archive stats + continue + + if line is not None: stats = json.loads(line.decode("ascii")) else: last_line_decoded = True From 55c0f36a0ba0d794234016bfbc04a51060388696 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 14:23:54 -0500 Subject: [PATCH 40/62] add safe guard for search --- .../clp-package-utils/clp_package_utils/scripts/search.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/components/clp-package-utils/clp_package_utils/scripts/search.py b/components/clp-package-utils/clp_package_utils/scripts/search.py index beb7fb0b0..af3693b16 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/search.py @@ -7,6 +7,7 @@ import uuid import yaml +from clp_py_utils.clp_config import StorageType from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -74,6 +75,11 @@ def main(argv): logger.exception("Failed to load config.") return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"Search is not supported for storage type: {storage_type}.") + return -1 + container_name = generate_container_name(str(JobType.SEARCH)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) From 2d7443e72ce0e29c110813a62f14ddc6f47d8cda Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 14:36:42 -0500 Subject: [PATCH 41/62] Polish error messages. --- .../clp_package_utils/scripts/decompress.py | 6 ++++-- .../clp-package-utils/clp_package_utils/scripts/search.py | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/decompress.py index 9fdbeee34..092c339a6 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/decompress.py @@ -83,7 +83,7 @@ def handle_extract_file_cmd( storage_type = clp_config.archive_output.storage.type if StorageType.FS != storage_type: - logger.error(f"File extraction is not supported for storage type: {storage_type}.") + logger.error(f"File extraction is not supported for archive storage type: {storage_type}.") return -1 container_name = generate_container_name(str(JobType.FILE_EXTRACTION)) @@ -163,7 +163,9 @@ def handle_extract_stream_cmd( storage_type = clp_config.archive_output.storage.type if StorageType.FS != storage_type: - logger.error(f"Stream extraction is not supported for storage type: {storage_type}.") + logger.error( + f"Stream extraction is not supported for archive storage type: {storage_type}." 
+ ) return -1 container_name = generate_container_name(str(JobType.IR_EXTRACTION)) diff --git a/components/clp-package-utils/clp_package_utils/scripts/search.py b/components/clp-package-utils/clp_package_utils/scripts/search.py index af3693b16..ee50ecf02 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/search.py @@ -77,7 +77,9 @@ def main(argv): storage_type = clp_config.archive_output.storage.type if StorageType.FS != storage_type: - logger.error(f"Search is not supported for storage type: {storage_type}.") + logger.error( + f"Search is not supported for archive storage type: {storage_type}." + ) return -1 container_name = generate_container_name(str(JobType.SEARCH)) From 6f907b22a388de32a5a56b195a86c67aacada355 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 14:51:30 -0500 Subject: [PATCH 42/62] Linter --- .../clp-package-utils/clp_package_utils/scripts/search.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/search.py b/components/clp-package-utils/clp_package_utils/scripts/search.py index ee50ecf02..38d528528 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/search.py @@ -77,9 +77,7 @@ def main(argv): storage_type = clp_config.archive_output.storage.type if StorageType.FS != storage_type: - logger.error( - f"Search is not supported for archive storage type: {storage_type}." - ) + logger.error(f"Search is not supported for archive storage type: {storage_type}.") return -1 container_name = generate_container_name(str(JobType.SEARCH)) From 120ffec2ea3f5611b322521b1b092953ea46f732 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 15:12:16 -0500 Subject: [PATCH 43/62] Slighlty improve the error message --- .../executor/compress/fs_compression_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index fdd05263b..a5dbc0e35 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -181,8 +181,8 @@ def run_clp( enable_s3_write = False storage_type = worker_config.archive_output.storage.type if StorageType.S3 == storage_type: - if StorageEngine.CLP == clp_storage_engine: - error_msg = f"S3 storage is not supported for the {clp_storage_engine} storage engine." + if StorageEngine.CLP_S != clp_storage_engine: + error_msg = f"S3 storage is not supported for storage engine: {clp_storage_engine}." 
logger.error(error_msg) return False, {"error_message": error_msg} From d5eae21b1b2b6633d440c3dda689b5ef18a31421 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:58:28 -0500 Subject: [PATCH 44/62] Back up --- .../scripts/native/compress.py | 4 +- .../clp-py-utils/clp_py_utils/s3_utils.py | 35 +++++ .../executor/compress/fs_compression_task.py | 21 ++- .../compress/compression_scheduler.py | 124 +++++++++++++----- .../job_orchestration/scheduler/job_config.py | 17 ++- components/job-orchestration/pyproject.toml | 1 + 6 files changed, 165 insertions(+), 37 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py index b6d9bb7eb..88ccc8aba 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py @@ -15,7 +15,7 @@ CompressionJobCompletionStatus, CompressionJobStatus, ) -from job_orchestration.scheduler.job_config import ClpIoConfig, InputConfig, OutputConfig +from job_orchestration.scheduler.job_config import ClpIoConfig, FsInputConfig, OutputConfig, S3InputConfig from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -194,7 +194,7 @@ def main(argv): paths_to_compress.append(resolved_path_str) mysql_adapter = SQL_Adapter(clp_config.database) - clp_input_config = InputConfig( + clp_input_config = FsInputConfig( paths_to_compress=paths_to_compress, timestamp_key=parsed_args.timestamp_key ) if parsed_args.remove_path_prefix: diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 03717a445..5c5880df0 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -5,8 +5,43 @@ from botocore.exceptions import ClientError from result import Err, Ok, Result +from typing import List + from clp_py_utils.clp_config import S3Config +from job_orchestration.scheduler.job_config import S3InputConfig +from clp_py_utils.compression import FileMetadata + +def verify_s3_ingestion_config(s3_input_config: S3InputConfig) -> Result[boto3.client, str]: + # Consider actually returning the S3 client? + s3_client = boto3.client( + "s3", + region_name=s3_input_config.region_code, + aws_access_key_id=s3_input_config.aws_access_key_id, + aws_secret_access_key=s3_input_config.aws_secret_access_key, + ) + return Ok(s3_client) + +def get_file_metadata_with_bucket_prefix( + s3_client: boto3.client, + s3_config: S3InputConfig +) -> List[FileMetadata]: + file_metadata_list: List[FileMetadata] = list() + paginator = s3_client.get_paginator("list_objects_v2") + pages = paginator.paginate(Bucket=s3_config.bucket, Prefix=s3_config.key_prefix) + + for page in pages: + contents = page.get("Contents", None) + if contents: + for obj in contents: + # we skip empty directory in s3. 
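                # Note (annotation, not part of the patch): S3 has no real directories.
                # "Folders" created in the S3 console are zero-byte placeholder objects
                # whose keys end with "/" (e.g. "my-prefix/2024/"). They contain no log
                # data, which is why keys ending with "/" are skipped below before any
                # FileMetadata is built for them.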
+ object_key = obj["Key"] + if object_key.endswith("/"): + continue + # Note, don't resolve this path + # s3_path = Path(bucket_name + "/" + object_key) + file_metadata_list.append(FileMetadata(Path(object_key), obj["Size"])) + return file_metadata_list def s3_put( s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index a5dbc0e35..abc34e901 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -176,6 +176,20 @@ def run_clp( yaml.safe_dump(clp_metadata_db_connection_config, db_config_file) db_config_file.close() + # Get input config + input_config = clp_config.input + input_type = input_config.type + if "s3" == input_type: + logger.info(paths_to_compress.file_paths) + logger.error("Unsupported for now") + worker_output = { + "total_uncompressed_size": 0, + "total_compressed_size": 0, + "error_message": "unsupported" + } + return CompressionTaskStatus.FAILED, worker_output + + # Get s3 config s3_config: S3Config enable_s3_write = False @@ -209,8 +223,9 @@ def run_clp( return False, {"error_message": f"Unsupported storage engine {clp_storage_engine}"} # Prepare list of paths to compress for clp - file_paths = paths_to_compress.file_paths log_list_path = data_dir / f"{instance_id_str}-log-paths.txt" + file_paths = paths_to_compress.file_paths + with open(log_list_path, "w") as file: if len(file_paths) > 0: for path_str in file_paths: @@ -222,8 +237,8 @@ def run_clp( file.write(path_str) file.write("\n") - compression_cmd.append("--files-from") - compression_cmd.append(str(log_list_path)) + compression_cmd.append("--files-from") + compression_cmd.append(str(log_list_path)) # Open stderr log file stderr_log_path = logs_dir / f"{instance_id_str}-stderr.log" diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index bd793686b..cef0bc932 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -7,6 +7,7 @@ from contextlib import closing from pathlib import Path +import boto3 import brotli import celery import msgpack @@ -21,10 +22,11 @@ from clp_py_utils.compression import validate_path_and_get_info from clp_py_utils.core import read_yaml_config_file from clp_py_utils.sql_adapter import SQL_Adapter +from clp_py_utils.s3_utils import verify_s3_ingestion_config, get_file_metadata_with_bucket_prefix from job_orchestration.executor.compress.fs_compression_task import compress from job_orchestration.scheduler.compress.partition import PathsToCompressBuffer from job_orchestration.scheduler.constants import CompressionJobStatus, CompressionTaskStatus -from job_orchestration.scheduler.job_config import ClpIoConfig +from job_orchestration.scheduler.job_config import ClpIoConfig, FsInputConfig, S3InputConfig from job_orchestration.scheduler.scheduler_data import ( CompressionJob, CompressionTaskResult, @@ -78,6 +80,59 @@ def update_compression_job_metadata(db_cursor, job_id, kv): db_cursor.execute(query, values) +def process_fs_input_paths( + fs_input_conf: 
FsInputConfig, + paths_to_compress_buffer: PathsToCompressBuffer +): + for path_idx, path in enumerate(fs_input_conf.paths_to_compress, start=1): + path = Path(path) + + try: + file, empty_directory = validate_path_and_get_info( + CONTAINER_INPUT_LOGS_ROOT_DIR, path + ) + except ValueError as ex: + logger.error(str(ex)) + continue + + if file: + paths_to_compress_buffer.add_file(file) + elif empty_directory: + paths_to_compress_buffer.add_empty_directory(empty_directory) + + if path.is_dir(): + for internal_path in path.rglob("*"): + try: + file, empty_directory = validate_path_and_get_info( + CONTAINER_INPUT_LOGS_ROOT_DIR, internal_path + ) + except ValueError as ex: + logger.error(str(ex)) + continue + + if file: + paths_to_compress_buffer.add_file(file) + elif empty_directory: + paths_to_compress_buffer.add_empty_directory(empty_directory) + + +def process_s3_input( + s3_client: boto3.client, + input_config: S3InputConfig, + paths_to_compress_buffer: PathsToCompressBuffer, +): + try: + file_metadata_list = get_file_metadata_with_bucket_prefix(s3_client, input_config) + except Exception: + logger.exception("Failed to get file metadata from s3") + return False + + for file_metadata in file_metadata_list: + paths_to_compress_buffer.add_file(file_metadata) + + return True + + def search_and_schedule_new_tasks(db_conn, db_cursor, clp_metadata_db_connection_config): """ For all jobs with PENDING status, split the job into tasks and schedule them. @@ -103,39 +158,48 @@ def search_and_schedule_new_tasks(db_conn, db_cursor, clp_metadata_db_connection clp_metadata_db_connection_config=clp_metadata_db_connection_config, ) - for path_idx, path in enumerate(clp_io_config.input.paths_to_compress, start=1): - path = Path(path) - - try: - file, empty_directory = validate_path_and_get_info( - CONTAINER_INPUT_LOGS_ROOT_DIR, path + input_config = clp_io_config.input + input_type = input_config.type + if input_type == "fs": + process_fs_input_paths(input_config, paths_to_compress_buffer) + elif input_type == "s3": + res = verify_s3_ingestion_config(input_config) + if res.is_err(): + logger.error(f"S3 verification failed: {res.err_value}") + update_compression_job_metadata( + db_cursor, + job_id, + { + "status": CompressionJobStatus.FAILED, + "status_msg": f"invalid input type: {input_type}", + }, ) - except ValueError as ex: - logger.error(str(ex)) + db_conn.commit() continue - if file: - paths_to_compress_buffer.add_file(file) - elif empty_directory: - paths_to_compress_buffer.add_empty_directory(empty_directory) - - if path.is_dir(): - for internal_path in path.rglob("*"): - try: - file, empty_directory = validate_path_and_get_info( - CONTAINER_INPUT_LOGS_ROOT_DIR, internal_path - ) - except ValueError as ex: - logger.error(str(ex)) - continue - - if file: - paths_to_compress_buffer.add_file(file) - elif empty_directory: - paths_to_compress_buffer.add_empty_directory(empty_directory) - - if path_idx % 10000 == 0: + if not process_s3_input(res.ok_value, input_config, paths_to_compress_buffer): + update_compression_job_metadata( + db_cursor, + job_id, + { + "status": CompressionJobStatus.FAILED, + "status_msg": f"S3 somehow failed during scheduling", + }, + ) db_conn.commit() + continue + else: + logger.error(f"Unexpected input type {input_type}") + update_compression_job_metadata( + db_cursor, + job_id, + { + "status": CompressionJobStatus.FAILED, + "status_msg": f"invalid input type: {input_type}", + }, + ) + db_conn.commit() + continue paths_to_compress_buffer.flush() tasks = 
paths_to_compress_buffer.get_tasks() diff --git a/components/job-orchestration/job_orchestration/scheduler/job_config.py b/components/job-orchestration/job_orchestration/scheduler/job_config.py index 7cf8b2324..140acd515 100644 --- a/components/job-orchestration/job_orchestration/scheduler/job_config.py +++ b/components/job-orchestration/job_orchestration/scheduler/job_config.py @@ -12,12 +12,25 @@ class PathsToCompress(BaseModel): empty_directories: typing.List[str] = None -class InputConfig(BaseModel): +class FsInputConfig(BaseModel): + type: typing.Literal["fs"] = "fs" paths_to_compress: typing.List[str] path_prefix_to_remove: str = None timestamp_key: typing.Optional[str] = None +class S3InputConfig(BaseModel): + type: typing.Literal["s3"] = "s3" + timestamp_key: typing.Optional[str] = None + + region_code: str + bucket: str + key_prefix: str + + aws_access_key_id: typing.Optional[str] = None + aws_secret_access_key: typing.Optional[str] = None + + class OutputConfig(BaseModel): tags: typing.Optional[typing.List[str]] = None target_archive_size: int @@ -27,7 +40,7 @@ class OutputConfig(BaseModel): class ClpIoConfig(BaseModel): - input: InputConfig + input: typing.Union[S3InputConfig, FsInputConfig] output: OutputConfig diff --git a/components/job-orchestration/pyproject.toml b/components/job-orchestration/pyproject.toml index c89d84dec..7941fb0db 100644 --- a/components/job-orchestration/pyproject.toml +++ b/components/job-orchestration/pyproject.toml @@ -10,6 +10,7 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.8 || ^3.10" +boto3 = "^1.35.81" Brotli = "^1.1.0" celery = "^5.3.6" # mariadb version must be compatible with libmariadev installed in runtime env. From ce2b440dc23c70762d70da535fc892a65cec06a9 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 20:41:08 -0500 Subject: [PATCH 45/62] Backup --- .../clp-py-utils/clp_py_utils/s3_utils.py | 4 + .../executor/compress/fs_compression_task.py | 107 +++++++++++------- .../job_orchestration/scheduler/job_config.py | 10 +- 3 files changed, 77 insertions(+), 44 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 5c5880df0..0645272a0 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -21,6 +21,10 @@ def verify_s3_ingestion_config(s3_input_config: S3InputConfig) -> Result[boto3.c ) return Ok(s3_client) +def generate_s3_virtual_hosted_style_url(region_code: str, bucket_name: str, object_key: str) -> str: + return f"https://{bucket_name}.s3.{region_code}.amazonaws.com/{object_key}" + + def get_file_metadata_with_bucket_prefix( s3_client: boto3.client, s3_config: S3InputConfig diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index abc34e901..8d890f864 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -4,7 +4,7 @@ import pathlib import subprocess from contextlib import closing -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, List, Tuple import yaml from celery.app.task import Task @@ -20,11 +20,11 @@ ) from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.core import read_yaml_config_file -from 
clp_py_utils.s3_utils import s3_put +from clp_py_utils.s3_utils import s3_put, generate_s3_virtual_hosted_style_url from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.celery import app from job_orchestration.scheduler.constants import CompressionTaskStatus -from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress +from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress, InputType, S3InputConfig from job_orchestration.scheduler.scheduler_data import CompressionTaskResult # Setup logging @@ -76,13 +76,43 @@ def update_job_metadata_and_tags(db_cursor, job_id, table_prefix, tag_ids, archi ), ) +def generate_fs_log_list_path_file( + output_file_path: pathlib.Path, + paths_to_compress: PathsToCompress, +): + file_paths = paths_to_compress.file_paths + empty_directories = paths_to_compress.empty_directories + with open(output_file_path, "w") as file: + if len(file_paths) > 0: + for path_str in file_paths: + file.write(path_str) + file.write("\n") + if empty_directories and len(empty_directories) > 0: + # Prepare list of paths to compress for clp + for path_str in empty_directories: + file.write(path_str) + file.write("\n") + + +def generate_s3_log_list_path_file( + output_file_path: pathlib.Path, + paths_to_compress: PathsToCompress, + s3_input_config: S3InputConfig, +): + file_paths = paths_to_compress.file_paths + with open(output_file_path, "w") as file: + if len(file_paths) > 0: + for path_str in file_paths: + file.write(generate_s3_virtual_hosted_style_url(s3_input_config.region_code, s3_input_config.bucket, path_str)) + file.write("\n") + def make_clp_command( clp_home: pathlib.Path, archive_output_dir: pathlib.Path, clp_config: ClpIoConfig, db_config_file_path: pathlib.Path, -): +) -> Tuple[List[str], Optional[Dict[str, str]]]: path_prefix_to_remove = clp_config.input.path_prefix_to_remove # fmt: off @@ -106,7 +136,7 @@ def make_clp_command( compression_cmd.append("--schema-path") compression_cmd.append(str(schema_path)) - return compression_cmd + return compression_cmd, None def make_clp_s_command( @@ -115,7 +145,7 @@ def make_clp_s_command( clp_config: ClpIoConfig, db_config_file_path: pathlib.Path, enable_s3_write: bool, -): +) -> Tuple[List[str], Optional[Dict[str, str]]]: # fmt: off compression_cmd = [ str(clp_home / "bin" / "clp-s"), @@ -127,6 +157,17 @@ def make_clp_s_command( ] # fmt: on + if InputType.S3 == clp_config.input.type: + compression_env = { + **os.environ, + "AWS_ACCESS_KEY_ID": clp_config.input.aws_access_key_id, + "AWS_SECRET_ACCESS_KEY": clp_config.input.aws_secret_access_key, + } + compression_cmd.append("--auth") + compression_cmd.append("s3") + else: + compression_env = None + if enable_s3_write: compression_cmd.append("--single-file-archive") @@ -134,7 +175,7 @@ def make_clp_s_command( compression_cmd.append("--timestamp-key") compression_cmd.append(clp_config.input.timestamp_key) - return compression_cmd + return compression_cmd, compression_env def run_clp( @@ -176,20 +217,6 @@ def run_clp( yaml.safe_dump(clp_metadata_db_connection_config, db_config_file) db_config_file.close() - # Get input config - input_config = clp_config.input - input_type = input_config.type - if "s3" == input_type: - logger.info(paths_to_compress.file_paths) - logger.error("Unsupported for now") - worker_output = { - "total_uncompressed_size": 0, - "total_compressed_size": 0, - "error_message": "unsupported" - } - return CompressionTaskStatus.FAILED, worker_output - - # Get s3 config s3_config: S3Config 
enable_s3_write = False @@ -204,14 +231,14 @@ def run_clp( enable_s3_write = True if StorageEngine.CLP == clp_storage_engine: - compression_cmd = make_clp_command( + compression_cmd, compression_env = make_clp_command( clp_home=clp_home, archive_output_dir=archive_output_dir, clp_config=clp_config, db_config_file_path=db_config_file_path, ) elif StorageEngine.CLP_S == clp_storage_engine: - compression_cmd = make_clp_s_command( + compression_cmd, compression_env = make_clp_s_command( clp_home=clp_home, archive_output_dir=archive_output_dir, clp_config=clp_config, @@ -223,19 +250,16 @@ def run_clp( return False, {"error_message": f"Unsupported storage engine {clp_storage_engine}"} # Prepare list of paths to compress for clp + input_type = clp_config.input.type log_list_path = data_dir / f"{instance_id_str}-log-paths.txt" - file_paths = paths_to_compress.file_paths - - with open(log_list_path, "w") as file: - if len(file_paths) > 0: - for path_str in file_paths: - file.write(path_str) - file.write("\n") - if paths_to_compress.empty_directories and len(paths_to_compress.empty_directories) > 0: - # Prepare list of paths to compress for clp - for path_str in paths_to_compress.empty_directories: - file.write(path_str) - file.write("\n") + if InputType.FS == input_type: + generate_fs_log_list_path_file(log_list_path, paths_to_compress) + elif InputType.S3 == input_type: + generate_s3_log_list_path_file(log_list_path, paths_to_compress, clp_config.input) + else: + error_msg = f"Unsupported input type: {input_type}." + logger.error(error_msg) + return False, {"error_message": error_msg} compression_cmd.append("--files-from") compression_cmd.append(str(log_list_path)) @@ -247,7 +271,12 @@ def run_clp( # Start compression logger.debug("Compressing...") compression_successful = False - proc = subprocess.Popen(compression_cmd, stdout=subprocess.PIPE, stderr=stderr_log_file) + proc = subprocess.Popen( + compression_cmd, + stdout=subprocess.PIPE, + stderr=stderr_log_file, + env=compression_env + ) # Compute the total amount of data compressed last_archive_stats = None @@ -260,11 +289,7 @@ def run_clp( while not last_line_decoded: line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None - if "" == line: - # Skip empty lines that could be caused by potential errors in printing archive stats - continue - - if line is not None: + if line: stats = json.loads(line.decode("ascii")) else: last_line_decoded = True diff --git a/components/job-orchestration/job_orchestration/scheduler/job_config.py b/components/job-orchestration/job_orchestration/scheduler/job_config.py index 140acd515..8d0267b27 100644 --- a/components/job-orchestration/job_orchestration/scheduler/job_config.py +++ b/components/job-orchestration/job_orchestration/scheduler/job_config.py @@ -1,9 +1,13 @@ from __future__ import annotations import typing - +from enum import auto from pydantic import BaseModel, validator +from strenum import LowercaseStrEnum +class InputType(LowercaseStrEnum): + FS = auto() + S3 = auto() class PathsToCompress(BaseModel): file_paths: typing.List[str] @@ -13,14 +17,14 @@ class PathsToCompress(BaseModel): class FsInputConfig(BaseModel): - type: typing.Literal["fs"] = "fs" + type: typing.Literal[InputType.FS.value] = InputType.FS.value paths_to_compress: typing.List[str] path_prefix_to_remove: str = None timestamp_key: typing.Optional[str] = None class S3InputConfig(BaseModel): - type: typing.Literal["s3"] = "s3" + type: typing.Literal[InputType.S3.value] = InputType.S3.value timestamp_key: typing.Optional[str] 
= None region_code: str From b8f715d9467b4e82b6911795cb94b4a63fac84d3 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 19 Dec 2024 15:34:41 -0500 Subject: [PATCH 46/62] Update execution image dependency --- .../setup-scripts/install-prebuilt-packages.sh | 1 + .../setup-scripts/install-prebuilt-packages.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/tools/docker-images/clp-execution-base-ubuntu-focal/setup-scripts/install-prebuilt-packages.sh b/tools/docker-images/clp-execution-base-ubuntu-focal/setup-scripts/install-prebuilt-packages.sh index 9e20e791c..a648a137b 100755 --- a/tools/docker-images/clp-execution-base-ubuntu-focal/setup-scripts/install-prebuilt-packages.sh +++ b/tools/docker-images/clp-execution-base-ubuntu-focal/setup-scripts/install-prebuilt-packages.sh @@ -8,6 +8,7 @@ set -u apt-get update DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + ca-certificates \ checkinstall \ curl \ libmariadb-dev \ diff --git a/tools/docker-images/clp-execution-base-ubuntu-jammy/setup-scripts/install-prebuilt-packages.sh b/tools/docker-images/clp-execution-base-ubuntu-jammy/setup-scripts/install-prebuilt-packages.sh index 9e20e791c..a648a137b 100755 --- a/tools/docker-images/clp-execution-base-ubuntu-jammy/setup-scripts/install-prebuilt-packages.sh +++ b/tools/docker-images/clp-execution-base-ubuntu-jammy/setup-scripts/install-prebuilt-packages.sh @@ -8,6 +8,7 @@ set -u apt-get update DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + ca-certificates \ checkinstall \ curl \ libmariadb-dev \ From 57e1912b8103643e6e834af8ddd8c9f80ccf77e1 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:26:53 -0500 Subject: [PATCH 47/62] simplify the code a little bit --- .../clp-py-utils/clp_py_utils/s3_utils.py | 49 +++++++++++-------- .../compress/compression_scheduler.py | 37 +++++--------- components/job-orchestration/pyproject.toml | 1 - 3 files changed, 40 insertions(+), 47 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 5e82d609d..4a0cfe707 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -11,41 +11,48 @@ from job_orchestration.scheduler.job_config import S3InputConfig from clp_py_utils.compression import FileMetadata -def verify_s3_ingestion_config(s3_input_config: S3InputConfig) -> Result[boto3.client, str]: - # Consider actually returning the S3 client? 
+ +def generate_s3_virtual_hosted_style_url(region_code: str, bucket_name: str, object_key: str) -> str: + return f"https://{bucket_name}.s3.{region_code}.amazonaws.com/{object_key}" + + +def get_s3_file_metadata( + s3_input_config: S3InputConfig +) -> Result[List[FileMetadata], str]: + file_metadata_list: List[FileMetadata] = list() + s3_client = boto3.client( "s3", region_name=s3_input_config.region_code, aws_access_key_id=s3_input_config.aws_access_key_id, aws_secret_access_key=s3_input_config.aws_secret_access_key, ) - return Ok(s3_client) - -def generate_s3_virtual_hosted_style_url(region_code: str, bucket_name: str, object_key: str) -> str: - return f"https://{bucket_name}.s3.{region_code}.amazonaws.com/{object_key}" + try: + paginator = s3_client.get_paginator("list_objects_v2") + pages = paginator.paginate(Bucket=s3_input_config.bucket, Prefix=s3_input_config.key_prefix) -def get_file_metadata_with_bucket_prefix( - s3_client: boto3.client, - s3_config: S3InputConfig -) -> List[FileMetadata]: - file_metadata_list: List[FileMetadata] = list() - paginator = s3_client.get_paginator("list_objects_v2") - pages = paginator.paginate(Bucket=s3_config.bucket, Prefix=s3_config.key_prefix) + for page in pages: + contents = page.get("Contents", None) + if contents is None: + continue - for page in pages: - contents = page.get("Contents", None) - if contents: for obj in contents: - # we skip empty directory in s3. object_key = obj["Key"] - if object_key.endswith("/"): + if not object_key.endswith("/"): + # Skip any object that resolves to a directory like path continue - # Note, don't resolve this path - # s3_path = Path(bucket_name + "/" + object_key) + file_metadata_list.append(FileMetadata(Path(object_key), obj["Size"])) - return file_metadata_list + except ClientError as e: + error_code = e.response["Error"]["Code"] + error_message = e.response["Error"]["Message"] + return Err(f"ClientError: {error_code} - {error_message}") + except Exception as e: + return Err(f"An unexpected error occurred: {e}") + + return Ok(file_metadata_list) def s3_put( diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index cef0bc932..d9196cc7f 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -6,8 +6,8 @@ import time from contextlib import closing from pathlib import Path +from result import Ok, Result -import boto3 import brotli import celery import msgpack @@ -22,7 +22,7 @@ from clp_py_utils.compression import validate_path_and_get_info from clp_py_utils.core import read_yaml_config_file from clp_py_utils.sql_adapter import SQL_Adapter -from clp_py_utils.s3_utils import verify_s3_ingestion_config, get_file_metadata_with_bucket_prefix +from clp_py_utils.s3_utils import get_s3_files_metadata from job_orchestration.executor.compress.fs_compression_task import compress from job_orchestration.scheduler.compress.partition import PathsToCompressBuffer from job_orchestration.scheduler.constants import CompressionJobStatus, CompressionTaskStatus @@ -117,20 +117,20 @@ def process_fs_input_paths( def process_s3_input( - s3_client: boto3.client, input_config: S3InputConfig, paths_to_compress_buffer: PathsToCompressBuffer, -): - try: - file_metadata_list = get_file_metadata_with_bucket_prefix(s3_client, input_config) - except 
Exception: - logger.exception("Failed to get file metadata from s3") - return False +) -> Result[bool, str]: + res = get_s3_file_metadata(input_config) + if res.is_err(): + logger.error(f"Failed to process s3 input: {res.err_value}") + return res + + file_metadata_list = res.ok_value for file_metadata in file_metadata_list: paths_to_compress_buffer.add_file(file_metadata) - return True + return Ok(True) def search_and_schedule_new_tasks(db_conn, db_cursor, clp_metadata_db_connection_config): @@ -163,27 +163,14 @@ def search_and_schedule_new_tasks(db_conn, db_cursor, clp_metadata_db_connection if input_type == "fs": process_fs_input_paths(input_config, paths_to_compress_buffer) elif input_type == "s3": - res = verify_s3_ingestion_config(input_config) + res = process_s3_input(input_config, paths_to_compress_buffer) if res.is_err(): - logger.error(f"S3 verification failed: {res.err_value}") - update_compression_job_metadata( - db_cursor, - job_id, - { - "status": CompressionJobStatus.FAILED, - "status_msg": f"invalid input type: {input_type}", - }, - ) - db_conn.commit() - continue - - if not process_s3_input(res.ok_value, input_config, paths_to_compress_buffer): update_compression_job_metadata( db_cursor, job_id, { "status": CompressionJobStatus.FAILED, - "status_msg": f"S3 somehow failed during scheduling", + "status_msg": f"Scheduler Failed for s3 input: {res.err_value}", }, ) db_conn.commit() diff --git a/components/job-orchestration/pyproject.toml b/components/job-orchestration/pyproject.toml index 7941fb0db..c89d84dec 100644 --- a/components/job-orchestration/pyproject.toml +++ b/components/job-orchestration/pyproject.toml @@ -10,7 +10,6 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.8 || ^3.10" -boto3 = "^1.35.81" Brotli = "^1.1.0" celery = "^5.3.6" # mariadb version must be compatible with libmariadev installed in runtime env. 
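For readers following the S3 ingestion changes introduced in the patch above: the listing logic in get_s3_file_metadata follows the standard boto3 list_objects_v2 paginator pattern. The sketch below is a minimal, standalone illustration of that pattern rather than part of the patch series; the bucket name, key prefix, and reliance on the default AWS credential provider chain are assumptions for illustration only.

import boto3

def list_log_objects(bucket: str, prefix: str):
    """Yield (object_key, size_in_bytes) for each non-directory object under prefix."""
    s3_client = boto3.client("s3")  # Credentials resolved via the default provider chain.
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # "Contents" is absent when a page (or the whole prefix) contains no objects.
        for obj in page.get("Contents", []):
            object_key = obj["Key"]
            if object_key.endswith("/"):
                # Keys ending in "/" are zero-byte "folder" placeholders; skip them.
                continue
            yield object_key, obj["Size"]

if __name__ == "__main__":
    # Hypothetical bucket/prefix, for illustration only.
    for key, size in list_log_objects("example-log-bucket", "ingest/"):
        print(f"{key}\t{size}")

In the patches that follow, each listed key is then expanded by generate_s3_virtual_hosted_style_url into a https://<bucket>.s3.<region>.amazonaws.com/<key> URL before being handed to the compression worker.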
From 27b861247b60ec539788f51bdc5fa80239f89b05 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 19 Dec 2024 17:00:17 -0500 Subject: [PATCH 48/62] fix a previous mistake --- .../scheduler/compress/compression_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index d9196cc7f..1aadf6962 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -22,7 +22,7 @@ from clp_py_utils.compression import validate_path_and_get_info from clp_py_utils.core import read_yaml_config_file from clp_py_utils.sql_adapter import SQL_Adapter -from clp_py_utils.s3_utils import get_s3_files_metadata +from clp_py_utils.s3_utils import get_s3_file_metadata from job_orchestration.executor.compress.fs_compression_task import compress from job_orchestration.scheduler.compress.partition import PathsToCompressBuffer from job_orchestration.scheduler.constants import CompressionJobStatus, CompressionTaskStatus From d55f1ad73bcc5724f9f8f27addb7186f49d8500e Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 19 Dec 2024 17:07:13 -0500 Subject: [PATCH 49/62] Keep fixing previous mistake --- components/clp-py-utils/clp_py_utils/s3_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 4a0cfe707..000c81010 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -39,7 +39,7 @@ def get_s3_file_metadata( for obj in contents: object_key = obj["Key"] - if not object_key.endswith("/"): + if object_key.endswith("/"): # Skip any object that resolves to a directory like path continue From 4de4fee6f80bdeaa7af39c41bd88d64da3f65e3c Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 19 Dec 2024 17:45:49 -0500 Subject: [PATCH 50/62] add url parsing helper --- .../clp-py-utils/clp_py_utils/s3_utils.py | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 000c81010..ff3c68c67 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -4,16 +4,46 @@ from botocore.config import Config from botocore.exceptions import ClientError from result import Err, Ok, Result - -from typing import List +import re +from typing import List, Tuple from clp_py_utils.clp_config import S3Config from job_orchestration.scheduler.job_config import S3InputConfig from clp_py_utils.compression import FileMetadata +# Constants +AWS_ENDPOINT = "amazonaws.com" + +def parse_s3_url(s3_url: str) -> Tuple[str, str, str]: + host_style_url_regex = re.compile( + r"https://(?P[a-z0-9.-]+)\.s3(\.(?P[a-z0-9-]+))?" 
+ r"\.(?P[a-z0-9.-]+)/(?P[^?]+).*" + ) + match = host_style_url_regex.match(s3_url) + + if match is None: + path_style_url_regex = re.compile( + r"https://s3(\.(?P[a-z0-9-]+))?\.(?P[a-z0-9.-]+)/" + r"(?P[a-z0-9.-]+)/(?P[^?]+).*" + ) + match = path_style_url_regex.match(s3_url) + + if match is None: + raise ValueError(f"Unsupported URL format: {s3_url}") + + region_code = match.group("region_code") + bucket_name = match.group("bucket_name") + endpoint = match.group("endpoint") + key_prefix = match.group("key_prefix") + + if AWS_ENDPOINT != endpoint: + raise ValueError(f"Unsupported endpoint: {endpoint}") + + return region_code, bucket_name, key_prefix + def generate_s3_virtual_hosted_style_url(region_code: str, bucket_name: str, object_key: str) -> str: - return f"https://{bucket_name}.s3.{region_code}.amazonaws.com/{object_key}" + return f"https://{bucket_name}.s3.{region_code}.{AWS_ENDPOINT}/{object_key}" def get_s3_file_metadata( From 4224bd636bd974dea00a8d85d5db650a90f40a4e Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 19 Dec 2024 20:16:55 -0500 Subject: [PATCH 51/62] Linter --- .../scripts/native/compress.py | 6 ++++- .../clp-py-utils/clp_py_utils/s3_utils.py | 15 ++++++----- .../executor/compress/fs_compression_task.py | 27 ++++++++++++------- .../compress/compression_scheduler.py | 11 +++----- .../job_orchestration/scheduler/job_config.py | 3 +++ 5 files changed, 37 insertions(+), 25 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py index 88ccc8aba..438140f60 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py @@ -15,7 +15,11 @@ CompressionJobCompletionStatus, CompressionJobStatus, ) -from job_orchestration.scheduler.job_config import ClpIoConfig, FsInputConfig, OutputConfig, S3InputConfig +from job_orchestration.scheduler.job_config import ( + ClpIoConfig, + FsInputConfig, + OutputConfig, +) from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index ff3c68c67..ebc632cab 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -1,19 +1,20 @@ +import re from pathlib import Path +from typing import List, Tuple import boto3 from botocore.config import Config from botocore.exceptions import ClientError +from job_orchestration.scheduler.job_config import S3InputConfig from result import Err, Ok, Result -import re -from typing import List, Tuple from clp_py_utils.clp_config import S3Config -from job_orchestration.scheduler.job_config import S3InputConfig from clp_py_utils.compression import FileMetadata # Constants AWS_ENDPOINT = "amazonaws.com" + def parse_s3_url(s3_url: str) -> Tuple[str, str, str]: host_style_url_regex = re.compile( r"https://(?P[a-z0-9.-]+)\.s3(\.(?P[a-z0-9-]+))?" 
@@ -42,13 +43,13 @@ def parse_s3_url(s3_url: str) -> Tuple[str, str, str]: return region_code, bucket_name, key_prefix -def generate_s3_virtual_hosted_style_url(region_code: str, bucket_name: str, object_key: str) -> str: +def generate_s3_virtual_hosted_style_url( + region_code: str, bucket_name: str, object_key: str +) -> str: return f"https://{bucket_name}.s3.{region_code}.{AWS_ENDPOINT}/{object_key}" -def get_s3_file_metadata( - s3_input_config: S3InputConfig -) -> Result[List[FileMetadata], str]: +def get_s3_file_metadata(s3_input_config: S3InputConfig) -> Result[List[FileMetadata], str]: file_metadata_list: List[FileMetadata] = list() s3_client = boto3.client( diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index cfe983be9..514b32ef0 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -4,7 +4,7 @@ import pathlib import subprocess from contextlib import closing -from typing import Any, Dict, Optional, List, Tuple +from typing import Any, Dict, List, Optional, Tuple import yaml from celery.app.task import Task @@ -20,11 +20,16 @@ ) from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.core import read_yaml_config_file -from clp_py_utils.s3_utils import s3_put, generate_s3_virtual_hosted_style_url +from clp_py_utils.s3_utils import generate_s3_virtual_hosted_style_url, s3_put from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.celery import app from job_orchestration.scheduler.constants import CompressionTaskStatus -from job_orchestration.scheduler.job_config import ClpIoConfig, PathsToCompress, InputType, S3InputConfig +from job_orchestration.scheduler.job_config import ( + ClpIoConfig, + InputType, + PathsToCompress, + S3InputConfig, +) from job_orchestration.scheduler.scheduler_data import CompressionTaskResult # Setup logging @@ -76,10 +81,11 @@ def update_job_metadata_and_tags(db_cursor, job_id, table_prefix, tag_ids, archi ), ) + def generate_fs_log_list_path_file( output_file_path: pathlib.Path, paths_to_compress: PathsToCompress, -): +) -> None: file_paths = paths_to_compress.file_paths empty_directories = paths_to_compress.empty_directories with open(output_file_path, "w") as file: @@ -98,12 +104,16 @@ def generate_s3_log_list_path_file( output_file_path: pathlib.Path, paths_to_compress: PathsToCompress, s3_input_config: S3InputConfig, -): +) -> None: file_paths = paths_to_compress.file_paths with open(output_file_path, "w") as file: if len(file_paths) > 0: for path_str in file_paths: - file.write(generate_s3_virtual_hosted_style_url(s3_input_config.region_code, s3_input_config.bucket, path_str)) + file.write( + generate_s3_virtual_hosted_style_url( + s3_input_config.region_code, s3_input_config.bucket, path_str + ) + ) file.write("\n") @@ -272,10 +282,7 @@ def run_clp( logger.debug("Compressing...") compression_successful = False proc = subprocess.Popen( - compression_cmd, - stdout=subprocess.PIPE, - stderr=stderr_log_file, - env=compression_env + compression_cmd, stdout=subprocess.PIPE, stderr=stderr_log_file, env=compression_env ) # Compute the total amount of data compressed diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py 
b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index 1aadf6962..e57049aad 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -6,7 +6,6 @@ import time from contextlib import closing from pathlib import Path -from result import Ok, Result import brotli import celery @@ -21,8 +20,8 @@ from clp_py_utils.clp_logging import get_logger, get_logging_formatter, set_logging_level from clp_py_utils.compression import validate_path_and_get_info from clp_py_utils.core import read_yaml_config_file -from clp_py_utils.sql_adapter import SQL_Adapter from clp_py_utils.s3_utils import get_s3_file_metadata +from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.fs_compression_task import compress from job_orchestration.scheduler.compress.partition import PathsToCompressBuffer from job_orchestration.scheduler.constants import CompressionJobStatus, CompressionTaskStatus @@ -32,6 +31,7 @@ CompressionTaskResult, ) from pydantic import ValidationError +from result import Ok, Result # Setup logging logger = get_logger("compression_scheduler") @@ -81,16 +81,13 @@ def update_compression_job_metadata(db_cursor, job_id, kv): def process_fs_input_paths( - fs_input_conf: FsInputConfig, - paths_to_compress_buffer: PathsToCompressBuffer + fs_input_conf: FsInputConfig, paths_to_compress_buffer: PathsToCompressBuffer ): for path_idx, path in enumerate(fs_input_conf.paths_to_compress, start=1): path = Path(path) try: - file, empty_directory = validate_path_and_get_info( - CONTAINER_INPUT_LOGS_ROOT_DIR, path - ) + file, empty_directory = validate_path_and_get_info(CONTAINER_INPUT_LOGS_ROOT_DIR, path) except ValueError as ex: logger.error(str(ex)) continue diff --git a/components/job-orchestration/job_orchestration/scheduler/job_config.py b/components/job-orchestration/job_orchestration/scheduler/job_config.py index 8d0267b27..576d68886 100644 --- a/components/job-orchestration/job_orchestration/scheduler/job_config.py +++ b/components/job-orchestration/job_orchestration/scheduler/job_config.py @@ -2,13 +2,16 @@ import typing from enum import auto + from pydantic import BaseModel, validator from strenum import LowercaseStrEnum + class InputType(LowercaseStrEnum): FS = auto() S3 = auto() + class PathsToCompress(BaseModel): file_paths: typing.List[str] group_ids: typing.List[int] From 1cf3d01bc6bee9354ddf21912bfdde3a89879baf Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 20 Dec 2024 12:32:02 -0500 Subject: [PATCH 52/62] Some refactor --- .../executor/compress/fs_compression_task.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 514b32ef0..6cb076ee2 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -82,7 +82,7 @@ def update_job_metadata_and_tags(db_cursor, job_id, table_prefix, tag_ids, archi ) -def generate_fs_log_list_path_file( +def generate_fs_log_path_list_file( output_file_path: pathlib.Path, paths_to_compress: PathsToCompress, ) -> None: @@ -100,20 +100,20 @@ def 
generate_fs_log_list_path_file( file.write("\n") -def generate_s3_log_list_path_file( +def generate_s3_log_url_list_file( output_file_path: pathlib.Path, paths_to_compress: PathsToCompress, s3_input_config: S3InputConfig, ) -> None: - file_paths = paths_to_compress.file_paths + # S3 object keys are stored as file_paths in `PathsToCompress` + object_keys = paths_to_compress.file_paths with open(output_file_path, "w") as file: - if len(file_paths) > 0: - for path_str in file_paths: - file.write( - generate_s3_virtual_hosted_style_url( - s3_input_config.region_code, s3_input_config.bucket, path_str - ) + if len(object_keys) > 0: + for object_key in object_keys: + s3_virtual_hosted_style_url = generate_s3_virtual_hosted_style_url( + s3_input_config.region_code, s3_input_config.bucket, object_key ) + file.write(s3_virtual_hosted_style_url) file.write("\n") @@ -259,20 +259,20 @@ def run_clp( logger.error(f"Unsupported storage engine {clp_storage_engine}") return False, {"error_message": f"Unsupported storage engine {clp_storage_engine}"} - # Prepare list of paths to compress for clp + # Prepare list of targets to compress input_type = clp_config.input.type - log_list_path = data_dir / f"{instance_id_str}-log-paths.txt" + targets_list_path = data_dir / f"{instance_id_str}-log-paths.txt" if InputType.FS == input_type: - generate_fs_log_list_path_file(log_list_path, paths_to_compress) + generate_fs_log_path_list_file(targets_list_path, paths_to_compress) elif InputType.S3 == input_type: - generate_s3_log_list_path_file(log_list_path, paths_to_compress, clp_config.input) + generate_s3_log_url_list_file(targets_list_path, paths_to_compress, clp_config.input) else: error_msg = f"Unsupported input type: {input_type}." logger.error(error_msg) return False, {"error_message": error_msg} compression_cmd.append("--files-from") - compression_cmd.append(str(log_list_path)) + compression_cmd.append(str(targets_list_path)) # Open stderr log file stderr_log_path = logs_dir / f"{instance_id_str}-stderr.log" @@ -351,8 +351,8 @@ def run_clp( compression_successful = True # Remove generated temporary files - if log_list_path: - log_list_path.unlink() + if targets_list_path: + targets_list_path.unlink() db_config_file_path.unlink() logger.debug("Compressed.") From b1655cd6baa89ed3e3c84913d55a2d1594566c06 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 20 Dec 2024 14:38:34 -0500 Subject: [PATCH 53/62] Refactor compress scripts --- .../clp_package_utils/scripts/compress.py | 52 ++++++++++-------- .../scripts/native/compress.py | 55 +++++++------------ 2 files changed, 49 insertions(+), 58 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/compress.py b/components/clp-package-utils/clp_package_utils/scripts/compress.py index efd3180ae..bb9ceb7ea 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/compress.py @@ -7,7 +7,6 @@ from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, - CONTAINER_INPUT_LOGS_ROOT_DIR, dump_container_config, generate_container_config, generate_container_name, @@ -46,6 +45,16 @@ def main(argv): parsed_args = args_parser.parse_args(argv[1:]) + paths_to_compress = parsed_args.paths + compression_path_list = parsed_args.path_list + # Validate some input paths were specified + if len(paths_to_compress) == 0 and compression_path_list is None: + args_parser.error("No paths specified.") + + # Validate paths were 
specified using only one method + if len(paths_to_compress) > 0 and compression_path_list is not None: + args_parser.error("Paths cannot be specified on the command line AND through a file.") + # Validate and load config file try: config_file_path = pathlib.Path(parsed_args.config) @@ -74,8 +83,7 @@ def main(argv): compress_cmd = [ "python3", "-m", "clp_package_utils.scripts.native.compress", - "--config", str(generated_config_path_on_container), - "--remove-path-prefix", str(CONTAINER_INPUT_LOGS_ROOT_DIR), + "--config", str(generated_config_path_on_container) ] # fmt: on if parsed_args.timestamp_key is not None: @@ -84,30 +92,28 @@ def main(argv): if parsed_args.tags is not None: compress_cmd.append("--tags") compress_cmd.append(parsed_args.tags) - for path in parsed_args.paths: - # Resolve path and prefix it with CONTAINER_INPUT_LOGS_ROOT_DIR - resolved_path = pathlib.Path(path).resolve() - path = str(CONTAINER_INPUT_LOGS_ROOT_DIR / resolved_path.relative_to(resolved_path.anchor)) - compress_cmd.append(path) - if parsed_args.path_list is not None: + + # Write paths to compress to a file + while True: # Get unused output path - while True: - container_path_list_filename = f"{uuid.uuid4()}.txt" - container_path_list_path = clp_config.logs_directory / container_path_list_filename - if not container_path_list_path.exists(): - break - - with open(parsed_args.path_list, "r") as path_list_file: - with open(container_path_list_path, "w") as container_path_list_file: + container_path_list_filename = f"{uuid.uuid4()}.txt" + container_path_list_path = clp_config.logs_directory / container_path_list_filename + if not container_path_list_path.exists(): + break + + with open(container_path_list_path, "w") as container_path_list_file: + if compression_path_list is not None: + with open(parsed_args.path_list, "r") as path_list_file: for line in path_list_file: resolved_path = pathlib.Path(line.rstrip()).resolve() - path = CONTAINER_INPUT_LOGS_ROOT_DIR / resolved_path.relative_to( - resolved_path.anchor - ) - container_path_list_file.write(f"{path}\n") + container_path_list_file.write(f"{resolved_path}\n") + + for path in paths_to_compress: + resolved_path = pathlib.Path(path).resolve() + container_path_list_file.write(f"{resolved_path}\n") - compress_cmd.append("--path-list") - compress_cmd.append(container_clp_config.logs_directory / container_path_list_filename) + compress_cmd.append("--path-list") + compress_cmd.append(container_clp_config.logs_directory / container_path_list_filename) cmd = container_start_cmd + compress_cmd subprocess.run(cmd, check=True) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py index 438140f60..5592b23a4 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py @@ -23,6 +23,7 @@ from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, + CONTAINER_INPUT_LOGS_ROOT_DIR, get_clp_home, load_config_file, ) @@ -132,14 +133,12 @@ def main(argv): default=str(default_config_file_path), help="CLP package configuration file.", ) - args_parser.add_argument("paths", metavar="PATH", nargs="*", help="Paths to compress.") args_parser.add_argument( - "-f", "--path-list", dest="path_list", help="A file listing all paths to compress." 
- ) - args_parser.add_argument( - "--remove-path-prefix", - metavar="DIR", - help="Removes the given path prefix from each compressed file/dir.", + "-f", + "--path-list", + dest="path_list", + help="A file listing all paths to compress.", + required=True, ) args_parser.add_argument( "--no-progress-reporting", action="store_true", help="Disables progress reporting." @@ -152,17 +151,8 @@ def main(argv): "-t", "--tags", help="A comma-separated list of tags to apply to the compressed archives." ) parsed_args = args_parser.parse_args(argv[1:]) - compress_paths_arg = parsed_args.paths compress_path_list_arg = parsed_args.path_list - # Validate some input paths were specified - if compress_path_list_arg is None and len(compress_paths_arg) == 0: - args_parser.error("No paths specified.") - - # Validate paths were specified using only one method - if len(compress_paths_arg) > 0 and compress_path_list_arg is not None: - args_parser.error("Paths cannot be specified on the command line AND through a file.") - # Validate and load config file try: config_file_path = pathlib.Path(parsed_args.config) @@ -177,32 +167,27 @@ def main(argv): comp_jobs_dir.mkdir(parents=True, exist_ok=True) paths_to_compress = [] - if len(compress_paths_arg) > 0: - for path in compress_paths_arg: - stripped_path = path.strip() - if "" == stripped_path: + # Read paths from the input file + compress_path_list_path = pathlib.Path(compress_path_list_arg).resolve() + with open(compress_path_list_path, "r") as f: + for path in f: + stripped_path_str = path.strip() + if "" == stripped_path_str: # Skip empty paths continue - resolved_path_str = str(pathlib.Path(stripped_path).resolve()) + stripped_path = pathlib.Path(stripped_path_str) + container_file_path = CONTAINER_INPUT_LOGS_ROOT_DIR / pathlib.Path( + stripped_path + ).relative_to(stripped_path.anchor) + resolved_path_str = str(container_file_path.resolve()) paths_to_compress.append(resolved_path_str) - else: - # Read paths from the input file - compress_path_list_path = pathlib.Path(compress_path_list_arg).resolve() - with open(compress_path_list_path, "r") as f: - for path in f: - stripped_path = path.strip() - if "" == stripped_path: - # Skip empty paths - continue - resolved_path_str = str(pathlib.Path(stripped_path).resolve()) - paths_to_compress.append(resolved_path_str) mysql_adapter = SQL_Adapter(clp_config.database) clp_input_config = FsInputConfig( - paths_to_compress=paths_to_compress, timestamp_key=parsed_args.timestamp_key + paths_to_compress=paths_to_compress, + timestamp_key=parsed_args.timestamp_key, + path_prefix_to_remove=str(CONTAINER_INPUT_LOGS_ROOT_DIR), ) - if parsed_args.remove_path_prefix: - clp_input_config.path_prefix_to_remove = parsed_args.remove_path_prefix clp_output_config = OutputConfig.parse_obj(clp_config.archive_output) if parsed_args.tags: tag_list = [tag.strip().lower() for tag in parsed_args.tags.split(",") if tag] From d12e173ff3694664d6a28d6ddccefe8821de0b6a Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 2 Jan 2025 16:22:47 -0500 Subject: [PATCH 54/62] Initial support for cmdline --- .../clp_package_utils/scripts/compress.py | 143 ++++++++++++++---- .../scripts/native/compress.py | 128 ++++++++++++---- 2 files changed, 214 insertions(+), 57 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/compress.py b/components/clp-package-utils/clp_package_utils/scripts/compress.py index bb9ceb7ea..dee4526ae 100755 --- 
a/components/clp-package-utils/clp_package_utils/scripts/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/compress.py @@ -4,6 +4,7 @@ import subprocess import sys import uuid +from typing import List from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -16,25 +17,68 @@ load_config_file, validate_and_load_db_credentials_file, ) +from clp_py_utils.clp_config import StorageEngine +from job_orchestration.scheduler.job_config import InputType logger = logging.getLogger(__file__) -def main(argv): - clp_home = get_clp_home() - default_config_file_path = clp_home / CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH - - args_parser = argparse.ArgumentParser(description="Compresses files/directories") +def generate_targets_list( + input_type: InputType, + container_targets_list_path: pathlib.Path, + parsed_args: argparse.Namespace, +) -> None: + if InputType.FS == input_type: + compression_targets_list_file = parsed_args.path_list + with open(container_targets_list_path, "w") as container_targets_list_file: + if compression_targets_list_file is not None: + with open(compression_targets_list_file, "r") as targets_list_file: + for line in targets_list_file: + resolved_path = pathlib.Path(line.rstrip()).resolve() + container_targets_list_file.write(f"{resolved_path}\n") + + for path in parsed_args.paths: + resolved_path = pathlib.Path(path).resolve() + container_targets_list_file.write(f"{resolved_path}\n") + + elif InputType.S3 == input_type: + with open(container_targets_list_path, "w") as container_targets_list_file: + container_targets_list_file.write(f"{parsed_args.url}\n") + + else: + raise ValueError(f"Unsupported input type: {input_type}.") + + +def append_input_specific_args( + compress_cmd: List[str], + parsed_args: argparse.Namespace +) -> None: + input_type = parsed_args.input_type + + if InputType.FS == input_type: + return + elif InputType.S3 == input_type: + # TODO: also think about credentials from file. + if parsed_args.aws_access_key_id is not None: + compress_cmd.append("--aws-access-key-id") + compress_cmd.append(parsed_args.aws_access_key_id) + if parsed_args.aws_secret_access_key is not None: + compress_cmd.append("--aws-secret-access-key") + compress_cmd.append(parsed_args.aws_secret_access_key) + else: + raise ValueError(f"Unsupported input type: {input_type}.") + + +def add_common_arguments( + args_parser: argparse.ArgumentParser, + default_config_file_path: pathlib.Path +) -> None: args_parser.add_argument( "--config", "-c", default=str(default_config_file_path), help="CLP package configuration file.", ) - args_parser.add_argument("paths", metavar="PATH", nargs="*", help="Paths to compress.") - args_parser.add_argument( - "-f", "--path-list", dest="path_list", help="A file listing all paths to compress." - ) args_parser.add_argument( "--timestamp-key", help="The path (e.g. x.y) for the field containing the log event's timestamp.", @@ -42,18 +86,39 @@ def main(argv): args_parser.add_argument( "-t", "--tags", help="A comma-separated list of tags to apply to the compressed archives." ) + args_parser.add_argument( + "--no-progress-reporting", action="store_true", help="Disables progress reporting." 
+ ) - parsed_args = args_parser.parse_args(argv[1:]) - paths_to_compress = parsed_args.paths - compression_path_list = parsed_args.path_list - # Validate some input paths were specified - if len(paths_to_compress) == 0 and compression_path_list is None: - args_parser.error("No paths specified.") +def main(argv): + clp_home = get_clp_home() + default_config_file_path = clp_home / CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH - # Validate paths were specified using only one method - if len(paths_to_compress) > 0 and compression_path_list is not None: - args_parser.error("Paths cannot be specified on the command line AND through a file.") + args_parser = argparse.ArgumentParser(description="Compresses files from filesystem/s3") + input_type_args_parser = args_parser.add_subparsers(dest="input_type") + + fs_compressor_parser = input_type_args_parser.add_parser(InputType.FS) + add_common_arguments(fs_compressor_parser, default_config_file_path) + fs_compressor_parser.add_argument("paths", metavar="PATH", nargs="*", help="Paths to compress.") + fs_compressor_parser.add_argument( + "-f", "--path-list", dest="path_list", help="A file listing all paths to compress." + ) + + s3_compressor_parser = input_type_args_parser.add_parser(InputType.S3) + add_common_arguments(s3_compressor_parser, default_config_file_path) + s3_compressor_parser.add_argument("url", metavar="URL", help="URL of object to be compressed") + s3_compressor_parser.add_argument( + "--aws-access-key-id", type=str, default=None, help="AWS access key id." + ) + s3_compressor_parser.add_argument( + "--aws-secret-access-key", type=str, default=None, help="AWS secret access key." + ) + # args_parser.add_argument( + # "--aws-credentials-file", type=str, default=None, help="Access key id." + # ) + + parsed_args = args_parser.parse_args(argv[1:]) # Validate and load config file try: @@ -67,6 +132,23 @@ def main(argv): logger.exception("Failed to load config.") return -1 + input_type = parsed_args.input_type + if InputType.FS == input_type: + # Validate some input paths were specified + if len(parsed_args.paths) == 0 and parsed_args.path_list is None: + args_parser.error("No paths specified.") + + # Validate paths were specified using only one method + if len(parsed_args.paths) > 0 and parsed_args.path_list is not None: + args_parser.error("Paths cannot be specified on the command line AND through a file.") + + elif InputType.S3 == input_type: + if StorageEngine.CLP_S != clp_config.package.storage_engine: + raise ValueError(f"input type {InputType.S3} is only supported with storage engine {StorageEngine.CLP_S}") + + else: + raise ValueError(f"Unsupported input type: {input_type}.") + container_name = generate_container_name(str(JobType.COMPRESSION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) @@ -83,6 +165,7 @@ def main(argv): compress_cmd = [ "python3", "-m", "clp_package_utils.scripts.native.compress", + input_type, "--config", str(generated_config_path_on_container) ] # fmt: on @@ -92,8 +175,12 @@ def main(argv): if parsed_args.tags is not None: compress_cmd.append("--tags") compress_cmd.append(parsed_args.tags) + if parsed_args.no_progress_reporting is True: + compress_cmd.append("--no-progress-reporting") + + append_input_specific_args(compress_cmd, parsed_args) - # Write paths to compress to a file + # Write targets to compress to a file while True: # Get unused output path container_path_list_filename = f"{uuid.uuid4()}.txt" @@ -101,19 +188,13 @@ def main(argv): if not container_path_list_path.exists(): break 
- with open(container_path_list_path, "w") as container_path_list_file: - if compression_path_list is not None: - with open(parsed_args.path_list, "r") as path_list_file: - for line in path_list_file: - resolved_path = pathlib.Path(line.rstrip()).resolve() - container_path_list_file.write(f"{resolved_path}\n") - - for path in paths_to_compress: - resolved_path = pathlib.Path(path).resolve() - container_path_list_file.write(f"{resolved_path}\n") - + generate_targets_list( + input_type, + container_path_list_path, + parsed_args + ) compress_cmd.append("--path-list") - compress_cmd.append(container_clp_config.logs_directory / container_path_list_filename) + compress_cmd.append(str(container_clp_config.logs_directory / container_path_list_filename)) cmd = container_start_cmd + compress_cmd subprocess.run(cmd, check=True) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py index 5592b23a4..5f3fce3e0 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py @@ -4,12 +4,15 @@ import pathlib import sys import time +import typing from contextlib import closing +from typing import List import brotli import msgpack from clp_py_utils.clp_config import COMPRESSION_JOBS_TABLE_NAME from clp_py_utils.pretty_size import pretty_size +from clp_py_utils.s3_utils import parse_s3_url from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.scheduler.constants import ( CompressionJobCompletionStatus, @@ -18,7 +21,9 @@ from job_orchestration.scheduler.job_config import ( ClpIoConfig, FsInputConfig, + S3InputConfig, OutputConfig, + InputType ) from clp_package_utils.general import ( @@ -122,11 +127,77 @@ def handle_job(sql_adapter: SQL_Adapter, clp_io_config: ClpIoConfig, no_progress return CompressionJobCompletionStatus.SUCCEEDED -def main(argv): - clp_home = get_clp_home() - default_config_file_path = clp_home / CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH +def generate_clp_io_config( + targets_to_compress: List[str], + parsed_args: argparse.Namespace +) -> typing.Union[S3InputConfig, FsInputConfig]: + input_type = parsed_args.input_type + + if InputType.FS == input_type: + return FsInputConfig( + paths_to_compress=targets_to_compress, + timestamp_key=parsed_args.timestamp_key, + path_prefix_to_remove=str(CONTAINER_INPUT_LOGS_ROOT_DIR), + ) + elif InputType.S3 == input_type: + if len(targets_to_compress) != 1: + logger.error(f"Unexpected number of targets: {targets_to_compress}") + exit(-1) + s3_url = targets_to_compress[0] + region_code, bucket_name, key_prefix = parse_s3_url(s3_url) + return S3InputConfig( + region_code=region_code, + bucket=bucket_name, + key_prefix=key_prefix, + aws_access_key_id=parsed_args.aws_access_key_id, + aws_secret_access_key=parsed_args.aws_secret_access_key, + timestamp_key=parsed_args.timestamp_key, + ) + else: + raise ValueError(f"Unsupported input type: {input_type}") + + +def get_targets_to_compress( + compress_path_list_path: pathlib.Path, + input_type: InputType +) -> List[str]: + # Define the path processing function based on the input type + process_path_func: typing.Callable[[str], str] + + def process_fs_path(path_str: str) -> str: + stripped_path = pathlib.Path(path_str) + container_file_path = CONTAINER_INPUT_LOGS_ROOT_DIR / pathlib.Path( + stripped_path + ).relative_to(stripped_path.anchor) + return str(container_file_path.resolve()) + + def 
process_s3_path(path_str: str) -> str: + return path_str + + if input_type == InputType.FS: + process_path_func = process_fs_path + elif input_type == InputType.S3: + process_path_func = process_s3_path + else: + raise ValueError(f"Unsupported input type: {input_type}") + + targets_to_compress = [] + # Read targets from the input file + with open(compress_path_list_path, "r") as f: + for path in f: + stripped_path_str = path.strip() + if "" == stripped_path_str: + # Skip empty paths + continue + targets_to_compress.append(process_path_func(stripped_path_str)) + + return targets_to_compress - args_parser = argparse.ArgumentParser(description="Compresses log files.") + +def add_common_arguments( + args_parser: argparse.ArgumentParser, + default_config_file_path: pathlib.Path +) -> None: args_parser.add_argument( "--config", "-c", @@ -150,9 +221,29 @@ def main(argv): args_parser.add_argument( "-t", "--tags", help="A comma-separated list of tags to apply to the compressed archives." ) + + +def main(argv): + clp_home = get_clp_home() + default_config_file_path = clp_home / CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH + args_parser = argparse.ArgumentParser(description="Compresses files from filesystem/s3") + input_type_args_parser = args_parser.add_subparsers(dest="input_type") + + fs_compressor_parser = input_type_args_parser.add_parser(InputType.FS) + add_common_arguments(fs_compressor_parser, default_config_file_path) + + s3_compressor_parser = input_type_args_parser.add_parser(InputType.S3) + add_common_arguments(s3_compressor_parser, default_config_file_path) + s3_compressor_parser.add_argument( + "--aws-access-key-id", type=str, default=None, help="AWS access key id." + ) + s3_compressor_parser.add_argument( + "--aws-secret-access-key", type=str, default=None, help="AWS secret access key." 
+ ) + parsed_args = args_parser.parse_args(argv[1:]) - compress_path_list_arg = parsed_args.path_list + input_type = parsed_args.input_type # Validate and load config file try: config_file_path = pathlib.Path(parsed_args.config) @@ -166,28 +257,12 @@ def main(argv): comp_jobs_dir = clp_config.logs_directory / "comp-jobs" comp_jobs_dir.mkdir(parents=True, exist_ok=True) - paths_to_compress = [] - # Read paths from the input file - compress_path_list_path = pathlib.Path(compress_path_list_arg).resolve() - with open(compress_path_list_path, "r") as f: - for path in f: - stripped_path_str = path.strip() - if "" == stripped_path_str: - # Skip empty paths - continue - stripped_path = pathlib.Path(stripped_path_str) - container_file_path = CONTAINER_INPUT_LOGS_ROOT_DIR / pathlib.Path( - stripped_path - ).relative_to(stripped_path.anchor) - resolved_path_str = str(container_file_path.resolve()) - paths_to_compress.append(resolved_path_str) - - mysql_adapter = SQL_Adapter(clp_config.database) - clp_input_config = FsInputConfig( - paths_to_compress=paths_to_compress, - timestamp_key=parsed_args.timestamp_key, - path_prefix_to_remove=str(CONTAINER_INPUT_LOGS_ROOT_DIR), + targets_to_compress = get_targets_to_compress( + pathlib.Path(parsed_args.path_list).resolve(), + input_type ) + + clp_input_config = generate_clp_io_config(targets_to_compress, parsed_args) clp_output_config = OutputConfig.parse_obj(clp_config.archive_output) if parsed_args.tags: tag_list = [tag.strip().lower() for tag in parsed_args.tags.split(",") if tag] @@ -195,6 +270,7 @@ def main(argv): clp_output_config.tags = tag_list clp_io_config = ClpIoConfig(input=clp_input_config, output=clp_output_config) + mysql_adapter = SQL_Adapter(clp_config.database) return handle_job( sql_adapter=mysql_adapter, clp_io_config=clp_io_config, From 6833ee9e2b4560c40c094184155f359020132564 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 2 Jan 2025 16:32:04 -0500 Subject: [PATCH 55/62] Linter fixes --- .../clp_package_utils/scripts/compress.py | 23 ++++++++----------- .../scripts/native/compress.py | 16 +++++-------- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/compress.py b/components/clp-package-utils/clp_package_utils/scripts/compress.py index dee4526ae..da8fe3c4f 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/compress.py @@ -6,6 +6,9 @@ import uuid from typing import List +from clp_py_utils.clp_config import StorageEngine +from job_orchestration.scheduler.job_config import InputType + from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, dump_container_config, @@ -17,8 +20,6 @@ load_config_file, validate_and_load_db_credentials_file, ) -from clp_py_utils.clp_config import StorageEngine -from job_orchestration.scheduler.job_config import InputType logger = logging.getLogger(__file__) @@ -49,10 +50,7 @@ def generate_targets_list( raise ValueError(f"Unsupported input type: {input_type}.") -def append_input_specific_args( - compress_cmd: List[str], - parsed_args: argparse.Namespace -) -> None: +def append_input_specific_args(compress_cmd: List[str], parsed_args: argparse.Namespace) -> None: input_type = parsed_args.input_type if InputType.FS == input_type: @@ -70,8 +68,7 @@ def append_input_specific_args( def add_common_arguments( - args_parser: argparse.ArgumentParser, - default_config_file_path: pathlib.Path + 
args_parser: argparse.ArgumentParser, default_config_file_path: pathlib.Path ) -> None: args_parser.add_argument( "--config", @@ -144,7 +141,9 @@ def main(argv): elif InputType.S3 == input_type: if StorageEngine.CLP_S != clp_config.package.storage_engine: - raise ValueError(f"input type {InputType.S3} is only supported with storage engine {StorageEngine.CLP_S}") + raise ValueError( + f"input type {InputType.S3} is only supported with storage engine {StorageEngine.CLP_S}" + ) else: raise ValueError(f"Unsupported input type: {input_type}.") @@ -188,11 +187,7 @@ def main(argv): if not container_path_list_path.exists(): break - generate_targets_list( - input_type, - container_path_list_path, - parsed_args - ) + generate_targets_list(input_type, container_path_list_path, parsed_args) compress_cmd.append("--path-list") compress_cmd.append(str(container_clp_config.logs_directory / container_path_list_filename)) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py index 5f3fce3e0..9dc661fbb 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py @@ -21,9 +21,9 @@ from job_orchestration.scheduler.job_config import ( ClpIoConfig, FsInputConfig, - S3InputConfig, + InputType, OutputConfig, - InputType + S3InputConfig, ) from clp_package_utils.general import ( @@ -128,8 +128,7 @@ def handle_job(sql_adapter: SQL_Adapter, clp_io_config: ClpIoConfig, no_progress def generate_clp_io_config( - targets_to_compress: List[str], - parsed_args: argparse.Namespace + targets_to_compress: List[str], parsed_args: argparse.Namespace ) -> typing.Union[S3InputConfig, FsInputConfig]: input_type = parsed_args.input_type @@ -158,8 +157,7 @@ def generate_clp_io_config( def get_targets_to_compress( - compress_path_list_path: pathlib.Path, - input_type: InputType + compress_path_list_path: pathlib.Path, input_type: InputType ) -> List[str]: # Define the path processing function based on the input type process_path_func: typing.Callable[[str], str] @@ -195,8 +193,7 @@ def process_s3_path(path_str: str) -> str: def add_common_arguments( - args_parser: argparse.ArgumentParser, - default_config_file_path: pathlib.Path + args_parser: argparse.ArgumentParser, default_config_file_path: pathlib.Path ) -> None: args_parser.add_argument( "--config", @@ -258,8 +255,7 @@ def main(argv): comp_jobs_dir.mkdir(parents=True, exist_ok=True) targets_to_compress = get_targets_to_compress( - pathlib.Path(parsed_args.path_list).resolve(), - input_type + pathlib.Path(parsed_args.path_list).resolve(), input_type ) clp_input_config = generate_clp_io_config(targets_to_compress, parsed_args) From a4e92ae0d67a284ecb95d763402bcce6b41bd91a Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Thu, 2 Jan 2025 17:15:08 -0500 Subject: [PATCH 56/62] add argument checks --- .../clp_package_utils/scripts/compress.py | 79 +++++++++++++------ .../clp-py-utils/clp_py_utils/s3_utils.py | 23 ++++++ 2 files changed, 79 insertions(+), 23 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/compress.py b/components/clp-package-utils/clp_package_utils/scripts/compress.py index da8fe3c4f..80e50f845 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/compress.py @@ -6,7 +6,8 @@ import uuid from typing 
import List -from clp_py_utils.clp_config import StorageEngine +from clp_py_utils.clp_config import CLPConfig, StorageEngine +from clp_py_utils.s3_utils import parse_aws_credentials_file from job_orchestration.scheduler.job_config import InputType from clp_package_utils.general import ( @@ -56,13 +57,17 @@ def append_input_specific_args(compress_cmd: List[str], parsed_args: argparse.Na if InputType.FS == input_type: return elif InputType.S3 == input_type: - # TODO: also think about credentials from file. - if parsed_args.aws_access_key_id is not None: + aws_access_key_id = parsed_args.aws_access_key_id + aws_secret_access_key = parsed_args.aws_secret_access_key + if parsed_args.aws_credentials_file: + aws_access_key_id, aws_secret_access_key = parse_aws_credentials_file( + pathlib.Path(parsed_args.aws_credentials_file) + ) + if aws_access_key_id and aws_secret_access_key: compress_cmd.append("--aws-access-key-id") - compress_cmd.append(parsed_args.aws_access_key_id) - if parsed_args.aws_secret_access_key is not None: + compress_cmd.append(aws_access_key_id) compress_cmd.append("--aws-secret-access-key") - compress_cmd.append(parsed_args.aws_secret_access_key) + compress_cmd.append(aws_secret_access_key) else: raise ValueError(f"Unsupported input type: {input_type}.") @@ -88,6 +93,46 @@ def add_common_arguments( ) +def validate_fs_input_args( + parsed_args: argparse.Namespace, + args_parser: argparse.ArgumentParser, +) -> None: + # Validate some input paths were specified + if len(parsed_args.paths) == 0 and parsed_args.path_list is None: + args_parser.error("No paths specified.") + + # Validate paths were specified using only one method + if len(parsed_args.paths) > 0 and parsed_args.path_list is not None: + args_parser.error("Paths cannot be specified on the command line AND through a file.") + + +def validate_s3_input_args( + parsed_args: argparse.Namespace, args_parser: argparse.ArgumentParser, clp_config: CLPConfig +) -> None: + if StorageEngine.CLP_S != clp_config.package.storage_engine: + raise ValueError( + f"input type {InputType.S3} is only supported with storage engine {StorageEngine.CLP_S}" + ) + + # Validate aws credentials were specified using only one method + aws_credential_file = parsed_args.aws_credentials_file + aws_access_key_id = parsed_args.aws_access_key_id + aws_secret_access_key = parsed_args.aws_secret_access_key + if aws_credential_file is not None: + if not pathlib.Path(aws_credential_file).exists(): + raise ValueError(f"{aws_credential_file} doesn't exist.") + + if aws_access_key_id is not None or aws_secret_access_key is not None: + args_parser.error( + "aws_credentials_file can not be specified together with aws_access_key_id or aws_secret_access_key." + ) + + elif bool(aws_access_key_id) != bool(aws_secret_access_key): + args_parser.error( + "aws_access_key_id and aws_secret_access_key must be both set or left unset." + ) + + def main(argv): clp_home = get_clp_home() default_config_file_path = clp_home / CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH @@ -111,10 +156,9 @@ def main(argv): s3_compressor_parser.add_argument( "--aws-secret-access-key", type=str, default=None, help="AWS secret access key." ) - # args_parser.add_argument( - # "--aws-credentials-file", type=str, default=None, help="Access key id." - # ) - + s3_compressor_parser.add_argument( + "--aws-credentials-file", type=str, default=None, help="Path to AWS credentials file." 
+ ) parsed_args = args_parser.parse_args(argv[1:]) # Validate and load config file @@ -131,20 +175,9 @@ def main(argv): input_type = parsed_args.input_type if InputType.FS == input_type: - # Validate some input paths were specified - if len(parsed_args.paths) == 0 and parsed_args.path_list is None: - args_parser.error("No paths specified.") - - # Validate paths were specified using only one method - if len(parsed_args.paths) > 0 and parsed_args.path_list is not None: - args_parser.error("Paths cannot be specified on the command line AND through a file.") - + validate_fs_input_args(parsed_args, args_parser) elif InputType.S3 == input_type: - if StorageEngine.CLP_S != clp_config.package.storage_engine: - raise ValueError( - f"input type {InputType.S3} is only supported with storage engine {StorageEngine.CLP_S}" - ) - + validate_s3_input_args(parsed_args, args_parser, clp_config) else: raise ValueError(f"Unsupported input type: {input_type}.") diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index ebc632cab..271150df0 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -15,6 +15,29 @@ AWS_ENDPOINT = "amazonaws.com" +def parse_aws_credentials_file(credentials_file_path: Path) -> Tuple[str, str]: + aws_access_key_id = None + aws_secret_access_key = None + + if not credentials_file_path.exists(): + raise ValueError(f"File {credentials_file_path} doesn't exist.") + + with open(credentials_file_path, "r") as f: + for line in f: + line = line.strip() + if line.startswith("aws_access_key_id"): + aws_access_key_id = line.split("=", 1)[1].strip() + elif line.startswith("aws_secret_access_key"): + aws_secret_access_key = line.split("=", 1)[1].strip() + + if aws_access_key_id is None or aws_secret_access_key is None: + raise ValueError( + "The credentials file must contain both 'aws_access_key_id' and 'aws_secret_access_key'." + ) + + return aws_access_key_id, aws_secret_access_key + + def parse_s3_url(s3_url: str) -> Tuple[str, str, str]: host_style_url_regex = re.compile( r"https://(?P[a-z0-9.-]+)\.s3(\.(?P[a-z0-9-]+))?" 
From a638f2d91019e774d4351d240710ab13a7de586b Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 3 Jan 2025 14:00:46 -0500 Subject: [PATCH 57/62] Polishing --- .../clp_package_utils/scripts/compress.py | 95 +++++++++++-------- .../scripts/native/compress.py | 34 +++---- 2 files changed, 70 insertions(+), 59 deletions(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/compress.py b/components/clp-package-utils/clp_package_utils/scripts/compress.py index 80e50f845..7c0ec5220 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/compress.py @@ -25,11 +25,12 @@ logger = logging.getLogger(__file__) -def generate_targets_list( - input_type: InputType, +def _generate_targets_list( container_targets_list_path: pathlib.Path, parsed_args: argparse.Namespace, ) -> None: + input_type = parsed_args.input_type + if InputType.FS == input_type: compression_targets_list_file = parsed_args.path_list with open(container_targets_list_path, "w") as container_targets_list_file: @@ -51,11 +52,30 @@ def generate_targets_list( raise ValueError(f"Unsupported input type: {input_type}.") -def append_input_specific_args(compress_cmd: List[str], parsed_args: argparse.Namespace) -> None: +def _generate_compress_cmd( + parsed_args: argparse.Namespace, config_path: pathlib.Path, target_list_path: pathlib.Path +) -> List[str]: input_type = parsed_args.input_type + # fmt: off + compress_cmd = [ + "python3", + "-m", "clp_package_utils.scripts.native.compress", + input_type, + "--config", str(config_path) + ] + # fmt: on + if parsed_args.timestamp_key is not None: + compress_cmd.append("--timestamp-key") + compress_cmd.append(parsed_args.timestamp_key) + if parsed_args.tags is not None: + compress_cmd.append("--tags") + compress_cmd.append(parsed_args.tags) + if parsed_args.no_progress_reporting is True: + compress_cmd.append("--no-progress-reporting") + if InputType.FS == input_type: - return + pass elif InputType.S3 == input_type: aws_access_key_id = parsed_args.aws_access_key_id aws_secret_access_key = parsed_args.aws_secret_access_key @@ -71,8 +91,13 @@ def append_input_specific_args(compress_cmd: List[str], parsed_args: argparse.Na else: raise ValueError(f"Unsupported input type: {input_type}.") + compress_cmd.append("--target-list") + compress_cmd.append(str(target_list_path)) -def add_common_arguments( + return compress_cmd + + +def _add_common_arguments( args_parser: argparse.ArgumentParser, default_config_file_path: pathlib.Path ) -> None: args_parser.add_argument( @@ -93,7 +118,7 @@ def add_common_arguments( ) -def validate_fs_input_args( +def _validate_fs_input_args( parsed_args: argparse.Namespace, args_parser: argparse.ArgumentParser, ) -> None: @@ -106,12 +131,12 @@ def validate_fs_input_args( args_parser.error("Paths cannot be specified on the command line AND through a file.") -def validate_s3_input_args( +def _validate_s3_input_args( parsed_args: argparse.Namespace, args_parser: argparse.ArgumentParser, clp_config: CLPConfig ) -> None: if StorageEngine.CLP_S != clp_config.package.storage_engine: raise ValueError( - f"input type {InputType.S3} is only supported with storage engine {StorageEngine.CLP_S}" + f"input type {InputType.S3} is only supported for the storage engine {StorageEngine.CLP_S}." 
) # Validate aws credentials were specified using only one method @@ -120,7 +145,7 @@ def validate_s3_input_args( aws_secret_access_key = parsed_args.aws_secret_access_key if aws_credential_file is not None: if not pathlib.Path(aws_credential_file).exists(): - raise ValueError(f"{aws_credential_file} doesn't exist.") + raise ValueError(f"credentials file {aws_credential_file} doesn't exist.") if aws_access_key_id is not None or aws_secret_access_key is not None: args_parser.error( @@ -129,7 +154,7 @@ def validate_s3_input_args( elif bool(aws_access_key_id) != bool(aws_secret_access_key): args_parser.error( - "aws_access_key_id and aws_secret_access_key must be both set or left unset." + "aws_access_key_id and aws_secret_access_key must be both specified or left unspecified." ) @@ -141,14 +166,14 @@ def main(argv): input_type_args_parser = args_parser.add_subparsers(dest="input_type") fs_compressor_parser = input_type_args_parser.add_parser(InputType.FS) - add_common_arguments(fs_compressor_parser, default_config_file_path) + _add_common_arguments(fs_compressor_parser, default_config_file_path) fs_compressor_parser.add_argument("paths", metavar="PATH", nargs="*", help="Paths to compress.") fs_compressor_parser.add_argument( "-f", "--path-list", dest="path_list", help="A file listing all paths to compress." ) s3_compressor_parser = input_type_args_parser.add_parser(InputType.S3) - add_common_arguments(s3_compressor_parser, default_config_file_path) + _add_common_arguments(s3_compressor_parser, default_config_file_path) s3_compressor_parser.add_argument("url", metavar="URL", help="URL of object to be compressed") s3_compressor_parser.add_argument( "--aws-access-key-id", type=str, default=None, help="AWS access key id." @@ -159,6 +184,7 @@ def main(argv): s3_compressor_parser.add_argument( "--aws-credentials-file", type=str, default=None, help="Path to AWS credentials file." 
) + parsed_args = args_parser.parse_args(argv[1:]) # Validate and load config file @@ -175,9 +201,9 @@ def main(argv): input_type = parsed_args.input_type if InputType.FS == input_type: - validate_fs_input_args(parsed_args, args_parser) + _validate_fs_input_args(parsed_args, args_parser) elif InputType.S3 == input_type: - validate_s3_input_args(parsed_args, args_parser, clp_config) + _validate_s3_input_args(parsed_args, args_parser, clp_config) else: raise ValueError(f"Unsupported input type: {input_type}.") @@ -189,41 +215,26 @@ def main(argv): ) necessary_mounts = [mounts.clp_home, mounts.input_logs_dir, mounts.data_dir, mounts.logs_dir] - container_start_cmd = generate_container_start_cmd( - container_name, necessary_mounts, clp_config.execution_container - ) - - # fmt: off - compress_cmd = [ - "python3", - "-m", "clp_package_utils.scripts.native.compress", - input_type, - "--config", str(generated_config_path_on_container) - ] - # fmt: on - if parsed_args.timestamp_key is not None: - compress_cmd.append("--timestamp-key") - compress_cmd.append(parsed_args.timestamp_key) - if parsed_args.tags is not None: - compress_cmd.append("--tags") - compress_cmd.append(parsed_args.tags) - if parsed_args.no_progress_reporting is True: - compress_cmd.append("--no-progress-reporting") - - append_input_specific_args(compress_cmd, parsed_args) # Write targets to compress to a file while True: # Get unused output path - container_path_list_filename = f"{uuid.uuid4()}.txt" - container_path_list_path = clp_config.logs_directory / container_path_list_filename - if not container_path_list_path.exists(): + container_target_list_filename = f"{uuid.uuid4()}.txt" + container_target_list_path = clp_config.logs_directory / container_target_list_filename + path_list_path_on_container = ( + container_clp_config.logs_directory / container_target_list_filename + ) + if not container_target_list_path.exists(): break - generate_targets_list(input_type, container_path_list_path, parsed_args) - compress_cmd.append("--path-list") - compress_cmd.append(str(container_clp_config.logs_directory / container_path_list_filename)) + _generate_targets_list(container_target_list_path, parsed_args) + container_start_cmd = generate_container_start_cmd( + container_name, necessary_mounts, clp_config.execution_container + ) + compress_cmd = _generate_compress_cmd( + parsed_args, generated_config_path_on_container, path_list_path_on_container + ) cmd = container_start_cmd + compress_cmd subprocess.run(cmd, check=True) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py index 9dc661fbb..aa143402e 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py @@ -127,7 +127,7 @@ def handle_job(sql_adapter: SQL_Adapter, clp_io_config: ClpIoConfig, no_progress return CompressionJobCompletionStatus.SUCCEEDED -def generate_clp_io_config( +def _generate_clp_io_config( targets_to_compress: List[str], parsed_args: argparse.Namespace ) -> typing.Union[S3InputConfig, FsInputConfig]: input_type = parsed_args.input_type @@ -140,8 +140,8 @@ def generate_clp_io_config( ) elif InputType.S3 == input_type: if len(targets_to_compress) != 1: - logger.error(f"Unexpected number of targets: {targets_to_compress}") - exit(-1) + ValueError(f"Unexpected number of targets: {targets_to_compress}") + s3_url = targets_to_compress[0] region_code, 
bucket_name, key_prefix = parse_s3_url(s3_url) return S3InputConfig( @@ -156,26 +156,26 @@ def generate_clp_io_config( raise ValueError(f"Unsupported input type: {input_type}") -def get_targets_to_compress( +def _get_targets_to_compress( compress_path_list_path: pathlib.Path, input_type: InputType ) -> List[str]: # Define the path processing function based on the input type process_path_func: typing.Callable[[str], str] - def process_fs_path(path_str: str) -> str: + def _process_fs_path(path_str: str) -> str: stripped_path = pathlib.Path(path_str) container_file_path = CONTAINER_INPUT_LOGS_ROOT_DIR / pathlib.Path( stripped_path ).relative_to(stripped_path.anchor) return str(container_file_path.resolve()) - def process_s3_path(path_str: str) -> str: + def _process_s3_path(path_str: str) -> str: return path_str if input_type == InputType.FS: - process_path_func = process_fs_path + process_path_func = _process_fs_path elif input_type == InputType.S3: - process_path_func = process_s3_path + process_path_func = _process_s3_path else: raise ValueError(f"Unsupported input type: {input_type}") @@ -192,7 +192,7 @@ def process_s3_path(path_str: str) -> str: return targets_to_compress -def add_common_arguments( +def _add_common_arguments( args_parser: argparse.ArgumentParser, default_config_file_path: pathlib.Path ) -> None: args_parser.add_argument( @@ -203,9 +203,9 @@ def add_common_arguments( ) args_parser.add_argument( "-f", - "--path-list", - dest="path_list", - help="A file listing all paths to compress.", + "--target-list", + dest="target_list", + help="A file listing all targets to compress.", required=True, ) args_parser.add_argument( @@ -227,10 +227,10 @@ def main(argv): input_type_args_parser = args_parser.add_subparsers(dest="input_type") fs_compressor_parser = input_type_args_parser.add_parser(InputType.FS) - add_common_arguments(fs_compressor_parser, default_config_file_path) + _add_common_arguments(fs_compressor_parser, default_config_file_path) s3_compressor_parser = input_type_args_parser.add_parser(InputType.S3) - add_common_arguments(s3_compressor_parser, default_config_file_path) + _add_common_arguments(s3_compressor_parser, default_config_file_path) s3_compressor_parser.add_argument( "--aws-access-key-id", type=str, default=None, help="AWS access key id." 
) @@ -254,11 +254,11 @@ def main(argv): comp_jobs_dir = clp_config.logs_directory / "comp-jobs" comp_jobs_dir.mkdir(parents=True, exist_ok=True) - targets_to_compress = get_targets_to_compress( - pathlib.Path(parsed_args.path_list).resolve(), input_type + targets_to_compress = _get_targets_to_compress( + pathlib.Path(parsed_args.target_list).resolve(), input_type ) - clp_input_config = generate_clp_io_config(targets_to_compress, parsed_args) + clp_input_config = _generate_clp_io_config(targets_to_compress, parsed_args) clp_output_config = OutputConfig.parse_obj(clp_config.archive_output) if parsed_args.tags: tag_list = [tag.strip().lower() for tag in parsed_args.tags.split(",") if tag] From 5685224b269e4d75c6bd29d436cc3f06d7b9df61 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 3 Jan 2025 15:08:46 -0500 Subject: [PATCH 58/62] Add some docstrings --- .../clp-py-utils/clp_py_utils/s3_utils.py | 24 +++++++++++- .../executor/compress/fs_compression_task.py | 37 +++++++++++++----- .../compress/compression_scheduler.py | 39 +++++++++++++------ 3 files changed, 79 insertions(+), 21 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index 271150df0..f2d7de605 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -16,6 +16,13 @@ def parse_aws_credentials_file(credentials_file_path: Path) -> Tuple[str, str]: + """ + Parses the `aws_access_key_id` and `aws_secret_access_key` from the given credentials_file_path. + :param credentials_file_path: path to the file containing aws credentials. + :return: A tuple of (aws_access_key_id, aws_secret_access_key) + :raise: ValueError if the file doesn't exist, or doesn't contain the aws credentials. + """ + aws_access_key_id = None aws_secret_access_key = None @@ -39,6 +46,14 @@ def parse_aws_credentials_file(credentials_file_path: Path) -> Tuple[str, str]: def parse_s3_url(s3_url: str) -> Tuple[str, str, str]: + """ + Parses the region_code, bucket and key_prefix from the given s3 url. The url must be either a + host_style_url or path_style_url. + :param s3_url: a host_style_url or path_style_url. + :return: A tuple of (region_code, bucket, key_prefix) + :raise: ValueError if the given s3_url is not a valid host_style_url or path_style_url. + """ + host_style_url_regex = re.compile( r"https://(?P[a-z0-9.-]+)\.s3(\.(?P[a-z0-9-]+))?" r"\.(?P[a-z0-9.-]+)/(?P[^?]+).*" @@ -72,7 +87,14 @@ def generate_s3_virtual_hosted_style_url( return f"https://{bucket_name}.s3.{region_code}.{AWS_ENDPOINT}/{object_key}" -def get_s3_file_metadata(s3_input_config: S3InputConfig) -> Result[List[FileMetadata], str]: +def get_s3_object_metadata(s3_input_config: S3InputConfig) -> Result[List[FileMetadata], str]: + """ + Gets the metadata of objects under the / specified by s3_input_config. + :param s3_input_config: S3 configuration specifying the bucket, key_prefix and credentials. + :return: Result.OK(List[FileMetadata]) containing the object metadata on success, + otherwise Result.Err(str) with the error message. 
+ """ + file_metadata_list: List[FileMetadata] = list() s3_client = boto3.client( diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index 6cb076ee2..b35bffe86 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -82,7 +82,7 @@ def update_job_metadata_and_tags(db_cursor, job_id, table_prefix, tag_ids, archi ) -def generate_fs_log_path_list_file( +def _generate_fs_targets_file( output_file_path: pathlib.Path, paths_to_compress: PathsToCompress, ) -> None: @@ -100,7 +100,7 @@ def generate_fs_log_path_list_file( file.write("\n") -def generate_s3_log_url_list_file( +def _generate_s3_targets_file( output_file_path: pathlib.Path, paths_to_compress: PathsToCompress, s3_input_config: S3InputConfig, @@ -117,12 +117,21 @@ def generate_s3_log_url_list_file( file.write("\n") -def make_clp_command( +def make_clp_command_and_env( clp_home: pathlib.Path, archive_output_dir: pathlib.Path, clp_config: ClpIoConfig, db_config_file_path: pathlib.Path, ) -> Tuple[List[str], Optional[Dict[str, str]]]: + """ + Generates the command and environment for a clp compression job + :param clp_home: + :param archive_output_dir: + :param clp_config: + :param db_config_file_path: + :return: Tuple of (compression_command, compression_env) + """ + path_prefix_to_remove = clp_config.input.path_prefix_to_remove # fmt: off @@ -149,13 +158,23 @@ def make_clp_command( return compression_cmd, None -def make_clp_s_command( +def make_clp_s_command_and_env( clp_home: pathlib.Path, archive_output_dir: pathlib.Path, clp_config: ClpIoConfig, db_config_file_path: pathlib.Path, enable_s3_write: bool, ) -> Tuple[List[str], Optional[Dict[str, str]]]: + """ + Generates the command and environment for a clp_s compression job + :param clp_home: + :param archive_output_dir: + :param clp_config: + :param db_config_file_path: + :param enable_s3_write: Whether to write output to S3 storage. 
+ :return: Tuple of (compression_command, compression_env) + """ + # fmt: off compression_cmd = [ str(clp_home / "bin" / "clp-s"), @@ -241,14 +260,14 @@ def run_clp( enable_s3_write = True if StorageEngine.CLP == clp_storage_engine: - compression_cmd, compression_env = make_clp_command( + compression_cmd, compression_env = make_clp_command_and_env( clp_home=clp_home, archive_output_dir=archive_output_dir, clp_config=clp_config, db_config_file_path=db_config_file_path, ) elif StorageEngine.CLP_S == clp_storage_engine: - compression_cmd, compression_env = make_clp_s_command( + compression_cmd, compression_env = make_clp_s_command_and_env( clp_home=clp_home, archive_output_dir=archive_output_dir, clp_config=clp_config, @@ -259,13 +278,13 @@ def run_clp( logger.error(f"Unsupported storage engine {clp_storage_engine}") return False, {"error_message": f"Unsupported storage engine {clp_storage_engine}"} - # Prepare list of targets to compress + # generate list of targets to compress input_type = clp_config.input.type targets_list_path = data_dir / f"{instance_id_str}-log-paths.txt" if InputType.FS == input_type: - generate_fs_log_path_list_file(targets_list_path, paths_to_compress) + _generate_fs_targets_file(targets_list_path, paths_to_compress) elif InputType.S3 == input_type: - generate_s3_log_url_list_file(targets_list_path, paths_to_compress, clp_config.input) + _generate_s3_targets_file(targets_list_path, paths_to_compress, clp_config.input) else: error_msg = f"Unsupported input type: {input_type}." logger.error(error_msg) diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index e57049aad..e20789529 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -20,7 +20,7 @@ from clp_py_utils.clp_logging import get_logger, get_logging_formatter, set_logging_level from clp_py_utils.compression import validate_path_and_get_info from clp_py_utils.core import read_yaml_config_file -from clp_py_utils.s3_utils import get_s3_file_metadata +from clp_py_utils.s3_utils import get_s3_object_metadata from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.fs_compression_task import compress from job_orchestration.scheduler.compress.partition import PathsToCompressBuffer @@ -80,9 +80,18 @@ def update_compression_job_metadata(db_cursor, job_id, kv): db_cursor.execute(query, values) -def process_fs_input_paths( +def _process_fs_input_paths( fs_input_conf: FsInputConfig, paths_to_compress_buffer: PathsToCompressBuffer -): +) -> None: + """ + Iterate through all files in fs_input_conf and adds their metadata to the + paths_to_compress_buffer. + Note: this method skips any files that do not exist. + :param fs_input_conf: FS configuration specifying the files to compress. + :param paths_to_compress_buffer: PathsToCompressBuffer containing the scheduling information + :return: None. 
+ """ + for path_idx, path in enumerate(fs_input_conf.paths_to_compress, start=1): path = Path(path) @@ -113,19 +122,27 @@ def process_fs_input_paths( paths_to_compress_buffer.add_empty_directory(empty_directory) -def process_s3_input( - input_config: S3InputConfig, +def _process_s3_input( + s3_input_config: S3InputConfig, paths_to_compress_buffer: PathsToCompressBuffer, ) -> Result[bool, str]: - res = get_s3_file_metadata(input_config) + """ + Iterate through all objects under the / specified by s3_input_config, + and adds their metadata to the paths_to_compress_buffer + :param s3_input_config: S3 configuration specifying the bucket, key_prefix and credentials. + :param paths_to_compress_buffer: PathsToCompressBuffer containing the scheduling information + :return: Result.OK(True) on success, or Result.Err(str) with the error message otherwise. + """ + + res = get_s3_object_metadata(s3_input_config) if res.is_err(): logger.error(f"Failed to process s3 input: {res.err_value}") return res - file_metadata_list = res.ok_value - for file_metadata in file_metadata_list: - paths_to_compress_buffer.add_file(file_metadata) + object_metadata_list = res.ok_value + for object_metadata in object_metadata_list: + paths_to_compress_buffer.add_file(object_metadata) return Ok(True) @@ -158,9 +175,9 @@ def search_and_schedule_new_tasks(db_conn, db_cursor, clp_metadata_db_connection input_config = clp_io_config.input input_type = input_config.type if input_type == "fs": - process_fs_input_paths(input_config, paths_to_compress_buffer) + _process_fs_input_paths(input_config, paths_to_compress_buffer) elif input_type == "s3": - res = process_s3_input(input_config, paths_to_compress_buffer) + res = _process_s3_input(input_config, paths_to_compress_buffer) if res.is_err(): update_compression_job_metadata( db_cursor, From fd9dba288541c668fab69c1e2284fa34c09d7cc2 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 3 Jan 2025 15:26:04 -0500 Subject: [PATCH 59/62] fixes --- .../scheduler/compress/compression_scheduler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index e20789529..69ca5dc07 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -31,7 +31,7 @@ CompressionTaskResult, ) from pydantic import ValidationError -from result import Ok, Result +from result import Ok, Err, Result # Setup logging logger = get_logger("compression_scheduler") @@ -141,6 +141,11 @@ def _process_s3_input( return res object_metadata_list = res.ok_value + if len(object_metadata_list) == 0: + error_msg = "Input url doesn't resolve to any object" + logger.error(error_msg) + return Err(error_msg) + for object_metadata in object_metadata_list: paths_to_compress_buffer.add_file(object_metadata) From f7a175c3297cc981784799de1e8318f77af98cc0 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 3 Jan 2025 15:35:26 -0500 Subject: [PATCH 60/62] Rename task script --- .../job_orchestration/executor/compress/celeryconfig.py | 4 ++-- .../compress/{fs_compression_task.py => compression_task.py} | 0 .../scheduler/compress/compression_scheduler.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) rename 
components/job-orchestration/job_orchestration/executor/compress/{fs_compression_task.py => compression_task.py} (100%) diff --git a/components/job-orchestration/job_orchestration/executor/compress/celeryconfig.py b/components/job-orchestration/job_orchestration/executor/compress/celeryconfig.py index 9399972a8..4c22582a0 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/celeryconfig.py +++ b/components/job-orchestration/job_orchestration/executor/compress/celeryconfig.py @@ -6,13 +6,13 @@ # Force workers to consume only one task at a time worker_prefetch_multiplier = 1 imports = [ - "job_orchestration.executor.compress.fs_compression_task", + "job_orchestration.executor.compress.compression_task", ] # Queue settings task_queue_max_priority = TASK_QUEUE_HIGHEST_PRIORITY task_routes = { - "job_orchestration.executor.compress.fs_compression_task.compress": QueueName.COMPRESSION, + "job_orchestration.executor.compress.compression_task.compress": QueueName.COMPRESSION, } task_create_missing_queues = True diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/compression_task.py similarity index 100% rename from components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py rename to components/job-orchestration/job_orchestration/executor/compress/compression_task.py diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index 69ca5dc07..c9cb5a9cf 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -22,7 +22,7 @@ from clp_py_utils.core import read_yaml_config_file from clp_py_utils.s3_utils import get_s3_object_metadata from clp_py_utils.sql_adapter import SQL_Adapter -from job_orchestration.executor.compress.fs_compression_task import compress +from job_orchestration.executor.compress.compression_task import compress from job_orchestration.scheduler.compress.partition import PathsToCompressBuffer from job_orchestration.scheduler.constants import CompressionJobStatus, CompressionTaskStatus from job_orchestration.scheduler.job_config import ClpIoConfig, FsInputConfig, S3InputConfig @@ -31,7 +31,7 @@ CompressionTaskResult, ) from pydantic import ValidationError -from result import Ok, Err, Result +from result import Err, Ok, Result # Setup logging logger = get_logger("compression_scheduler") From 20488a123fc773cdf6fd1347cfaf239c09201404 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 3 Jan 2025 15:45:51 -0500 Subject: [PATCH 61/62] fixes --- .../clp-package-utils/clp_package_utils/scripts/compress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/clp-package-utils/clp_package_utils/scripts/compress.py b/components/clp-package-utils/clp_package_utils/scripts/compress.py index 7c0ec5220..3b1ab3f3c 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/compress.py @@ -216,7 +216,7 @@ def main(argv): necessary_mounts = [mounts.clp_home, mounts.input_logs_dir, mounts.data_dir, mounts.logs_dir] - # Write targets to compress to a file + # Write compression targets to a file while True: # Get 
unused output path container_target_list_filename = f"{uuid.uuid4()}.txt" From 12d6b971cab9ba2f59443dfe8ffad69996ee4920 Mon Sep 17 00:00:00 2001 From: Haiqi Xu <14502009+haiqi96@users.noreply.github.com> Date: Fri, 3 Jan 2025 15:53:43 -0500 Subject: [PATCH 62/62] Some captilization and update to the docstrings --- components/clp-py-utils/clp_py_utils/s3_utils.py | 5 ++++- .../job_orchestration/executor/compress/compression_task.py | 4 ++-- .../scheduler/compress/compression_scheduler.py | 6 +++--- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py index f2d7de605..90b5ead09 100644 --- a/components/clp-py-utils/clp_py_utils/s3_utils.py +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -47,7 +47,7 @@ def parse_aws_credentials_file(credentials_file_path: Path) -> Tuple[str, str]: def parse_s3_url(s3_url: str) -> Tuple[str, str, str]: """ - Parses the region_code, bucket and key_prefix from the given s3 url. The url must be either a + Parses the region_code, bucket and key_prefix from the given S3 url. The url must be either a host_style_url or path_style_url. :param s3_url: a host_style_url or path_style_url. :return: A tuple of (region_code, bucket, key_prefix) @@ -90,6 +90,9 @@ def generate_s3_virtual_hosted_style_url( def get_s3_object_metadata(s3_input_config: S3InputConfig) -> Result[List[FileMetadata], str]: """ Gets the metadata of objects under the / specified by s3_input_config. + Note: We reuse FileMetadata class to store the metadata of S3 objects. The object_key is stored + as path in FileMetadata. + :param s3_input_config: S3 configuration specifying the bucket, key_prefix and credentials. :return: Result.OK(List[FileMetadata]) containing the object metadata on success, otherwise Result.Err(str) with the error message. diff --git a/components/job-orchestration/job_orchestration/executor/compress/compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/compression_task.py index b35bffe86..fdc541361 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/compression_task.py @@ -246,7 +246,7 @@ def run_clp( yaml.safe_dump(clp_metadata_db_connection_config, db_config_file) db_config_file.close() - # Get s3 config + # Get S3 config s3_config: S3Config enable_s3_write = False storage_type = worker_config.archive_output.storage.type @@ -310,7 +310,7 @@ def run_clp( total_uncompressed_size = 0 total_compressed_size = 0 - # Handle job metadata update and s3 write if enabled + # Handle job metadata update and S3 write if enabled s3_error = None while not last_line_decoded: stats: Optional[Dict[str, Any]] = None diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index c9cb5a9cf..8e69c3ce9 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -128,7 +128,7 @@ def _process_s3_input( ) -> Result[bool, str]: """ Iterate through all objects under the / specified by s3_input_config, - and adds their metadata to the paths_to_compress_buffer + and adds their metadata to the paths_to_compress_buffer. 
:param s3_input_config: S3 configuration specifying the bucket, key_prefix and credentials. :param paths_to_compress_buffer: PathsToCompressBuffer containing the scheduling information :return: Result.OK(True) on success, or Result.Err(str) with the error message otherwise. @@ -137,7 +137,7 @@ def _process_s3_input( res = get_s3_object_metadata(s3_input_config) if res.is_err(): - logger.error(f"Failed to process s3 input: {res.err_value}") + logger.error(f"Failed to process S3 input: {res.err_value}") return res object_metadata_list = res.ok_value @@ -189,7 +189,7 @@ def search_and_schedule_new_tasks(db_conn, db_cursor, clp_metadata_db_connection job_id, { "status": CompressionJobStatus.FAILED, - "status_msg": f"Scheduler Failed for s3 input: {res.err_value}", + "status_msg": f"Scheduler Failed for S3 input: {res.err_value}", }, ) db_conn.commit()
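
For reference, the sketch below ties together the S3 pieces added in this series: parse_s3_url() converts a virtual-hosted-style URL into the (region_code, bucket, key_prefix) tuple described in its docstring, and that tuple feeds an S3InputConfig the same way _generate_clp_io_config() builds one for the s3 subcommand. The bucket name, region, key prefix, credentials, and timestamp key are placeholders, and the snippet assumes the clp_py_utils and job_orchestration packages from this series are on the Python path; treat it as an illustration rather than part of the patches.

    from clp_py_utils.s3_utils import parse_s3_url
    from job_orchestration.scheduler.job_config import S3InputConfig

    # Hypothetical virtual-hosted-style URL; the bucket, region, and prefix are made up.
    s3_url = "https://example-log-bucket.s3.us-east-2.amazonaws.com/ingest/2024/"

    # Per the docstring added in this series, parse_s3_url() returns
    # (region_code, bucket, key_prefix) for host-style and path-style URLs,
    # and raises ValueError for anything else.
    region_code, bucket, key_prefix = parse_s3_url(s3_url)

    # Build the scheduler-facing input config the same way _generate_clp_io_config()
    # does when the native compress script is invoked with the s3 subcommand.
    s3_input_config = S3InputConfig(
        region_code=region_code,
        bucket=bucket,
        key_prefix=key_prefix,
        aws_access_key_id="EXAMPLEKEYID",          # placeholder credentials
        aws_secret_access_key="EXAMPLESECRETKEY",  # placeholder credentials
        timestamp_key="timestamp",                 # placeholder timestamp key
    )

From the package's compress entry point, the equivalent invocation would look something like "python3 -m clp_package_utils.scripts.compress s3 https://example-log-bucket.s3.us-east-2.amazonaws.com/ingest/2024/ --aws-credentials-file /path/to/credentials" (exact wrapper script aside), where the credentials file uses the aws_access_key_id / aws_secret_access_key key-value format that parse_aws_credentials_file() expects.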