From 6fa37bfc8f2d19d01f96a160fe9dbe03930fceb5 Mon Sep 17 00:00:00 2001
From: Colton Loftus <70598503+C-Loftus@users.noreply.github.com>
Date: Tue, 28 Jan 2025 12:11:58 -0500
Subject: [PATCH] fix rclone bin

---
Review notes (text between "---" and the first "diff --git" is not part of
the commit message):
- The line structure of this patch was restored from a whitespace-collapsed
  copy. Indentation and blank lines of context lines are a best guess; use
  `git apply --ignore-whitespace` if a hunk is rejected.
- Compared with commit 6fa37bfc, the code added to userCode/main.py was
  revised (download timeout, temporary work directory, verification of the
  installed binary by path, PATH lookup that tolerates an unset PATH) and
  .gitignore keeps its trailing newline, so the blob ids on the "index"
  lines no longer describe the post-image.
- NOTE(review): ensure_local_bin_in_path() only changes os.environ of the
  process that calls it; confirm that every process that later shells out
  to rclone also calls it.

 .gitignore                          |   3 +
 Docker/dagster/Dockerfile_user_code |   5 --
 Docker/dagster/rclone.sh            |   9 ---
 userCode/main.py                    | 120 +++++++++++++++++++++++++++-
 4 files changed, 120 insertions(+), 17 deletions(-)
 delete mode 100755 Docker/dagster/rclone.sh

diff --git a/.gitignore b/.gitignore
index d2c0b09d..bba173b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,6 @@ rclone.conf
 
 tmp*/
 storage/
+
+rclone-current*.zip
+rclone.zip
diff --git a/Docker/dagster/Dockerfile_user_code b/Docker/dagster/Dockerfile_user_code
index e477cb2c..4fb6b84f 100644
--- a/Docker/dagster/Dockerfile_user_code
+++ b/Docker/dagster/Dockerfile_user_code
@@ -4,11 +4,6 @@ FROM python:3.10-slim
 COPY requirements.txt user_code_requirements.txt
 RUN pip install -r user_code_requirements.txt
 
-# install rclone
-RUN apt-get -y update; apt-get -y install curl unzip
-COPY Docker/dagster/rclone.sh rclone.sh
-RUN bash rclone.sh
-
 # configs and runtime code
 WORKDIR /opt/dagster/app
 COPY userCode/ /opt/dagster/app/userCode
diff --git a/Docker/dagster/rclone.sh b/Docker/dagster/rclone.sh
deleted file mode 100755
index b705db26..00000000
--- a/Docker/dagster/rclone.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# this script downloads and installs rclone
-curl -O https://downloads.rclone.org/rclone-current-linux-amd64.zip
-unzip rclone-current-linux-amd64.zip
-cd rclone-*-linux-amd64
-cp rclone /usr/bin/
-chown root:root /usr/bin/rclone
-chmod 755 /usr/bin/rclone
-rclone version
diff --git a/userCode/main.py b/userCode/main.py
index 97425430..40fb7cbf 100644
--- a/userCode/main.py
+++ b/userCode/main.py
@@ -1,7 +1,12 @@
 import asyncio
 from datetime import datetime
 import os
+import platform
+import shutil
+import subprocess
+import tempfile
 from typing import Optional, Tuple
+import zipfile
 from aiohttp import ClientSession, ClientTimeout
 from bs4 import BeautifulSoup
 from dagster import (
@@ -73,7 +78,116 @@ def nabu_config():
         f.write(templated_data)
 
 
+def ensure_local_bin_in_path():
+    """Ensure ~/.local/bin is in the PATH of the current process.
+
+    Returns the expanded ~/.local/bin path. Only os.environ of this
+    process is modified.
+    """
+    local_bin = os.path.expanduser("~/.local/bin")
+    # PATH can be unset in minimal environments; treat that as empty
+    current_path = os.environ.get("PATH", "")
+    if local_bin not in current_path.split(os.pathsep):
+        os.environ["PATH"] = (
+            f"{current_path}{os.pathsep}{local_bin}" if current_path else local_bin
+        )
+    return local_bin
+
+
 @asset
+def rclone_binary():
+    """Download the rclone binary to a user-writable location in the PATH.
+
+    Installs to ~/.local/bin/rclone and returns early when that file
+    already exists. Download and extraction happen in a temporary
+    directory that is removed even when a step fails.
+
+    Raises:
+        SystemError: no download URL is mapped for this system/architecture.
+        RuntimeError: the download did not answer with HTTP 200.
+        FileNotFoundError: the archive lacks the expected directory or binary.
+        OSError: the installed binary could not be executed.
+        subprocess.CalledProcessError: `rclone version` exited non-zero.
+    """
+    log = get_dagster_logger()
+    local_bin = ensure_local_bin_in_path()
+    os.makedirs(local_bin, exist_ok=True)
+
+    # Check if rclone is already installed in ~/.local/bin
+    rclone_path = os.path.join(local_bin, "rclone")
+    if os.path.isfile(rclone_path):
+        log.info(f"Rclone is already installed at {rclone_path}.")
+        return
+
+    # Determine the platform
+    system = platform.system().lower()
+    arch = platform.machine().lower()
+
+    # Map system and architecture to the appropriate Rclone download URL
+    if system == "linux" and arch in ("x86_64", "amd64"):
+        download_url = "https://downloads.rclone.org/rclone-current-linux-amd64.zip"
+    elif system == "darwin" and arch in ("arm64", "aarch64"):
+        download_url = "https://downloads.rclone.org/rclone-current-osx-arm64.zip"
+    else:
+        raise SystemError(
+            "Unsupported system or architecture: {} on {}".format(arch, system)
+        )
+
+    # Work in a scratch directory so a failed run leaves nothing in the CWD
+    with tempfile.TemporaryDirectory() as work_dir:
+        zip_file = os.path.join(work_dir, "rclone.zip")
+        extract_root = os.path.join(work_dir, "rclone_extracted")
+
+        log.info(f"Downloading Rclone from {download_url}...")
+        # A timeout keeps a stalled connection from hanging the run forever
+        with requests.get(download_url, stream=True, timeout=60) as response:
+            if response.status_code != 200:
+                raise RuntimeError(
+                    f"Failed to download file. HTTP Status Code: {response.status_code}"
+                )
+            with open(zip_file, "wb") as f:
+                shutil.copyfileobj(response.raw, f)
+        log.info("Download complete.")
+
+        # Extract the downloaded zip file
+        with zipfile.ZipFile(zip_file, "r") as zip_ref:
+            log.info("Extracting Rclone...")
+            zip_ref.extractall(extract_root)
+
+        # The binary is expected inside the first directory of the archive
+        extracted_dir = next(
+            (
+                d
+                for d in os.listdir(extract_root)
+                if os.path.isdir(os.path.join(extract_root, d))
+            ),
+            None,
+        )
+        if not extracted_dir:
+            raise FileNotFoundError("Extracted Rclone directory not found.")
+
+        # Named extracted_binary so it does not shadow this asset's own name
+        extracted_binary = os.path.join(extract_root, extracted_dir, "rclone")
+        if not os.path.isfile(extracted_binary):
+            raise FileNotFoundError("Rclone binary not found in extracted directory.")
+
+        log.info(f"Installing Rclone to {local_bin}...")
+        shutil.copy(extracted_binary, rclone_path)
+        os.chmod(rclone_path, 0o755)  # Set executable permissions
+
+    log.info("Verifying Rclone installation...")
+    try:
+        # Run the file just installed, not whichever rclone PATH finds first
+        subprocess.run([rclone_path, "version"], check=True)
+    except (OSError, subprocess.CalledProcessError):
+        # Remove the unusable binary so the next run retries the install
+        os.remove(rclone_path)
+        raise
+
+    log.info("Installation complete.")
+
+
+@asset(deps=[rclone_binary])
 def rclone_config() -> str:
     """Create the rclone config by templating the rclone.conf.j2 template"""
     get_dagster_logger().info("Creating rclone config")
@@ -101,9 +215,9 @@ def gleaner_config(context: AssetExecutionContext):
     sources = []
     names: set[str] = set()
 
-    assert (
-        len(Lines) > 0
-    ), f"No sitemaps found in sitemap index {REMOTE_GLEANER_SITEMAP}"
+    assert len(Lines) > 0, (
+        f"No sitemaps found in sitemap index {REMOTE_GLEANER_SITEMAP}"
+    )
 
     for line in Lines:
         basename = REMOTE_GLEANER_SITEMAP.removesuffix(".xml")