Skip to content

Commit

Permalink
FE-350 update manifest.py to remove managed access project ids from final partition set (#341)
Browse files Browse the repository at this point in the history

* poetry was getting confused over python vs python3 during pkg installs - added python-is-python3

* FE-350 updated manifest.py to accept a csv with a new value for public (No/Yes), so that we filter out Managed Access (Yes) data from the final partition set which will make snapshots public.

* fixed indentation issues, re-enabled upload to Dagster, removed print statements

* added note about public/ma logic and updated data locations. dev SA doesn't have access to prod buckets. duh.

* Updating poetry version used by Git to fix whl file corruption from old versions of poetry.

* Update requirements.txt

---------

Co-authored-by: dsp-fieldeng-bot <[email protected]>
  • Loading branch information
bahill and dsp-fieldeng-bot authored Dec 3, 2024
1 parent 67a0629 commit 91f2bcc
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 112 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/generate-requirements-file.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
- name: Install Poetry
uses: snok/install-poetry@v1
with:
version: 1.1.9
version: 1.8.0
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/validate_python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
- name: Install Poetry
uses: snok/install-poetry@v1
with:
version: 1.1.9
version: 1.8.0
- name: Cache dependencies
uses: actions/cache@v2
env:
Expand Down
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ ENV LANG='en_US.UTF-8' \
SBT_VERSION=1.7.1

# Install some helpful tools not included in the base image, as well as set up for JDK install
# python-is-python3 makes python3 the default, to avoid issues with poetry
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive \
&& apt-get install -yqq --no-install-recommends \
Expand All @@ -20,6 +21,7 @@ RUN apt-get update \
git \
gnupg \
locales \
python-is-python3 \
sudo \
tzdata \
unzip \
Expand Down
25 changes: 17 additions & 8 deletions orchestration/hca_manage/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@
"dev": {
"EBI": "gs://broad-dsp-monster-hca-dev-ebi-staging/dev",
"UCSC": "gs://broad-dsp-monster-hca-dev-ebi-staging/dev",
"TEST": "gs://broad-dsp-monster-hca-prod-ebi-storage/broad_test_dataset"
}
}
ENV_PIPELINE_ENDINGS = {
Expand Down Expand Up @@ -92,21 +91,22 @@ def _sanitize_gs_path(path: str) -> str:


def _parse_csv(csv_path: str, env: str, project_id_only: bool = False,
include_release_tag: bool = False, release_tag: str = "") -> list[list[str]]:
include_release_tag: bool = False, release_tag: str = "") -> tuple[list[list[str]], list[list[str]]]:
keys = set()
public_projects = set()
with open(csv_path, "r") as f:
reader = csv.reader(f)
for row in reader:
if not row:
logging.debug("Empty path detected, skipping")
continue

assert len(row) == 2
assert len(row) == 3, "CSV must have 3 columns: institution, project_id, and Yes or No for Public"
row = [x.strip() for x in row]
institution = row[0].upper()
project_id = find_project_id_in_str(row[1])
public = row[2].lower()

key = None
if project_id_only:
project_id = row[1]
key = project_id
Expand All @@ -126,18 +126,26 @@ def _parse_csv(csv_path: str, env: str, project_id_only: bool = False,
key = key + f",{release_tag}"
keys.add(key)

# make a separate set of public projects
# The question is "Does this project contain managed access data?"
# so no = public, yes = managed access (ma)
if public == "no":
public_projects.add(key)

chunked_paths = chunked(keys, MAX_STAGING_AREAS_PER_PARTITION_SET)
return [chunk for chunk in chunked_paths]
chunked_paths_no_ma = chunked(public_projects, MAX_STAGING_AREAS_PER_PARTITION_SET)
return [chunk for chunk in chunked_paths], [chunk for chunk in chunked_paths_no_ma]


def parse_and_load_manifest(env: str, csv_path: str, release_tag: str,
pipeline_name: str, project_id_only: bool = False,
include_release_tag: bool = False) -> None:
include_release_tag: bool = False, no_ma: bool = False) -> None:
chunked_paths = _parse_csv(csv_path, env, project_id_only, include_release_tag, release_tag)
paths_to_use = chunked_paths[1] if no_ma else chunked_paths[0]
storage_client = Client()
bucket: Bucket = storage_client.bucket(bucket_name=ETL_PARTITION_BUCKETS[env])

for pos, chunk in enumerate(chunked_paths):
for pos, chunk in enumerate(paths_to_use):
assert len(chunk), "At least one import path is required"
qualifier = chr(pos + 97) # dcp11_a, dcp11_b, etc.
blob_name = f"{pipeline_name}/{release_tag}_{qualifier}_manifest.csv"
Expand Down Expand Up @@ -196,7 +204,8 @@ def load(args: argparse.Namespace) -> None:
args.release_tag,
f"make_snapshot_public_job_{ENV_PIPELINE_ENDINGS[args.env]}",
project_id_only=True,
include_release_tag=True
include_release_tag=True,
no_ma=True
)
_reload_repository(_get_dagster_client())

Expand Down
Loading

0 comments on commit 91f2bcc

Please sign in to comment.