From 9bad5122a5a23a74f5652495c5877be87e95f1ec Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 16:40:12 -0600 Subject: [PATCH 01/16] Init static analysis config --- .flake8 | 16 ++++ .mypy.ini | 6 ++ cryo_data_ingest/constants/paths.py | 4 + environment.yml | 2 +- pyproject.toml | 6 ++ tasks/__init__.py | 10 +++ tasks/env.py | 32 ++++++++ tasks/format.py | 19 +++++ tasks/test.py | 122 ++++++++++++++++++++++++++++ tasks/util.py | 16 ++++ 10 files changed, 232 insertions(+), 1 deletion(-) create mode 100644 .flake8 create mode 100644 .mypy.ini create mode 100644 pyproject.toml create mode 100644 tasks/__init__.py create mode 100644 tasks/env.py create mode 100644 tasks/format.py create mode 100644 tasks/test.py create mode 100644 tasks/util.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..42beeb4 --- /dev/null +++ b/.flake8 @@ -0,0 +1,16 @@ +[flake8] +max-line-length = 100 +max-complexity = 8 + +# Temporary. Remove once black re-enabled: +inline-quotes = ' + +# flake8-import-order +application_import_names = cryo_data_ingest +import_order_style = pycharm + +# D1: Ignore errors requiring docstrings on everything. +# W503: Line breaks should occur after the binary operator to keep all variable names aligned. +# E731: Lambda assignments are OK, use your best judgement. +# C408: Unnecessary dict call - rewrite as a literal. +ignore = D1,W503,E731,C408 diff --git a/.mypy.ini b/.mypy.ini new file mode 100644 index 0000000..ba2b8c5 --- /dev/null +++ b/.mypy.ini @@ -0,0 +1,6 @@ +[mypy] +python_version = 3.10 +incremental = True +show_error_codes = True +check_untyped_defs = True +warn_unreachable = True diff --git a/cryo_data_ingest/constants/paths.py b/cryo_data_ingest/constants/paths.py index 5413a31..4b9322d 100644 --- a/cryo_data_ingest/constants/paths.py +++ b/cryo_data_ingest/constants/paths.py @@ -1,4 +1,8 @@ from pathlib import Path +PACKAGE_DIR = Path(__file__).parent.parent +PROJECT_DIR = PACKAGE_DIR.parent +SCRIPTS_DIR = PROJECT_DIR / 'scripts' + JSON_STORAGE_DIR = Path('/tmp/cryo-data-ingest') diff --git a/environment.yml b/environment.yml index 240f39d..e49c7f0 100644 --- a/environment.yml +++ b/environment.yml @@ -7,8 +7,8 @@ dependencies: - datalad ~=0.17.5 # Dev dependencies: - - bump2version ~=1.0 - invoke ~=1.7 + - bump2version ~=1.0 - black ~=22.3.0 - isort ~=5.10 - pytest ~=7.1 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..64dd288 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[tool.black] +skip-string-normalization = true + + +[tool.isort] +profile = "black" diff --git a/tasks/__init__.py b/tasks/__init__.py new file mode 100644 index 0000000..ff06e50 --- /dev/null +++ b/tasks/__init__.py @@ -0,0 +1,10 @@ +"""Invoke tasks.""" + +from invoke import Collection + +from . 
import env, format, test + +ns = Collection() +ns.add_collection(env) +ns.add_collection(format) +ns.add_collection(test) diff --git a/tasks/env.py b/tasks/env.py new file mode 100644 index 0000000..a359d62 --- /dev/null +++ b/tasks/env.py @@ -0,0 +1,32 @@ +from invoke import task + +from .util import PROJECT_DIR, print_and_run + +ENV_LOCKFILE = PROJECT_DIR / "environment-lock.yml" + + +@task(default=True) +def lock(ctx): + """Update the environment-lock.yml file from the current `cryo-data-ingest` environment.""" + print_and_run(f"conda env export -n cryo-data-ingest > {ENV_LOCKFILE}") + + with open(ENV_LOCKFILE, "r") as f: + lines = f.readlines() + + with open(ENV_LOCKFILE, "w") as f: + for line in lines: + # The prefix line contains machine-specific directory + if line.startswith('prefix: '): + continue + + # We don't want to use the NSIDC conda channel + if '- nsidc' in line: + continue + + # We want to replace the "defaults" channel with "nodefaults" so conda-forge + # is used for everything. + if '- defaults' in line: + f.write(line.replace('- defaults', '- nodefaults')) + continue + + f.write(line) diff --git a/tasks/format.py b/tasks/format.py new file mode 100644 index 0000000..6aefd74 --- /dev/null +++ b/tasks/format.py @@ -0,0 +1,19 @@ +from invoke import task + +from .util import PROJECT_DIR, print_and_run + + +@task(default=True) +def format(ctx): + """Apply formatting standards to the codebase.""" + # isort 5.10.1 does not support "magic trailing comma" feature of Black, so it will + # combine multiple imports to one line if they fit. + # + # https://github.com/PyCQA/isort/issues/1683 + print_and_run(f"isort {PROJECT_DIR}") + + # Black 22.1 has problems with string handling. We can work around those with + # `fmt: on` and `fmt: off` comments, but that's not fun. + # + # https://github.com/psf/black/issues/2188 + print_and_run(f"black {PROJECT_DIR}") diff --git a/tasks/test.py b/tasks/test.py new file mode 100644 index 0000000..cd122d6 --- /dev/null +++ b/tasks/test.py @@ -0,0 +1,122 @@ +import sys + +from invoke import task + +from .util import PROJECT_DIR, print_and_run + +sys.path.append(str(PROJECT_DIR)) + +from cryo_data_ingest.constants.paths import PACKAGE_DIR, PROJECT_DIR, SCRIPTS_DIR + + +@task +def shellcheck(ctx): + # FIXME: It's unclear why, but the return code seems to be getting swallowed. 
+ print_and_run( + f'cd {PROJECT_DIR} &&' + f' for file in $(find {SCRIPTS_DIR} -type f -name "*.sh"); do' + ' shellcheck $file;' + ' done', + pty=True, + ) + + +@task() +def formatcheck(ctx): + """Check that the code conforms to formatting standards.""" + print_and_run(f"isort --check-only {PROJECT_DIR}") + print_and_run(f"black --check {PROJECT_DIR}") + print("🎉🙈 Format check passed.") + + +@task(aliases=('flake8',)) +def lint(ctx): + """Run static analysis with flake8.""" + print_and_run( + f'cd {PROJECT_DIR} &&' + f" flake8 --config {PROJECT_DIR / '.flake8'} {PACKAGE_DIR} {SCRIPTS_DIR}", + pty=True, + ) + # print_and_run( + # f'cd {PROJECT_DIR} &&' + # f' vulture --min-confidence 80 {PACKAGE_DIR} {SCRIPTS_DIR}', + # pty=True, + # ) + print("🎉👕 Linting passed.") + + +@task(aliases=('mypy',)) +def typecheck(ctx): + """Check for type correctness using mypy.""" + print_and_run(f"mypy --config-file={PROJECT_DIR / '.mypy.ini'} {PACKAGE_DIR}/") + print('🎉🦆 Type checking passed.') + + +@task(pre=[shellcheck, formatcheck, lint, typecheck]) +def static(ctx): + """Run all static analysis tasks.""" + pass + + +@task(aliases=('unit',)) +def unittest(ctx): + """Run unit tests.""" + print_and_run( + f'PYTHONPATH={PROJECT_DIR} pytest --verbose {PACKAGE_DIR}/test/', + pty=True, + ) + print('🎉🛠️ Unit tests passed.') + + +@task(aliases=('integration',)) +def integrationtest(ctx): + """Run integration tests.""" + print_and_run( + f'PYTHONPATH={PROJECT_DIR} pytest --verbose {PACKAGE_DIR}/test_integration', + pty=True, + ) + + print('🎉🧩 Integration tests passed.') + + +@task(aliases=('regression',)) +def regressiontest(ctx): + """Run regression tests.""" + print_and_run( + f'PYTHONPATH={PROJECT_DIR} pytest --verbose {PACKAGE_DIR}/test_regression', + pty=True, + ) + + print('🎉⏰ Regression tests passed.') + + +@task( + pre=[ + shellcheck, + lint, + formatcheck, + typecheck, + # unittest, + # integrationtest, + # regressiontest, + ], + default=True, +) +def all(ctx): + """Run all of the tests.""" + print("🎉❤️ All tests passed!") + + +@task( + pre=[ + shellcheck, + lint, + formatcheck, + typecheck, + # unittest, + # integrationtest, + ], +) +def ci(ctx): + """Run the tests that would be run in CI.""" + print("🎉❤️ CI tests passed!") diff --git a/tasks/util.py b/tasks/util.py new file mode 100644 index 0000000..714d14f --- /dev/null +++ b/tasks/util.py @@ -0,0 +1,16 @@ +import inspect +import os +from pathlib import Path + +from invoke import run + +PROJECT_DIR = Path(__file__).parent.parent + + +def print_and_run(cmd, **run_kwargs): + print(cmd) + kwargs = { + 'pty': True, + **run_kwargs, + } + return run(cmd, **kwargs) From 19541311f204c49072bf53ee46049b49bb32b2ad Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 16:43:47 -0600 Subject: [PATCH 02/16] Apply `black` formatting --- cryo_data_ingest/constants/paths.py | 1 - cryo_data_ingest/util/cmr.py | 14 ++++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cryo_data_ingest/constants/paths.py b/cryo_data_ingest/constants/paths.py index 4b9322d..551a7f9 100644 --- a/cryo_data_ingest/constants/paths.py +++ b/cryo_data_ingest/constants/paths.py @@ -1,6 +1,5 @@ from pathlib import Path - PACKAGE_DIR = Path(__file__).parent.parent PROJECT_DIR = PACKAGE_DIR.parent SCRIPTS_DIR = PROJECT_DIR / 'scripts' diff --git a/cryo_data_ingest/util/cmr.py b/cryo_data_ingest/util/cmr.py index 1f11474..6d7424e 100644 --- a/cryo_data_ingest/util/cmr.py +++ b/cryo_data_ingest/util/cmr.py @@ -1,10 +1,11 @@ import csv import json import logging 
-import requests from typing import Iterator, TypedDict from urllib.parse import urlparse +import requests + from cryo_data_ingest.constants.cmr import ( CMR_COLLECTIONS_SEARCH_URL, CMR_GRANULES_SEARCH_URL, @@ -33,6 +34,7 @@ class Granule(TypedDict): class OutputGranule(TypedDict): """Just the information needed to create a datalad URL file.""" + local_path: str link: str @@ -54,7 +56,6 @@ def _page_cmr_results( query_params = query_params if query_params else dict() query_headers = query_headers if query_headers else dict() - CMR_SEARCH_HEADER = 'CMR-Search-After' page_num = 1 while True: @@ -71,7 +72,9 @@ def _page_cmr_results( ) if page_num == 1: - logger.debug(f"Got {response.headers['CMR-Hits']} hits for query {cmr_query_url}") + logger.debug( + f"Got {response.headers['CMR-Hits']} hits for query {cmr_query_url}" + ) logger.debug(f'Got page {page_num}...') @@ -160,15 +163,14 @@ def write_collection_granules(collection: Collection) -> None: return logger.info( - f'Creating file for {collection_readable_id} ({len(granules)}' - ' granules)...' + f'Creating file for {collection_readable_id} ({len(granules)} granules)...' ) # TODO: the `local_path` should not include common path parts that are in common for # each granule output_granules: list[OutputGranule] = [ { - 'local_path': urlparse(g['url']).path[1:], # trim leading "/" + 'local_path': urlparse(g['url']).path[1:], # trim leading "/" 'link': g['url'], } for g in granules From 26cbc3050653a2e505bc7acc58fc976fd9d0262c Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 16:52:17 -0600 Subject: [PATCH 03/16] Move json2datalad script into scripts dir --- README.md | 8 +++++++- json2datalad.sh => scripts/json2datalad.sh | 0 2 files changed, 7 insertions(+), 1 deletion(-) rename json2datalad.sh => scripts/json2datalad.sh (100%) diff --git a/README.md b/README.md index 48b9d76..131882e 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,14 @@ _In early development. The following instructions are temporary._ 0. Set up conda environment (`conda env create`) 1. Activate conda environment (`conda activate cryo-data-ingest`) -2. Run the "main script" from the root of this repo: +2. Run the "cmr script" from the root of this repo: ``` PYTHONPATH=. python cryo_data_ingest/util/cmr.py ``` + +3. Run the "datalad script" and follow usage instructions: + + ``` + ./scripts/json2datalad.sh + ``` diff --git a/json2datalad.sh b/scripts/json2datalad.sh similarity index 100% rename from json2datalad.sh rename to scripts/json2datalad.sh From a35250a8351597cb5d6876d7a15d059350360b34 Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 17:10:24 -0600 Subject: [PATCH 04/16] Init GitHub Actions config --- .github/workflows/test.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..ef53e82 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,13 @@ +name: "test" +on: ["push"] +jobs: + setup-environment: + runs-on: "ubuntu-latest" + steps: + - uses: "actions/checkout@v3" + - run: "conda env create" + + test: + needs: "setup-environment" + steps: + - run: "conda activate cryo-data-ingest && inv test.ci" From 40b827c076e554e27588443a03277c241a289b18 Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 17:12:22 -0600 Subject: [PATCH 05/16] Try setting `runs-on` for second step Is context shared between steps? 
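Note: within one job, steps do share context (they run on the same runner, so the filesystem and installed environment carry over), but separate jobs each start on a fresh runner, so the conda environment created in the `setup-environment` job is not available to the `test` job. A minimal single-job sketch, assuming the `conda-incubator/setup-miniconda` action and a login shell so conda activation works (environment name and task taken from this repo):

```yaml
jobs:
  test:
    runs-on: "ubuntu-latest"
    defaults:
      run:
        # Login shell lets bash pick up conda's activation hooks
        shell: "bash -l {0}"
    steps:
      - uses: "actions/checkout@v3"
      - name: "Create and activate conda environment"
        uses: "conda-incubator/setup-miniconda@v2"
        with:
          environment-file: "environment.yml"
          activate-environment: "cryo-data-ingest"
          miniforge-version: "latest"
      - name: "Run static analysis / tests"
        run: "inv test.ci"
```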
--- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ef53e82..73febf0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,5 +9,6 @@ jobs: test: needs: "setup-environment" + runs-on: "ubuntu-latest" steps: - run: "conda activate cryo-data-ingest && inv test.ci" From a7b58085b9a2de76dc5cee201c38961ab1206802 Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 17:16:43 -0600 Subject: [PATCH 06/16] Run `conda init` in GHA workflow --- .github/workflows/test.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 73febf0..2855951 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,10 +5,12 @@ jobs: runs-on: "ubuntu-latest" steps: - uses: "actions/checkout@v3" - - run: "conda env create" + - name: "Create conda environment" + run: "conda init && conda env create" test: needs: "setup-environment" runs-on: "ubuntu-latest" steps: - - run: "conda activate cryo-data-ingest && inv test.ci" + - name: "Run static analysis / tests" + run: "conda activate cryo-data-ingest && inv test.ci" From 33cb5ba6cfa6a054cbf7efaf874e686efd954a6a Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 17:18:32 -0600 Subject: [PATCH 07/16] Move datalad to runtime deps --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index e49c7f0..ca6a7e2 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,6 @@ channels: - nodefaults dependencies: - python ~=3.10.4 - - datalad ~=0.17.5 # Dev dependencies: - invoke ~=1.7 @@ -25,3 +24,4 @@ dependencies: # Runtime dependencies: - click ~=8.1 - requests ~=2.23 + - datalad ~=0.17.5 From b00f9ae50bac5e37525374ac26850c987adb70ab Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 17:21:03 -0600 Subject: [PATCH 08/16] Try sourcing conda's activate script --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2855951..d3cfd53 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,4 +13,4 @@ jobs: runs-on: "ubuntu-latest" steps: - name: "Run static analysis / tests" - run: "conda activate cryo-data-ingest && inv test.ci" + run: ". activate cryo-data-ingest && inv test.ci" From b701a3113b291a651aa3312bf635ee3663a0c4ea Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 17:25:30 -0600 Subject: [PATCH 09/16] Try `conda init bash` Running out of ideas! --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d3cfd53..fb164db 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,11 +6,11 @@ jobs: steps: - uses: "actions/checkout@v3" - name: "Create conda environment" - run: "conda init && conda env create" + run: "conda init bash && conda env create" test: needs: "setup-environment" runs-on: "ubuntu-latest" steps: - name: "Run static analysis / tests" - run: ". 
activate cryo-data-ingest && inv test.ci" + run: "conda activate cryo-data-ingest && inv test.ci" From b0511707c2ce56b5ceef91aaa5d5490f8ca23a39 Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 17:30:52 -0600 Subject: [PATCH 10/16] Try bash login shell --- .github/workflows/test.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fb164db..c0ada6c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,13 +2,24 @@ name: "test" on: ["push"] jobs: setup-environment: + defaults: + run: + shell: "bash -l {0}" + runs-on: "ubuntu-latest" steps: - uses: "actions/checkout@v3" - name: "Create conda environment" run: "conda init bash && conda env create" + test: + defaults: + run: + # Login shell enables bash to process conda configuration so we can + # activate envs + shell: "bash -l {0}" + needs: "setup-environment" runs-on: "ubuntu-latest" steps: From 75cb1fd054374ef5d48ec59a40488ad8da97515a Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 17:35:25 -0600 Subject: [PATCH 11/16] Try using $CONDA envvar to activate --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c0ada6c..aa426ce 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,7 +10,7 @@ jobs: steps: - uses: "actions/checkout@v3" - name: "Create conda environment" - run: "conda init bash && conda env create" + run: "conda env create" test: @@ -24,4 +24,4 @@ jobs: runs-on: "ubuntu-latest" steps: - name: "Run static analysis / tests" - run: "conda activate cryo-data-ingest && inv test.ci" + run: "$CONDA/bin/activate cryo-data-ingest && inv test.ci" From 096c99c911f159348714839e888c90dfc04dad37 Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 17:44:57 -0600 Subject: [PATCH 12/16] Try using `conda-incubator/setup-miniconda` action --- .github/workflows/test.yml | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index aa426ce..7beb58c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,7 +1,8 @@ name: "test" on: ["push"] jobs: - setup-environment: + # TODO: How to split environment setup and test steps into different jobs? + test: defaults: run: shell: "bash -l {0}" @@ -9,19 +10,15 @@ jobs: runs-on: "ubuntu-latest" steps: - uses: "actions/checkout@v3" - - name: "Create conda environment" - run: "conda env create" - - test: - defaults: - run: - # Login shell enables bash to process conda configuration so we can - # activate envs - shell: "bash -l {0}" + - name: "Create conda environment" + uses: "conda-incubator/setup-miniconda@v2" + with: + environment-file: "environment.yml" + activate-environment: "cryo-data-ingest" + miniforge-version: "latest" + # TODO: Do we need the `setup-miniconda` action? 
Why not: + # run: "conda env create" - needs: "setup-environment" - runs-on: "ubuntu-latest" - steps: - name: "Run static analysis / tests" - run: "$CONDA/bin/activate cryo-data-ingest && inv test.ci" + run: "inv test.ci" From e887925fe7bbf519ae2c2fe6256388e545c0f56f Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 18:01:36 -0600 Subject: [PATCH 13/16] Address shellcheck messages --- scripts/json2datalad.sh | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/scripts/json2datalad.sh b/scripts/json2datalad.sh index 7479d89..62d039c 100755 --- a/scripts/json2datalad.sh +++ b/scripts/json2datalad.sh @@ -24,13 +24,17 @@ set -o pipefail set -o errexit # set -o errtrace -PROGNAME=$(basename $0) +PROGNAME=$(basename "$0") red='\033[0;31m'; orange='\033[0;33m'; green='\033[0;32m'; yellow='\033[0;93m'; nc='\033[0m' # No Color -log_info() { echo -e "${green}[$(date --iso-8601=seconds)] [INFO] [${PROGNAME}] ${@}${nc}"; } -log_warn() { echo -e "${orange}[$(date --iso-8601=seconds)] [WARN] [${PROGNAME}] ${@}${nc}"; } -log_err() { echo -e "${red}[$(date --iso-8601=seconds)] [ERR] [${PROGNAME}] ${@}${nc}" >&2; } -log_debug() { if [[ ${debug:-} == 1 ]]; then echo -e "${yellow}[$(date --iso-8601=seconds)] [DEBUG] [${PROGNAME}] ${@}${nc}"; fi } -err_exit() { echo -e "${red}[$(date --iso-8601=seconds)] [ERR] [${PROGNAME}] ${@:-"Unknown Error"}${nc}" >&2; exit 1; } +log_info() { + if [ "$verbose" = 1 ]; then + echo -e "${green}[$(date --iso-8601=seconds)] [INFO] [${PROGNAME}] ${*}${nc}"; + fi; +} +log_warn() { echo -e "${orange}[$(date --iso-8601=seconds)] [WARN] [${PROGNAME}] ${*}${nc}"; } +log_err() { echo -e "${red}[$(date --iso-8601=seconds)] [ERR] [${PROGNAME}] ${*}${nc}" >&2; } +log_debug() { if [[ ${debug:-} == 1 ]]; then echo -e "${yellow}[$(date --iso-8601=seconds)] [DEBUG] [${PROGNAME}] ${*}${nc}"; fi } +err_exit() { echo -e "${red}[$(date --iso-8601=seconds)] [ERR] [${PROGNAME}] ${*:-"Unknown Error"}${nc}" >&2; exit 1; } trap ctrl_c INT # trap ctrl-c and call ctrl_c() function ctrl_c() { @@ -89,25 +93,25 @@ if [[ -z ${datalad_dir:-} ]]; then print_usage; err_exit "-d not set"; else log_ # download a dataset into a local datalad repository function cdi_download() { log_info "Running datalad addurls (DRYRUN)..." - datalad addurls -d ${datalad_dir} -n --fast --nosave ${jsonfile} '{link}' '{local_path}' + datalad addurls -d "${datalad_dir}" -n --fast --nosave "${jsonfile}" '{link}' '{local_path}' log_info "Running datalad addurls..." - datalad addurls -d ${datalad_dir} --fast --nosave ${jsonfile} '{link}' '{local_path}' + datalad addurls -d "${datalad_dir}" --fast --nosave "${jsonfile}" '{link}' '{local_path}' log_info "Running datalad save..." 
- datalad save ${datalad_dir} -m "Created ${datalad_dir}" + datalad save "${datalad_dir}" -m "Created ${datalad_dir}" } # Create a GitHub remote and push a local datalad repository to it function cdi_set_remote() { log_info "Creating GitHub repository" gh repo create \ - cryo-data/${datalad_dir} \ + "cryo-data/${datalad_dir}" \ -d "${datalad_dir}" \ --public \ - -s ${datalad_dir} + -s "${datalad_dir}" # undo: gh repo delete cryo-data/${datalad_dir} log_info "Pushing to GitHub" - (cd ${datalad_dir}; git push -u origin main) - (cd ${datalad_dir}; datalad push) + (cd "${datalad_dir}"; git push -u origin main) + (cd "${datalad_dir}"; datalad push) } cdi_download From 4789f4db1dcd59c2ceac6a397151bf50e93fdfbc Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 18:05:05 -0600 Subject: [PATCH 14/16] More consistent conditional expression --- scripts/json2datalad.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/scripts/json2datalad.sh b/scripts/json2datalad.sh index 62d039c..b27546c 100755 --- a/scripts/json2datalad.sh +++ b/scripts/json2datalad.sh @@ -26,11 +26,7 @@ set -o errexit PROGNAME=$(basename "$0") red='\033[0;31m'; orange='\033[0;33m'; green='\033[0;32m'; yellow='\033[0;93m'; nc='\033[0m' # No Color -log_info() { - if [ "$verbose" = 1 ]; then - echo -e "${green}[$(date --iso-8601=seconds)] [INFO] [${PROGNAME}] ${*}${nc}"; - fi; -} +log_info() { if [[ "${verbose:-}" == 1 ]]; then echo -e "${green}[$(date --iso-8601=seconds)] [INFO] [${PROGNAME}] ${*}${nc}"; fi; } log_warn() { echo -e "${orange}[$(date --iso-8601=seconds)] [WARN] [${PROGNAME}] ${*}${nc}"; } log_err() { echo -e "${red}[$(date --iso-8601=seconds)] [ERR] [${PROGNAME}] ${*}${nc}" >&2; } log_debug() { if [[ ${debug:-} == 1 ]]; then echo -e "${yellow}[$(date --iso-8601=seconds)] [DEBUG] [${PROGNAME}] ${*}${nc}"; fi } From 199fcb0dcbf966db448ef53441b23ac600a69dd5 Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 18:10:59 -0600 Subject: [PATCH 15/16] Fix typechecker errors --- cryo_data_ingest/util/cmr.py | 8 +++++--- environment-lock.yml | 25 +++++++++++++++++++++++-- environment.yml | 4 +++- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/cryo_data_ingest/util/cmr.py b/cryo_data_ingest/util/cmr.py index 984603d..ec918d6 100644 --- a/cryo_data_ingest/util/cmr.py +++ b/cryo_data_ingest/util/cmr.py @@ -45,7 +45,7 @@ def _page_cmr_results( *, query_params: dict | None = None, query_headers: dict | None = None, -) -> Iterator[str]: +) -> Iterator[bytes]: """Generate results until there are no more pages. Results are returned as raw strings, not parsed as any specific format. 
Consumer is @@ -68,7 +68,7 @@ def _page_cmr_results( if not response.ok: raise RuntimeError( - f'CMR search failed with error: {response.content}', + f"CMR search failed with error: {response.content.decode('utf-8')}", ) if page_num == 1: @@ -112,7 +112,9 @@ def get_nsidc_collections() -> Iterator[Collection]: # TODO: Use the paging algorithm response = requests.get(cmr_query_url, timeout=REQUESTS_TIMEOUT) if not response.ok: - raise RuntimeError(f'CMR request failed with error: {response.content}') + raise RuntimeError( + f"CMR request failed with error: {response.content.decode('utf-8')}" + ) response_json = json.loads(response.content) collections_json = response_json['feed']['entry'] diff --git a/environment-lock.yml b/environment-lock.yml index 4454707..a38a58b 100644 --- a/environment-lock.yml +++ b/environment-lock.yml @@ -1,6 +1,5 @@ name: cryo-data-ingest channels: - - nsidc - conda-forge - nodefaults dependencies: @@ -38,7 +37,9 @@ dependencies: - freetype=2.12.1=hca18f0e_0 - gettext=0.19.8.1=h27087fc_1009 - git=2.37.3=pl5321h36853c3_0 - - git-annex=10.20220927=nodep_h1234567_0 + - git-annex=10.20220927=alldep_h2ca4687_100 + - gmp=6.2.1=h58526e2_0 + - gnupg=2.3.3=h7853c96_0 - humanize=4.4.0=pyhd8ed1ab_0 - idna=3.4=pyhd8ed1ab_0 - importlib-metadata=4.11.4=py310hff52083_0 @@ -57,15 +58,22 @@ dependencies: - lcms2=2.12=hddcbb42_0 - ld_impl_linux-64=2.36.1=hea4e1c9_2 - lerc=4.0.0=h27087fc_0 + - libassuan=2.5.5=h9c3ff4c_0 + - libcbor=0.9.0=h9c3ff4c_0 - libcurl=7.83.1=h7bff187_0 - libdeflate=1.14=h166bdaf_0 - libedit=3.1.20191231=he28a2e2_2 - libev=4.33=h516909a_1 - libffi=3.4.2=h7f98852_5 + - libfido2=1.11.0=h727a467_0 - libgcc-ng=12.1.0=h8d9b700_16 + - libgcrypt=1.10.1=h166bdaf_0 - libglib=2.74.0=h7a41b64_0 - libgomp=12.1.0=h8d9b700_16 + - libgpg-error=1.45=hc0c96e0_0 - libiconv=1.17=h166bdaf_0 + - libksba=1.3.5=hf484d3e_1000 + - libmagic=5.39=h753d276_1 - libnghttp2=1.47.0=hdcd2b5c_1 - libnsl=2.0.0=h7f98852_0 - libpng=1.6.38=h753d276_0 @@ -73,10 +81,13 @@ dependencies: - libssh2=1.10.0=haa6b8db_3 - libstdcxx-ng=12.1.0=ha89aaad_16 - libtiff=4.4.0=h55922b4_4 + - libudev1=249=h166bdaf_4 - libuuid=2.32.1=h7f98852_1000 - libwebp-base=1.2.4=h166bdaf_0 - libxcb=1.13=h7f98852_1004 - libzlib=1.2.12=h166bdaf_3 + - lsof=4.89=h7f98852_1 + - lz4-c=1.9.3=h9c3ff4c_1 - mccabe=0.6.1=py_1 - more-itertools=8.14.0=pyhd8ed1ab_0 - msgpack-python=1.0.4=py310hbf28c38_0 @@ -84,7 +95,10 @@ dependencies: - mypy=0.971=py310h5764c6d_0 - mypy_extensions=0.4.3=py310hff52083_5 - ncurses=6.3=h27087fc_1 + - npth=1.6=h27087fc_1001 + - ntbtls=0.1.2=hdbcaa40_1000 - openjpeg=2.5.0=h7d73246_1 + - openssh=9.0p1=hf695f80_0 - openssl=1.1.1q=h166bdaf_0 - p7zip=16.02=h9c3ff4c_1001 - packaging=21.3=pyhd8ed1ab_0 @@ -96,6 +110,7 @@ dependencies: - pip=22.2.2=pyhd8ed1ab_0 - platformdirs=2.5.2=pyhd8ed1ab_1 - pluggy=1.0.0=py310hff52083_3 + - popt=1.16=h0b475e3_2002 - psutil=5.9.2=py310h5764c6d_0 - pthread-stubs=0.4=h36c2ea0_1001 - py=1.11.0=pyh6c4a22f_0 @@ -116,16 +131,20 @@ dependencies: - requests=2.28.1=pyhd8ed1ab_1 - requests-ftp=0.3.1=py_1 - requests-toolbelt=0.9.1=py_0 + - rsync=3.2.6=h220164a_0 - secretstorage=3.3.3=py310hff52083_0 - setuptools=65.4.0=pyhd8ed1ab_0 - shellcheck=0.8.0=ha770c72_0 - simplejson=3.17.6=py310h5764c6d_1 - six=1.16.0=pyh6c4a22f_0 - snowballstemmer=2.2.0=pyhd8ed1ab_0 + - sqlite=3.39.3=h4ff8645_0 - tk=8.6.12=h27826a3_0 - tomli=2.0.1=pyhd8ed1ab_0 - tqdm=4.64.1=pyhd8ed1ab_0 - typed-ast=1.5.4=py310h5764c6d_0 + - types-requests=2.28.11=pyhd8ed1ab_0 + - types-urllib3=1.26.25=pyhd8ed1ab_0 - 
typing_extensions=4.3.0=pyha770c72_0 - tzdata=2022d=h191b570_0 - urllib3=1.26.11=pyhd8ed1ab_0 @@ -133,6 +152,8 @@ dependencies: - whoosh=2.7.4=py310hff52083_6 - xorg-libxau=1.0.9=h7f98852_0 - xorg-libxdmcp=1.1.3=h7f98852_0 + - xxhash=0.8.0=h7f98852_3 - xz=5.2.6=h166bdaf_0 - zipp=3.8.1=pyhd8ed1ab_0 + - zlib=1.2.12=h166bdaf_3 - zstd=1.5.2=h6239696_4 diff --git a/environment.yml b/environment.yml index ca6a7e2..a1b57e2 100644 --- a/environment.yml +++ b/environment.yml @@ -20,8 +20,10 @@ dependencies: - flake8-docstrings ~=1.6 - flake8-use-fstring ~=1.3 + - types-requests ~=2.28 + # Runtime dependencies: - click ~=8.1 - - requests ~=2.23 + - requests ~=2.28 - datalad ~=0.17.5 From 97f518a2afea6dcb59553a88d14b8f3482416bd3 Mon Sep 17 00:00:00 2001 From: Matt Fisher Date: Wed, 28 Sep 2022 18:12:33 -0600 Subject: [PATCH 16/16] Ensure parents of output directory exist --- cryo_data_ingest/util/cmr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryo_data_ingest/util/cmr.py b/cryo_data_ingest/util/cmr.py index ec918d6..e103f42 100644 --- a/cryo_data_ingest/util/cmr.py +++ b/cryo_data_ingest/util/cmr.py @@ -180,7 +180,7 @@ def write_collection_granules(collection: Collection) -> None: collection_fp = JSON_STORAGE_DIR / f'{collection_readable_id}.json' - JSON_STORAGE_DIR.mkdir(exist_ok=True) + JSON_STORAGE_DIR.mkdir(exist_ok=True, parents=True) with open(collection_fp, 'w') as f: json.dump(output_granules, f, indent=2) logger.info(f'Wrote {collection_fp}')