Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Static analysis and ci #7

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[flake8]
max-line-length = 100
max-complexity = 8

# Temporary. Remove once black re-enabled:
inline-quotes = '

# flake8-import-order
application_import_names = cryo_data_ingest
import_order_style = pycharm

# D1: Ignore errors requiring docstrings on everything.
# W503: Allow line breaks before binary operators (current PEP 8 / black style).
# E731: Lambda assignments are OK, use your best judgement.
# C408: Unnecessary dict call - rewrite as a literal.
ignore = D1,W503,E731,C408
24 changes: 24 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
name: "test"
on: ["push"]
jobs:
# TODO: How to split environment setup and test steps into different jobs?
test:
defaults:
run:
shell: "bash -l {0}"

runs-on: "ubuntu-latest"
steps:
- uses: "actions/checkout@v3"

- name: "Create conda environment"
uses: "conda-incubator/setup-miniconda@v2"
with:
environment-file: "environment.yml"
activate-environment: "cryo-data-ingest"
miniforge-version: "latest"
# TODO: Do we need the `setup-miniconda` action? Why not:
# run: "conda env create"

- name: "Run static analysis / tests"
run: "inv test.ci"
6 changes: 6 additions & 0 deletions .mypy.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[mypy]
python_version = 3.10
incremental = True
show_error_codes = True
check_untyped_defs = True
warn_unreachable = True
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@ _In early development. The following instructions are temporary._

0. Set up conda environment (`conda env create`)
1. Activate conda environment (`conda activate cryo-data-ingest`)
2. Run the "main script" from the root of this repo:
2. Run the "cmr script" from the root of this repo:

```
PYTHONPATH=. python cryo_data_ingest/util/cmr.py
```

3. Run the "datalad script" and follow usage instructions:

```
./scripts/json2datalad.sh
```
3 changes: 3 additions & 0 deletions cryo_data_ingest/constants/paths.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from pathlib import Path

# Root of the `cryo_data_ingest` package: two levels up from this
# `constants` module file.
PACKAGE_DIR = Path(__file__).parents[1]
# Repository root (the directory containing the package).
PROJECT_DIR = PACKAGE_DIR.parent
# Shell scripts shipped alongside the package.
SCRIPTS_DIR = PROJECT_DIR / 'scripts'

# Destination for generated granule-listing JSON files.
JSON_STORAGE_DIR = Path('/tmp/cryo-data-ingest')
24 changes: 14 additions & 10 deletions cryo_data_ingest/util/cmr.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import csv
import json
import logging
import requests
from typing import Iterator, TypedDict
from urllib.parse import urlparse

import requests

from cryo_data_ingest.constants.cmr import (
CMR_COLLECTIONS_SEARCH_URL,
CMR_GRANULES_SEARCH_URL,
Expand Down Expand Up @@ -33,6 +34,7 @@ class Granule(TypedDict):

class OutputGranule(TypedDict):
"""Just the information needed to create a datalad URL file."""

local_path: str
link: str

Expand All @@ -43,7 +45,7 @@ def _page_cmr_results(
*,
query_params: dict | None = None,
query_headers: dict | None = None,
) -> Iterator[str]:
) -> Iterator[bytes]:
"""Generate results until there are no more pages.

Results are returned as raw bytes, not parsed as any specific format. Consumer is
Expand All @@ -54,7 +56,6 @@ def _page_cmr_results(
query_params = query_params if query_params else dict()
query_headers = query_headers if query_headers else dict()


CMR_SEARCH_HEADER = 'CMR-Search-After'
page_num = 1
while True:
Expand All @@ -67,11 +68,13 @@ def _page_cmr_results(

if not response.ok:
raise RuntimeError(
f'CMR search failed with error: {response.content}',
f"CMR search failed with error: {response.content.decode('utf-8')}",
)

if page_num == 1:
logger.debug(f"Got {response.headers['CMR-Hits']} hits for query {cmr_query_url}")
logger.debug(
f"Got {response.headers['CMR-Hits']} hits for query {cmr_query_url}"
)

logger.debug(f'Got page {page_num}...')

Expand Down Expand Up @@ -109,7 +112,9 @@ def get_nsidc_collections() -> Iterator[Collection]:
# TODO: Use the paging algorithm
response = requests.get(cmr_query_url, timeout=REQUESTS_TIMEOUT)
if not response.ok:
raise RuntimeError(f'CMR request failed with error: {response.content}')
raise RuntimeError(
f"CMR request failed with error: {response.content.decode('utf-8')}"
)

response_json = json.loads(response.content)
collections_json = response_json['feed']['entry']
Expand Down Expand Up @@ -160,23 +165,22 @@ def write_collection_granules(collection: Collection) -> None:
return

logger.info(
f'Creating file for {collection_readable_id} ({len(granules)}'
' granules)...'
f'Creating file for {collection_readable_id} ({len(granules)} granules)...'
)

# TODO: the `local_path` should not include common path parts that are in common for
# each granule
output_granules: list[OutputGranule] = [
{
'local_path': urlparse(g['url']).path[1:], # trim leading "/"
'local_path': urlparse(g['url']).path[1:], # trim leading "/"
'link': g['url'],
}
for g in granules
]

collection_fp = JSON_STORAGE_DIR / f'{collection_readable_id}.json'

JSON_STORAGE_DIR.mkdir(exist_ok=True)
JSON_STORAGE_DIR.mkdir(exist_ok=True, parents=True)
with open(collection_fp, 'w') as f:
json.dump(output_granules, f, indent=2)
logger.info(f'Wrote {collection_fp}')
Expand Down
25 changes: 23 additions & 2 deletions environment-lock.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
name: cryo-data-ingest
channels:
- nsidc
- conda-forge
- nodefaults
dependencies:
Expand Down Expand Up @@ -38,7 +37,9 @@ dependencies:
- freetype=2.12.1=hca18f0e_0
- gettext=0.19.8.1=h27087fc_1009
- git=2.37.3=pl5321h36853c3_0
- git-annex=10.20220927=nodep_h1234567_0
- git-annex=10.20220927=alldep_h2ca4687_100
- gmp=6.2.1=h58526e2_0
- gnupg=2.3.3=h7853c96_0
- humanize=4.4.0=pyhd8ed1ab_0
- idna=3.4=pyhd8ed1ab_0
- importlib-metadata=4.11.4=py310hff52083_0
Expand All @@ -57,34 +58,47 @@ dependencies:
- lcms2=2.12=hddcbb42_0
- ld_impl_linux-64=2.36.1=hea4e1c9_2
- lerc=4.0.0=h27087fc_0
- libassuan=2.5.5=h9c3ff4c_0
- libcbor=0.9.0=h9c3ff4c_0
- libcurl=7.83.1=h7bff187_0
- libdeflate=1.14=h166bdaf_0
- libedit=3.1.20191231=he28a2e2_2
- libev=4.33=h516909a_1
- libffi=3.4.2=h7f98852_5
- libfido2=1.11.0=h727a467_0
- libgcc-ng=12.1.0=h8d9b700_16
- libgcrypt=1.10.1=h166bdaf_0
- libglib=2.74.0=h7a41b64_0
- libgomp=12.1.0=h8d9b700_16
- libgpg-error=1.45=hc0c96e0_0
- libiconv=1.17=h166bdaf_0
- libksba=1.3.5=hf484d3e_1000
- libmagic=5.39=h753d276_1
- libnghttp2=1.47.0=hdcd2b5c_1
- libnsl=2.0.0=h7f98852_0
- libpng=1.6.38=h753d276_0
- libsqlite=3.39.3=h753d276_0
- libssh2=1.10.0=haa6b8db_3
- libstdcxx-ng=12.1.0=ha89aaad_16
- libtiff=4.4.0=h55922b4_4
- libudev1=249=h166bdaf_4
- libuuid=2.32.1=h7f98852_1000
- libwebp-base=1.2.4=h166bdaf_0
- libxcb=1.13=h7f98852_1004
- libzlib=1.2.12=h166bdaf_3
- lsof=4.89=h7f98852_1
- lz4-c=1.9.3=h9c3ff4c_1
- mccabe=0.6.1=py_1
- more-itertools=8.14.0=pyhd8ed1ab_0
- msgpack-python=1.0.4=py310hbf28c38_0
- mutagen=1.45.1=pyhd8ed1ab_0
- mypy=0.971=py310h5764c6d_0
- mypy_extensions=0.4.3=py310hff52083_5
- ncurses=6.3=h27087fc_1
- npth=1.6=h27087fc_1001
- ntbtls=0.1.2=hdbcaa40_1000
- openjpeg=2.5.0=h7d73246_1
- openssh=9.0p1=hf695f80_0
- openssl=1.1.1q=h166bdaf_0
- p7zip=16.02=h9c3ff4c_1001
- packaging=21.3=pyhd8ed1ab_0
Expand All @@ -96,6 +110,7 @@ dependencies:
- pip=22.2.2=pyhd8ed1ab_0
- platformdirs=2.5.2=pyhd8ed1ab_1
- pluggy=1.0.0=py310hff52083_3
- popt=1.16=h0b475e3_2002
- psutil=5.9.2=py310h5764c6d_0
- pthread-stubs=0.4=h36c2ea0_1001
- py=1.11.0=pyh6c4a22f_0
Expand All @@ -116,23 +131,29 @@ dependencies:
- requests=2.28.1=pyhd8ed1ab_1
- requests-ftp=0.3.1=py_1
- requests-toolbelt=0.9.1=py_0
- rsync=3.2.6=h220164a_0
- secretstorage=3.3.3=py310hff52083_0
- setuptools=65.4.0=pyhd8ed1ab_0
- shellcheck=0.8.0=ha770c72_0
- simplejson=3.17.6=py310h5764c6d_1
- six=1.16.0=pyh6c4a22f_0
- snowballstemmer=2.2.0=pyhd8ed1ab_0
- sqlite=3.39.3=h4ff8645_0
- tk=8.6.12=h27826a3_0
- tomli=2.0.1=pyhd8ed1ab_0
- tqdm=4.64.1=pyhd8ed1ab_0
- typed-ast=1.5.4=py310h5764c6d_0
- types-requests=2.28.11=pyhd8ed1ab_0
- types-urllib3=1.26.25=pyhd8ed1ab_0
- typing_extensions=4.3.0=pyha770c72_0
- tzdata=2022d=h191b570_0
- urllib3=1.26.11=pyhd8ed1ab_0
- wheel=0.37.1=pyhd8ed1ab_0
- whoosh=2.7.4=py310hff52083_6
- xorg-libxau=1.0.9=h7f98852_0
- xorg-libxdmcp=1.1.3=h7f98852_0
- xxhash=0.8.0=h7f98852_3
- xz=5.2.6=h166bdaf_0
- zipp=3.8.1=pyhd8ed1ab_0
- zlib=1.2.12=h166bdaf_3
- zstd=1.5.2=h6239696_4
8 changes: 5 additions & 3 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,10 @@ channels:
- nodefaults
dependencies:
- python ~=3.10.4
- datalad ~=0.17.5

# Dev dependencies:
- bump2version ~=1.0
- invoke ~=1.7
- bump2version ~=1.0
- black ~=22.3.0
- isort ~=5.10
- pytest ~=7.1
Expand All @@ -21,7 +20,10 @@ dependencies:
- flake8-docstrings ~=1.6
- flake8-use-fstring ~=1.3

- types-requests ~=2.28


# Runtime dependencies:
- click ~=8.1
- requests ~=2.23
- requests ~=2.28
- datalad ~=0.17.5
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[tool.black]
skip-string-normalization = true


[tool.isort]
profile = "black"
26 changes: 13 additions & 13 deletions json2datalad.sh → scripts/json2datalad.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@ set -o pipefail
set -o errexit
# set -o errtrace

PROGNAME=$(basename $0)
PROGNAME=$(basename "$0")
red='\033[0;31m'; orange='\033[0;33m'; green='\033[0;32m'; yellow='\033[0;93m'; nc='\033[0m' # No Color
log_info() { echo -e "${green}[$(date --iso-8601=seconds)] [INFO] [${PROGNAME}] ${@}${nc}"; }
log_warn() { echo -e "${orange}[$(date --iso-8601=seconds)] [WARN] [${PROGNAME}] ${@}${nc}"; }
log_err() { echo -e "${red}[$(date --iso-8601=seconds)] [ERR] [${PROGNAME}] ${@}${nc}" >&2; }
log_debug() { if [[ ${debug:-} == 1 ]]; then echo -e "${yellow}[$(date --iso-8601=seconds)] [DEBUG] [${PROGNAME}] ${@}${nc}"; fi }
err_exit() { echo -e "${red}[$(date --iso-8601=seconds)] [ERR] [${PROGNAME}] ${@:-"Unknown Error"}${nc}" >&2; exit 1; }
log_info() { if [[ "${verbose:-}" == 1 ]]; then echo -e "${green}[$(date --iso-8601=seconds)] [INFO] [${PROGNAME}] ${*}${nc}"; fi; }
Copy link
Collaborator Author

@MattF-NSIDC MattF-NSIDC Sep 29, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mankoff the $verbose variable previously had no effect. What do you think about this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure? When processing the CLI arg/flags I had set -o xtrace; which made things very verbose. But this is a good addition.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, to be more clear, the --verbose flag does have an effect, but the $verbose variable was not being used anywhere in the code. We could remove the $verbose variable entirely and only do set -o xtrace when the flag is passed? What do you think?

log_warn() { echo -e "${orange}[$(date --iso-8601=seconds)] [WARN] [${PROGNAME}] ${*}${nc}"; }
log_err() { echo -e "${red}[$(date --iso-8601=seconds)] [ERR] [${PROGNAME}] ${*}${nc}" >&2; }
log_debug() { if [[ ${debug:-} == 1 ]]; then echo -e "${yellow}[$(date --iso-8601=seconds)] [DEBUG] [${PROGNAME}] ${*}${nc}"; fi }
err_exit() { echo -e "${red}[$(date --iso-8601=seconds)] [ERR] [${PROGNAME}] ${*:-"Unknown Error"}${nc}" >&2; exit 1; }

trap ctrl_c INT # trap ctrl-c and call ctrl_c()
function ctrl_c() {
Expand Down Expand Up @@ -89,25 +89,25 @@ if [[ -z ${datalad_dir:-} ]]; then print_usage; err_exit "-d not set"; else log_
# download a dataset into a local datalad repository
function cdi_download() {
log_info "Running datalad addurls (DRYRUN)..."
datalad addurls -d ${datalad_dir} -n --fast --nosave ${jsonfile} '{link}' '{local_path}'
datalad addurls -d "${datalad_dir}" -n --fast --nosave "${jsonfile}" '{link}' '{local_path}'
log_info "Running datalad addurls..."
datalad addurls -d ${datalad_dir} --fast --nosave ${jsonfile} '{link}' '{local_path}'
datalad addurls -d "${datalad_dir}" --fast --nosave "${jsonfile}" '{link}' '{local_path}'
log_info "Running datalad save..."
datalad save ${datalad_dir} -m "Created ${datalad_dir}"
datalad save "${datalad_dir}" -m "Created ${datalad_dir}"
}

# Create a GitHub remote and push a local datalad repository to it
function cdi_set_remote() {
log_info "Creating GitHub repository"
gh repo create \
cryo-data/${datalad_dir} \
"cryo-data/${datalad_dir}" \
-d "${datalad_dir}" \
--public \
-s ${datalad_dir}
-s "${datalad_dir}"
# undo: gh repo delete cryo-data/${datalad_dir}
log_info "Pushing to GitHub"
(cd ${datalad_dir}; git push -u origin main)
(cd ${datalad_dir}; datalad push)
(cd "${datalad_dir}"; git push -u origin main)
(cd "${datalad_dir}"; datalad push)
}

cdi_download
Expand Down
10 changes: 10 additions & 0 deletions tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Invoke tasks."""

from invoke import Collection

from . import env, format, test

ns = Collection()
ns.add_collection(env)
ns.add_collection(format)
ns.add_collection(test)
32 changes: 32 additions & 0 deletions tasks/env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from invoke import task

from .util import PROJECT_DIR, print_and_run

ENV_LOCKFILE = PROJECT_DIR / "environment-lock.yml"


def _clean_lockfile_line(line: str) -> str | None:
    """Return `line` cleaned for the lockfile, or None if it should be dropped."""
    stripped = line.strip()

    # The prefix line contains a machine-specific directory.
    if line.startswith('prefix: '):
        return None

    # We don't want to use the NSIDC conda channel. Compare the whole
    # stripped line (not a substring) so dependency lines that merely
    # contain "nsidc" in a package name are not dropped by accident.
    if stripped == '- nsidc':
        return None

    # We want to replace the "defaults" channel with "nodefaults" so
    # conda-forge is used for everything. Exact match for the same reason
    # as above.
    if stripped == '- defaults':
        return line.replace('- defaults', '- nodefaults')

    return line


@task(default=True)
def lock(ctx):
    """Update the environment-lock.yml file from the current `cryo-data-ingest` environment.

    Exports the active conda environment, then rewrites the lockfile in
    place: the machine-specific ``prefix:`` line and the ``nsidc`` channel
    are removed, and the ``defaults`` channel is replaced by ``nodefaults``.
    """
    print_and_run(f"conda env export -n cryo-data-ingest > {ENV_LOCKFILE}")

    with open(ENV_LOCKFILE, "r") as f:
        lines = f.readlines()

    with open(ENV_LOCKFILE, "w") as f:
        for line in lines:
            cleaned = _clean_lockfile_line(line)
            if cleaned is not None:
                f.write(cleaned)
Loading