From e28a6d52c68a1d3bc3c5a2ce378cebb023ca5ac4 Mon Sep 17 00:00:00 2001 From: Rahul Mahrsee <86819420+mahrsee1997@users.noreply.github.com> Date: Fri, 21 Jul 2023 09:46:07 +0530 Subject: [PATCH 01/16] Pinned cython version to 0.29.34. (#363) * Pinned cython version. * Pinned cython version in ci-yamls. * Pinned cython in github workflow ci.yml. * Fix github workflow type-check ci.yml. * fix yaml syntax. * trying to fix ci.yml. --- .github/workflows/ci.yml | 40 +++++++++++++++++++--------------------- ci3.8.yml | 1 + ci3.9.yml | 1 + environment.yml | 1 + 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0325c936..31cefb0d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -91,6 +91,8 @@ jobs: runs-on: ubuntu-latest strategy: fail-fast: false + matrix: + python-version: ["3.8"] steps: - name: Cancel previous uses: styfle/cancel-workflow-action@0.7.0 @@ -98,28 +100,24 @@ jobs: access_token: ${{ github.token }} if: ${{github.ref != 'refs/head/main'}} - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v2 + - name: conda cache + uses: actions/cache@v2 + env: + # Increase this value to reset cache if etc/example-environment.yml has not changed + CACHE_NUMBER: 0 with: - python-version: "3.8" - - name: Setup conda - uses: s-weigand/setup-conda@v1 + path: ~/conda_pkgs_dir + key: + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ matrix.python-version }}-${{ hashFiles('ci3.8.yml') }} + - name: Setup conda environment + uses: conda-incubator/setup-miniconda@v2 with: - update-conda: true - python-version: "3.8" - conda-channels: anaconda, conda-forge - - name: Install ecCodes - run: | - conda install -y eccodes>=2.21.0 -c conda-forge - conda install -y pyproj -c conda-forge - conda install -y gdal -c conda-forge - - name: Get pip cache dir - id: pip-cache - run: | - python -m pip install --upgrade pip wheel - echo "::set-output name=dir::$(pip cache dir)" - - name: Install weather-tools + python-version: ${{ matrix.python-version }} + channels: conda-forge + environment-file: ci${{ matrix.python-version}}.yml + activate-environment: weather-tools + - name: Install weather-tools[test] run: | - pip install -e .[test] --use-deprecated=legacy-resolver + conda run -n weather-tools pip install -e .[test] --use-deprecated=legacy-resolver - name: Run type checker - run: pytype + run: conda run -n weather-tools pytype diff --git a/ci3.8.yml b/ci3.8.yml index d6a1e0bd..6b105598 100644 --- a/ci3.8.yml +++ b/ci3.8.yml @@ -34,5 +34,6 @@ dependencies: - google-cloud-sdk=410.0.0 - aria2=1.36.0 - pip: + - cython==0.29.34 - earthengine-api==0.1.329 - .[test] diff --git a/ci3.9.yml b/ci3.9.yml index a43cec16..a3cfac6c 100644 --- a/ci3.9.yml +++ b/ci3.9.yml @@ -34,5 +34,6 @@ dependencies: - xarray==2023.1.0 - ruff==0.0.260 - pip: + - cython==0.29.34 - earthengine-api==0.1.329 - .[test] diff --git a/environment.yml b/environment.yml index eae35f9c..e260c4f8 100644 --- a/environment.yml +++ b/environment.yml @@ -26,6 +26,7 @@ dependencies: - aria2=1.36.0 - pip=22.3 - pip: + - cython==0.29.34 - earthengine-api==0.1.329 - firebase-admin==6.0.1 - . 
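For reviewers who want to try the revised type-check job above outside of GitHub Actions, the following is a minimal local sketch of what the workflow now does, assuming conda is already installed and the commands are run from the repository root with the ci3.8.yml file from this patch; it is illustrative only and not part of the patch itself.

    # Recreate the pinned CI environment (ci3.8.yml now pins cython==0.29.34 in its pip section).
    conda env create -n weather-tools -f ci3.8.yml
    # Install the project the same way the workflow step does, inside that environment.
    conda run -n weather-tools pip install -e .[test] --use-deprecated=legacy-resolver
    # Run the type checker, mirroring the final workflow step.
    conda run -n weather-tools pytype
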
From bb2274d2be4ed51a9a6cc3bc79a8be110b250509 Mon Sep 17 00:00:00 2001 From: aniketinfocusp <122869307+aniketinfocusp@users.noreply.github.com> Date: Mon, 24 Jul 2023 12:18:25 +0530 Subject: [PATCH 02/16] Added time metrics to `weather-mv` ee (#361) * added time metrics to weather-mv ee * added doc string and lint fixes * added return type * updated time metrics to timestamp from ISO string. * updated get utc timestamp * fix lint issues --- weather_mv/loader_pipeline/ee.py | 8 +++++++- weather_mv/loader_pipeline/util.py | 4 +++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/weather_mv/loader_pipeline/ee.py b/weather_mv/loader_pipeline/ee.py index d12c320c..bff24ad5 100644 --- a/weather_mv/loader_pipeline/ee.py +++ b/weather_mv/loader_pipeline/ee.py @@ -37,7 +37,7 @@ from rasterio.io import MemoryFile from .sinks import ToDataSink, open_dataset, open_local, KwargsFactoryMixin -from .util import make_attrs_ee_compatible, RateLimit, validate_region +from .util import make_attrs_ee_compatible, RateLimit, validate_region, get_utc_timestamp logger = logging.getLogger(__name__) @@ -432,6 +432,8 @@ def add_to_queue(self, queue: Queue, item: t.Any): def convert_to_asset(self, queue: Queue, uri: str): """Converts source data into EE asset (GeoTiff or CSV) and uploads it to the bucket.""" logger.info(f'Converting {uri!r} to COGs...') + job_start_time = get_utc_timestamp() + with open_dataset(uri, self.open_dataset_kwargs, self.disable_grib_schema_normalization, @@ -447,6 +449,8 @@ def convert_to_asset(self, queue: Queue, uri: str): ('start_time', 'end_time', 'is_normalized')) dtype, crs, transform = (attrs.pop(key) for key in ['dtype', 'crs', 'transform']) attrs.update({'is_normalized': str(is_normalized)}) # EE properties does not support bool. + # Adding job_start_time to properites. + attrs["job_start_time"] = job_start_time # Make attrs EE ingestable. attrs = make_attrs_ee_compatible(attrs) @@ -602,6 +606,8 @@ def start_ingestion(self, asset_request: t.Dict) -> str: """Creates COG-backed asset in earth engine. Returns the asset id.""" self.check_setup() + asset_request['properties']['ingestion_time'] = get_utc_timestamp() + try: if self.ee_asset_type == 'IMAGE': result = ee.data.createAsset(asset_request) diff --git a/weather_mv/loader_pipeline/util.py b/weather_mv/loader_pipeline/util.py index a31a06a9..079b86de 100644 --- a/weather_mv/loader_pipeline/util.py +++ b/weather_mv/loader_pipeline/util.py @@ -28,7 +28,6 @@ import uuid from functools import partial from urllib.parse import urlparse - import apache_beam as beam import numpy as np import pandas as pd @@ -134,6 +133,9 @@ def _check_for_coords_vars(ds_data_var: str, target_var: str) -> bool: specified by the user.""" return ds_data_var.endswith('_'+target_var) or ds_data_var.startswith(target_var+'_') +def get_utc_timestamp() -> float: + """Returns the current UTC Timestamp.""" + return datetime.datetime.now().timestamp() def _only_target_coordinate_vars(ds: xr.Dataset, data_vars: t.List[str]) -> t.List[str]: """If the user specifies target fields in the dataset, get all the matching coords & data vars.""" From b8bb7bef8046dabaaad30a02a7aaf8a5ccc01eae Mon Sep 17 00:00:00 2001 From: dabhi_cusp <123355381+dabhicusp@users.noreply.github.com> Date: Tue, 8 Aug 2023 14:55:29 +0530 Subject: [PATCH 03/16] Weather-sp updated. (#377) * Splitter updated. * Bumping weather-sp version to v0.3.2. 
--- weather_sp/setup.py | 2 +- weather_sp/splitter_pipeline/file_splitters.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/weather_sp/setup.py b/weather_sp/setup.py index 59786c8e..e22279cd 100644 --- a/weather_sp/setup.py +++ b/weather_sp/setup.py @@ -44,7 +44,7 @@ packages=find_packages(), author='Anthromets', author_email='anthromets-ecmwf@google.com', - version='0.3.1', + version='0.3.2', url='https://weather-tools.readthedocs.io/en/latest/weather_sp/', description='A tool to split weather data files into per-variable files.', install_requires=beam_gcp_requirements + base_requirements, diff --git a/weather_sp/splitter_pipeline/file_splitters.py b/weather_sp/splitter_pipeline/file_splitters.py index 6456c426..7a4d3c77 100644 --- a/weather_sp/splitter_pipeline/file_splitters.py +++ b/weather_sp/splitter_pipeline/file_splitters.py @@ -16,6 +16,7 @@ import itertools import logging import os +import re import shutil import string import subprocess @@ -158,6 +159,10 @@ class GribSplitterV2(GribSplitter): See https://confluence.ecmwf.int/display/ECC/grib_copy. """ + def replace_non_numeric_bracket(self, match: re.Match) -> str: + value = match.group(1) + return f"[{value}]" if not value.isdigit() else "{" + value + "}" + def split_data(self) -> None: if not self.output_info.split_dims(): raise ValueError('No splitting specified in template.') @@ -172,7 +177,10 @@ def split_data(self) -> None: unformatted_output_path = self.output_info.unformatted_output_path() prefix, _ = os.path.split(next(iter(string.Formatter().parse(unformatted_output_path)))[0]) _, tail = unformatted_output_path.split(prefix) - output_template = tail.replace('{', '[').replace('}', ']') + + # Replace { with [ and } with ] only for non-numeric values inside {} of tail + output_str = re.sub(r'\{(\w+)\}', self.replace_non_numeric_bracket, tail) + output_template = output_str.format(*self.output_info.template_folders) slash = '/' delimiter = 'DELIMITER' From 3464844419a9db6aa134add9c4f77358c4e3a4ff Mon Sep 17 00:00:00 2001 From: DeepGabani <60647051+deepgabani8@users.noreply.github.com> Date: Fri, 11 Aug 2023 12:20:47 +0530 Subject: [PATCH 04/16] Update sparse data ingestion to handle huge datasets (#376) * Creating CSVs using file IO instead of pandas df * Updated meshgrid call * Removed meshgrid, since it could run out of memory for multiple dimensions * Added comments. * Resolved a CI error * using csv library to write a csv + small changes. --- weather_mv/loader_pipeline/ee.py | 51 ++++++++++++++++++++--------- weather_mv/loader_pipeline/sinks.py | 5 +++ 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/weather_mv/loader_pipeline/ee.py b/weather_mv/loader_pipeline/ee.py index bff24ad5..094b3a17 100644 --- a/weather_mv/loader_pipeline/ee.py +++ b/weather_mv/loader_pipeline/ee.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import argparse +import csv import dataclasses import json import logging +import math import os import re import shutil @@ -27,7 +29,6 @@ import apache_beam as beam import ee import numpy as np -import xarray as xr from apache_beam.io.filesystems import FileSystems from apache_beam.io.gcp.gcsio import WRITE_CHUNK_SIZE from apache_beam.options.pipeline_options import PipelineOptions @@ -36,7 +37,7 @@ from google.auth.transport import requests from rasterio.io import MemoryFile -from .sinks import ToDataSink, open_dataset, open_local, KwargsFactoryMixin +from .sinks import ToDataSink, open_dataset, open_local, KwargsFactoryMixin, upload from .util import make_attrs_ee_compatible, RateLimit, validate_region, get_utc_timestamp logger = logging.getLogger(__name__) @@ -51,6 +52,7 @@ 'IMAGE': '.tiff', 'TABLE': '.csv' } +ROWS_PER_WRITE = 10_000 # Number of rows per feature collection write. def is_compute_engine() -> bool: @@ -486,21 +488,40 @@ def convert_to_asset(self, queue: Queue, uri: str): channel_names = [] file_name = f'{asset_name}.csv' - df = xr.Dataset.to_dataframe(ds) - df = df.reset_index() - # NULL and NaN create data-type mismatch issue in ee therefore replacing all of them. - # fillna fills in NaNs, NULLs, and NaTs but we have to exclude NaTs. - non_nat = df.select_dtypes(exclude=['datetime', 'timedelta', 'datetimetz']) - df[non_nat.columns] = non_nat.fillna(-9999) + shape = math.prod(list(ds.dims.values())) + # Names of dimesions, coordinates and data variables. + dims = list(ds.dims) + coords = [c for c in list(ds.coords) if c not in dims] + vars = list(ds.data_vars) + header = dims + coords + vars - # Copy in-memory dataframe to gcs. + # Data of dimesions, coordinates and data variables. + dims_data = [ds[dim].data for dim in dims] + coords_data = [np.full((shape,), ds[coord].data) for coord in coords] + vars_data = [ds[var].data.flatten() for var in vars] + data = coords_data + vars_data + + dims_shape = [len(ds[dim].data) for dim in dims] + + def get_dims_data(index: int) -> t.List[t.Any]: + """Returns dimensions for the given flattened index.""" + return [ + dim[int(index / math.prod(dims_shape[i+1:])) % len(dim)] for (i, dim) in enumerate(dims_data) + ] + + # Copy CSV to gcs. target_path = os.path.join(self.asset_location, file_name) - with tempfile.NamedTemporaryFile() as tmp_df: - df.to_csv(tmp_df.name, index=False) - tmp_df.flush() - tmp_df.seek(0) - with FileSystems().create(target_path) as dst: - shutil.copyfileobj(tmp_df, dst, WRITE_CHUNK_SIZE) + with tempfile.NamedTemporaryFile() as temp: + with open(temp.name, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerows([header]) + # Write rows in batches. 
+ for i in range(0, shape, ROWS_PER_WRITE): + writer.writerows( + [get_dims_data(i) + list(row) for row in zip(*[d[i:i + ROWS_PER_WRITE] for d in data])] + ) + + upload(temp.name, target_path) asset_data = AssetData( name=asset_name, diff --git a/weather_mv/loader_pipeline/sinks.py b/weather_mv/loader_pipeline/sinks.py index 0f8cd561..22569bbb 100644 --- a/weather_mv/loader_pipeline/sinks.py +++ b/weather_mv/loader_pipeline/sinks.py @@ -326,6 +326,11 @@ def __open_dataset_file(filename: str, False) +def upload(src: str, dst: str) -> None: + """Uploads a file to the specified GCS bucket destination.""" + subprocess.run(f'gsutil -m cp {src} {dst}'.split(), check=True, capture_output=True, text=True, input="n/n") + + def copy(src: str, dst: str) -> None: """Copy data via `gcloud alpha storage` or `gsutil`.""" errors: t.List[subprocess.CalledProcessError] = [] From 9cdf0e7bb54d4b07d25a6f86fbf2e74c02d61a24 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Wed, 16 Aug 2023 11:57:22 -0700 Subject: [PATCH 05/16] `weather-mv` will ingest data into BQ from Zarr much faster. (#357) * Fixed issues found loading Zarr into BQ. Found a couple of errors with loading a Zarr dataset into BigQuery. * Base weather-tools install requires gcsfs. * Not normalized by default. * Parallel Zarr ingestion into BQ. * Fix setup.py syntax error. * Fixing Zarr + Xarray-Beam support. * Added happy path unit test for parallel zarr reading in BQ. * fix flake8 issues. * Better whitespace. * Adding open_ds kwargs to open zarr. * Attempting to fix pickling issues. * Another attempt to fix pickling error, now in transform. * Experiment: is xbeam.open_zarr the issue? * adding engine=zarr. * open_zarr --> open_dataset w/ engine. * delete regrid * Pinned Zarr version. * Hard coded current CL for docker image. * rm unnecessary delete. * Only recent years. * All data w/ streaming inserts. * Experiment: added windowing. * Documented `timestamp_row` fn. * Self-review: Prepared changes for PR. * Small cleanup. * Remove debug isel. * Added types to `to_rows()`. * Fixed flake8 lint errors. * Better types for `to_rows()`. * Test updated and 'chunks' removed from zarr_kwargs * Zarr version updated. 
--------- Co-authored-by: dabhi_cusp --- ci3.8.yml | 3 +- ci3.9.yml | 3 +- environment.yml | 3 +- setup.py | 6 +- weather_mv/loader_pipeline/bq.py | 135 +++++++++++++--------- weather_mv/loader_pipeline/bq_test.py | 48 ++++++-- weather_mv/loader_pipeline/pipeline.py | 8 +- weather_mv/loader_pipeline/regrid_test.py | 2 +- weather_mv/loader_pipeline/sinks.py | 4 +- weather_mv/loader_pipeline/sinks_test.py | 1 + weather_mv/loader_pipeline/util_test.py | 37 +++--- weather_mv/setup.py | 3 + 12 files changed, 163 insertions(+), 90 deletions(-) diff --git a/ci3.8.yml b/ci3.8.yml index 6b105598..803e59c4 100644 --- a/ci3.8.yml +++ b/ci3.8.yml @@ -16,7 +16,7 @@ dependencies: - requests=2.28.1 - netcdf4=1.6.1 - rioxarray=0.13.4 - - xarray-beam=0.3.1 + - xarray-beam=0.6.2 - ecmwf-api-client=1.6.3 - fsspec=2022.11.0 - gcsfs=2022.11.0 @@ -33,6 +33,7 @@ dependencies: - ruff==0.0.260 - google-cloud-sdk=410.0.0 - aria2=1.36.0 + - zarr=2.15.0 - pip: - cython==0.29.34 - earthengine-api==0.1.329 diff --git a/ci3.9.yml b/ci3.9.yml index a3cfac6c..e9e0671f 100644 --- a/ci3.9.yml +++ b/ci3.9.yml @@ -16,7 +16,7 @@ dependencies: - requests=2.28.1 - netcdf4=1.6.1 - rioxarray=0.13.4 - - xarray-beam=0.3.1 + - xarray-beam=0.6.2 - ecmwf-api-client=1.6.3 - fsspec=2022.11.0 - gcsfs=2022.11.0 @@ -33,6 +33,7 @@ dependencies: - aria2=1.36.0 - xarray==2023.1.0 - ruff==0.0.260 + - zarr=2.15.0 - pip: - cython==0.29.34 - earthengine-api==0.1.329 diff --git a/environment.yml b/environment.yml index e260c4f8..0b043980 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: dependencies: - python=3.8.13 - apache-beam=2.40.0 - - xarray-beam=0.3.1 + - xarray-beam=0.6.2 - xarray=2023.1.0 - fsspec=2022.11.0 - gcsfs=2022.11.0 @@ -25,6 +25,7 @@ dependencies: - google-cloud-sdk=410.0.0 - aria2=1.36.0 - pip=22.3 + - zarr=2.15.0 - pip: - cython==0.29.34 - earthengine-api==0.1.329 diff --git a/setup.py b/setup.py index af798889..dedb552e 100644 --- a/setup.py +++ b/setup.py @@ -57,8 +57,9 @@ "earthengine-api>=0.1.263", "pyproj", # requires separate binary installation! "gdal", # requires separate binary installation! 
- "xarray-beam==0.3.1", + "xarray-beam==0.6.2", "gcsfs==2022.11.0", + "zarr==2.15.0", ] weather_sp_requirements = [ @@ -82,6 +83,7 @@ "memray", "pytest-memray", "h5py", + "pooch", ] all_test_requirements = beam_gcp_requirements + weather_dl_requirements + \ @@ -115,7 +117,7 @@ ], python_requires='>=3.8, <3.10', - install_requires=['apache-beam[gcp]==2.40.0'], + install_requires=['apache-beam[gcp]==2.40.0', 'gcsfs==2022.11.0'], use_scm_version=True, setup_requires=['setuptools_scm'], scripts=['weather_dl/weather-dl', 'weather_mv/weather-mv', 'weather_sp/weather-sp'], diff --git a/weather_mv/loader_pipeline/bq.py b/weather_mv/loader_pipeline/bq.py index 58120940..5f466419 100644 --- a/weather_mv/loader_pipeline/bq.py +++ b/weather_mv/loader_pipeline/bq.py @@ -24,8 +24,10 @@ import geojson import numpy as np import xarray as xr +import xarray_beam as xbeam from apache_beam.io import WriteToBigQuery, BigQueryDisposition from apache_beam.options.pipeline_options import PipelineOptions +from apache_beam.transforms import window from google.cloud import bigquery from xarray.core.utils import ensure_us_time_resolution @@ -236,73 +238,90 @@ def extract_rows(self, uri: str, coordinates: t.List[t.Dict]) -> t.Iterator[t.Di with open_dataset(uri, self.xarray_open_dataset_kwargs, self.disable_grib_schema_normalization, self.tif_metadata_for_datetime, is_zarr=self.zarr) as ds: data_ds: xr.Dataset = _only_target_vars(ds, self.variables) + yield from self.to_rows(coordinates, data_ds, uri) - first_ts_raw = data_ds.time[0].values if isinstance(data_ds.time.values, - np.ndarray) else data_ds.time.values - first_time_step = to_json_serializable_type(first_ts_raw) - - for it in coordinates: - # Use those index values to select a Dataset containing one row of data. - row_ds = data_ds.loc[it] - - # Create a Name-Value map for data columns. Result looks like: - # {'d': -2.0187, 'cc': 0.007812, 'z': 50049.8, 'rr': None} - row = {n: to_json_serializable_type(ensure_us_time_resolution(v.values)) - for n, v in row_ds.data_vars.items()} - - # Serialize coordinates. - it = {k: to_json_serializable_type(v) for k, v in it.items()} - - # Add indexed coordinates. - row.update(it) - # Add un-indexed coordinates. - for c in row_ds.coords: - if c not in it and (not self.variables or c in self.variables): - row[c] = to_json_serializable_type(ensure_us_time_resolution(row_ds[c].values)) - - # Add import metadata. - row[DATA_IMPORT_TIME_COLUMN] = self.import_time - row[DATA_URI_COLUMN] = uri - row[DATA_FIRST_STEP] = first_time_step - - longitude = ((row['longitude'] + 180) % 360) - 180 - row[GEO_POINT_COLUMN] = fetch_geo_point(row['latitude'], longitude) - row[GEO_POLYGON_COLUMN] = ( - fetch_geo_polygon(row["latitude"], longitude, self.lat_grid_resolution, self.lon_grid_resolution) - if not self.skip_creating_polygon - else None - ) - # 'row' ends up looking like: - # {'latitude': 88.0, 'longitude': 2.0, 'time': '2015-01-01 06:00:00', 'd': -2.0187, 'cc': 0.007812, - # 'z': 50049.8, 'data_import_time': '2020-12-05 00:12:02.424573 UTC', ...} - beam.metrics.Metrics.counter('Success', 'ExtractRows').inc() - yield row + def to_rows(self, coordinates: t.Iterable[t.Dict], ds: xr.Dataset, uri: str) -> t.Iterator[t.Dict]: + first_ts_raw = ( + ds.time[0].values if isinstance(ds.time.values, np.ndarray) + else ds.time.values + ) + first_time_step = to_json_serializable_type(first_ts_raw) + for it in coordinates: + # Use those index values to select a Dataset containing one row of data. 
+ row_ds = ds.loc[it] + + # Create a Name-Value map for data columns. Result looks like: + # {'d': -2.0187, 'cc': 0.007812, 'z': 50049.8, 'rr': None} + row = {n: to_json_serializable_type(ensure_us_time_resolution(v.values)) + for n, v in row_ds.data_vars.items()} + + # Serialize coordinates. + it = {k: to_json_serializable_type(v) for k, v in it.items()} + + # Add indexed coordinates. + row.update(it) + # Add un-indexed coordinates. + for c in row_ds.coords: + if c not in it and (not self.variables or c in self.variables): + row[c] = to_json_serializable_type(ensure_us_time_resolution(row_ds[c].values)) + + # Add import metadata. + row[DATA_IMPORT_TIME_COLUMN] = self.import_time + row[DATA_URI_COLUMN] = uri + row[DATA_FIRST_STEP] = first_time_step + + longitude = ((row['longitude'] + 180) % 360) - 180 + row[GEO_POINT_COLUMN] = fetch_geo_point(row['latitude'], longitude) + row[GEO_POLYGON_COLUMN] = ( + fetch_geo_polygon(row["latitude"], longitude, self.lat_grid_resolution, self.lon_grid_resolution) + if not self.skip_creating_polygon + else None + ) + # 'row' ends up looking like: + # {'latitude': 88.0, 'longitude': 2.0, 'time': '2015-01-01 06:00:00', 'd': -2.0187, 'cc': 0.007812, + # 'z': 50049.8, 'data_import_time': '2020-12-05 00:12:02.424573 UTC', ...} + beam.metrics.Metrics.counter('Success', 'ExtractRows').inc() + yield row + + def chunks_to_rows(self, _, ds: xr.Dataset) -> t.Iterator[t.Dict]: + uri = ds.attrs.get(DATA_URI_COLUMN, '') + # Re-calculate import time for streaming extractions. + if not self.import_time or self.zarr: + self.import_time = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc) + yield from self.to_rows(get_coordinates(ds, uri), ds, uri) def expand(self, paths): """Extract rows of variables from data paths into a BigQuery table.""" - extracted_rows = ( + if not self.zarr: + extracted_rows = ( paths | 'PrepareCoordinates' >> beam.FlatMap(self.prepare_coordinates) | beam.Reshuffle() | 'ExtractRows' >> beam.FlatMapTuple(self.extract_rows) - ) - - if not self.dry_run: - ( - extracted_rows - | 'WriteToBigQuery' >> WriteToBigQuery( - project=self.table.project, - dataset=self.table.dataset_id, - table=self.table.table_id, - write_disposition=BigQueryDisposition.WRITE_APPEND, - create_disposition=BigQueryDisposition.CREATE_NEVER) ) else: - ( - extracted_rows - | 'Log Extracted Rows' >> beam.Map(logger.debug) + ds, chunks = xbeam.open_zarr(self.first_uri, **self.xarray_open_dataset_kwargs) + ds.attrs[DATA_URI_COLUMN] = self.first_uri + extracted_rows = ( + paths + | 'OpenChunks' >> xbeam.DatasetToChunks(ds, chunks) + | 'ExtractRows' >> beam.FlatMapTuple(self.chunks_to_rows) + | 'Window' >> beam.WindowInto(window.FixedWindows(60)) + | 'AddTimestamp' >> beam.Map(timestamp_row) ) + if self.dry_run: + return extracted_rows | 'Log Rows' >> beam.Map(logger.info) + return ( + extracted_rows + | 'WriteToBigQuery' >> WriteToBigQuery( + project=self.table.project, + dataset=self.table.dataset_id, + table=self.table.table_id, + write_disposition=BigQueryDisposition.WRITE_APPEND, + create_disposition=BigQueryDisposition.CREATE_NEVER) + ) + def map_dtype_to_sql_type(var_type: np.dtype) -> str: """Maps a np.dtype to a suitable BigQuery column type.""" @@ -343,6 +362,12 @@ def to_table_schema(columns: t.List[t.Tuple[str, str]]) -> t.List[bigquery.Schem return fields +def timestamp_row(it: t.Dict) -> window.TimestampedValue: + """Associate an extracted row with the import_time timestamp.""" + timestamp = it[DATA_IMPORT_TIME_COLUMN].timestamp() + return 
window.TimestampedValue(it, timestamp) + + def fetch_geo_point(lat: float, long: float) -> str: """Calculates a geography point from an input latitude and longitude.""" if lat > LATITUDE_RANGE[1] or lat < LATITUDE_RANGE[0]: diff --git a/weather_mv/loader_pipeline/bq_test.py b/weather_mv/loader_pipeline/bq_test.py index ed96cd9d..224a7a93 100644 --- a/weather_mv/loader_pipeline/bq_test.py +++ b/weather_mv/loader_pipeline/bq_test.py @@ -15,6 +15,7 @@ import json import logging import os +import tempfile import typing as t import unittest @@ -23,6 +24,8 @@ import pandas as pd import simplejson import xarray as xr +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.util import assert_that, is_not_empty from google.cloud.bigquery import SchemaField from .bq import ( @@ -205,13 +208,13 @@ def extract(self, data_path, *, variables=None, area=None, open_dataset_kwargs=N skip_creating_polygon: bool = False) -> t.Iterator[t.Dict]: if zarr_kwargs is None: zarr_kwargs = {} - op = ToBigQuery.from_kwargs(first_uri=data_path, dry_run=True, zarr=zarr, zarr_kwargs=zarr_kwargs, - output_table='foo.bar.baz', variables=variables, area=area, - xarray_open_dataset_kwargs=open_dataset_kwargs, import_time=import_time, - infer_schema=False, tif_metadata_for_datetime=tif_metadata_for_datetime, - skip_region_validation=True, - disable_grib_schema_normalization=disable_grib_schema_normalization, - coordinate_chunk_size=1000, skip_creating_polygon=skip_creating_polygon) + op = ToBigQuery.from_kwargs( + first_uri=data_path, dry_run=True, zarr=zarr, zarr_kwargs=zarr_kwargs, + output_table='foo.bar.baz', variables=variables, area=area, + xarray_open_dataset_kwargs=open_dataset_kwargs, import_time=import_time, infer_schema=False, + tif_metadata_for_datetime=tif_metadata_for_datetime, skip_region_validation=True, + disable_grib_schema_normalization=disable_grib_schema_normalization, coordinate_chunk_size=1000, + skip_creating_polygon=skip_creating_polygon) coords = op.prepare_coordinates(data_path) for uri, chunk in coords: yield from op.extract_rows(uri, chunk) @@ -737,5 +740,36 @@ def test_multiple_editions__with_vars__includes_coordinates_in_vars__with_schema self.assertRowsEqual(actual, expected) +class ExtractRowsFromZarrTest(ExtractRowsTestBase): + + def setUp(self) -> None: + super().setUp() + self.tmpdir = tempfile.TemporaryDirectory() + + def tearDown(self) -> None: + super().tearDown() + self.tmpdir.cleanup() + + def test_extracts_rows(self): + input_zarr = os.path.join(self.tmpdir.name, 'air_temp.zarr') + + ds = ( + xr.tutorial.open_dataset('air_temperature', cache_dir=self.test_data_folder) + .isel(time=slice(0, 4), lat=slice(0, 4), lon=slice(0, 4)) + .rename(dict(lon='longitude', lat='latitude')) + ) + ds.to_zarr(input_zarr) + + op = ToBigQuery.from_kwargs( + first_uri=input_zarr, zarr_kwargs=dict(), dry_run=True, zarr=True, output_table='foo.bar.baz', + variables=list(), area=list(), xarray_open_dataset_kwargs=dict(), import_time=None, infer_schema=False, + tif_metadata_for_datetime=None, skip_region_validation=True, disable_grib_schema_normalization=False, + ) + + with TestPipeline() as p: + result = p | op + assert_that(result, is_not_empty()) + + if __name__ == '__main__': unittest.main() diff --git a/weather_mv/loader_pipeline/pipeline.py b/weather_mv/loader_pipeline/pipeline.py index c12bd5f5..f6d40c41 100644 --- a/weather_mv/loader_pipeline/pipeline.py +++ b/weather_mv/loader_pipeline/pipeline.py @@ -27,7 +27,7 @@ from .streaming import GroupMessagesByFixedWindows, 
ParsePaths logger = logging.getLogger(__name__) -SDK_CONTAINER_IMAGE='gcr.io/weather-tools-prod/weather-tools:0.0.0' +SDK_CONTAINER_IMAGE = 'gcr.io/weather-tools-prod/weather-tools:0.0.0' def configure_logger(verbosity: int) -> None: @@ -55,8 +55,9 @@ def pipeline(known_args: argparse.Namespace, pipeline_args: t.List[str]) -> None known_args.first_uri = next(iter(all_uris)) with beam.Pipeline(argv=pipeline_args) as p: - if known_args.topic or known_args.subscription: - + if known_args.zarr: + paths = p + elif known_args.topic or known_args.subscription: paths = ( p # Windowing is based on this code sample: @@ -140,7 +141,6 @@ def run(argv: t.List[str]) -> t.Tuple[argparse.Namespace, t.List[str]]: # Validate Zarr arguments if known_args.uris.endswith('.zarr'): known_args.zarr = True - known_args.zarr_kwargs['chunks'] = known_args.zarr_kwargs.get('chunks', None) if known_args.zarr_kwargs and not known_args.zarr: raise ValueError('`--zarr_kwargs` argument is only allowed with valid Zarr input URI.') diff --git a/weather_mv/loader_pipeline/regrid_test.py b/weather_mv/loader_pipeline/regrid_test.py index 5cc5b2a1..87ffaad4 100644 --- a/weather_mv/loader_pipeline/regrid_test.py +++ b/weather_mv/loader_pipeline/regrid_test.py @@ -122,7 +122,7 @@ def test_zarr__coarsen(self): self.Op, first_uri=input_zarr, output_path=output_zarr, - zarr_input_chunks={"time": 5}, + zarr_input_chunks={"time": 25}, zarr=True ) diff --git a/weather_mv/loader_pipeline/sinks.py b/weather_mv/loader_pipeline/sinks.py index 22569bbb..d123ce23 100644 --- a/weather_mv/loader_pipeline/sinks.py +++ b/weather_mv/loader_pipeline/sinks.py @@ -177,7 +177,7 @@ def _replace_dataarray_names_with_long_names(ds: xr.Dataset): datetime_value_ms = None try: datetime_value_s = (int(end_time.timestamp()) if end_time is not None - else int(ds.attrs[tif_metadata_for_datetime]) / 1000.0) + else int(ds.attrs[tif_metadata_for_datetime]) / 1000.0) ds = ds.assign_coords({'time': datetime.datetime.utcfromtimestamp(datetime_value_s)}) except KeyError: raise RuntimeError(f"Invalid datetime metadata of tif: {tif_metadata_for_datetime}.") @@ -380,7 +380,7 @@ def open_dataset(uri: str, """Open the dataset at 'uri' and return a xarray.Dataset.""" try: if is_zarr: - ds: xr.Dataset = xr.open_dataset(uri, engine='zarr', **open_dataset_kwargs) + ds: xr.Dataset = _add_is_normalized_attr(xr.open_dataset(uri, engine='zarr', **open_dataset_kwargs), False) beam.metrics.Metrics.counter('Success', 'ReadNetcdfData').inc() yield ds ds.close() diff --git a/weather_mv/loader_pipeline/sinks_test.py b/weather_mv/loader_pipeline/sinks_test.py index f7ca1641..a759c586 100644 --- a/weather_mv/loader_pipeline/sinks_test.py +++ b/weather_mv/loader_pipeline/sinks_test.py @@ -112,6 +112,7 @@ def test_opens_zarr(self): with open_dataset(self.test_zarr_path, is_zarr=True, open_dataset_kwargs={}) as ds: self.assertIsNotNone(ds) self.assertEqual(list(ds.data_vars), ['cape', 'd2m']) + def test_open_dataset__fits_memory_bounds(self): with write_netcdf() as test_netcdf_path: with limit_memory(max_memory=30): diff --git a/weather_mv/loader_pipeline/util_test.py b/weather_mv/loader_pipeline/util_test.py index dae9c873..65d9169e 100644 --- a/weather_mv/loader_pipeline/util_test.py +++ b/weather_mv/loader_pipeline/util_test.py @@ -38,9 +38,10 @@ def test_gets_indexed_coordinates(self): ds = xr.open_dataset(self.test_data_path) self.assertEqual( next(get_coordinates(ds)), - {'latitude': 49.0, - 'longitude':-108.0, - 'time': 
datetime.fromisoformat('2018-01-02T06:00:00+00:00').replace(tzinfo=None)} + { + 'latitude': 49.0, + 'longitude': -108.0, + 'time': datetime.fromisoformat('2018-01-02T06:00:00+00:00').replace(tzinfo=None)} ) def test_no_duplicate_coordinates(self): @@ -91,24 +92,28 @@ def test_get_coordinates(self): actual, [ [ - {'longitude': -108.0, - 'latitude': 49.0, - 'time': datetime.fromisoformat('2018-01-02T06:00:00+00:00').replace(tzinfo=None) + { + 'longitude': -108.0, + 'latitude': 49.0, + 'time': datetime.fromisoformat('2018-01-02T06:00:00+00:00').replace(tzinfo=None) }, - {'longitude': -108.0, - 'latitude': 49.0, - 'time': datetime.fromisoformat('2018-01-02T07:00:00+00:00').replace(tzinfo=None) + { + 'longitude': -108.0, + 'latitude': 49.0, + 'time': datetime.fromisoformat('2018-01-02T07:00:00+00:00').replace(tzinfo=None) }, - {'longitude': -108.0, - 'latitude': 49.0, - 'time': datetime.fromisoformat('2018-01-02T08:00:00+00:00').replace(tzinfo=None) + { + 'longitude': -108.0, + 'latitude': 49.0, + 'time': datetime.fromisoformat('2018-01-02T08:00:00+00:00').replace(tzinfo=None) }, ], [ - {'longitude': -108.0, - 'latitude': 49.0, - 'time': datetime.fromisoformat('2018-01-02T09:00:00+00:00').replace(tzinfo=None) - } + { + 'longitude': -108.0, + 'latitude': 49.0, + 'time': datetime.fromisoformat('2018-01-02T09:00:00+00:00').replace(tzinfo=None) + } ] ] ) diff --git a/weather_mv/setup.py b/weather_mv/setup.py index bfe09713..f200a822 100644 --- a/weather_mv/setup.py +++ b/weather_mv/setup.py @@ -45,6 +45,7 @@ "numpy==1.22.4", "pandas==1.5.1", "xarray==2023.1.0", + "xarray-beam==0.6.2", "cfgrib==0.9.10.2", "netcdf4==1.6.1", "geojson==2.5.0", @@ -55,6 +56,8 @@ "earthengine-api>=0.1.263", "pyproj==3.4.0", # requires separate binary installation! "gdal==3.5.1", # requires separate binary installation! + "gcsfs==2022.11.0", + "zarr==2.15.0", ] setup( From 90a3664d19808571233b8e6f134f66e85c89d263 Mon Sep 17 00:00:00 2001 From: dabhi_cusp <123355381+dabhicusp@users.noreply.github.com> Date: Wed, 13 Sep 2023 22:29:27 +0530 Subject: [PATCH 06/16] Fixed the geo-polygon injestion in bigquery. (#391) * Polygon in bigquery injestion updated. * Band-aid for Issue #392 * Data type of Geo_polygon is updated in schema. * Test-cases updated. --- .github/workflows/ci.yml | 2 +- weather_mv/loader_pipeline/bq.py | 6 ++--- weather_mv/loader_pipeline/bq_test.py | 38 +++++++++++++-------------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 31cefb0d..ac86adc1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -84,7 +84,7 @@ jobs: echo "::set-output name=dir::$(pip cache dir)" - name: Install linter run: | - pip install ruff + pip install ruff==0.0.280 - name: Lint project run: ruff check . 
type-check: diff --git a/weather_mv/loader_pipeline/bq.py b/weather_mv/loader_pipeline/bq.py index 5f466419..f62f81e1 100644 --- a/weather_mv/loader_pipeline/bq.py +++ b/weather_mv/loader_pipeline/bq.py @@ -357,7 +357,7 @@ def to_table_schema(columns: t.List[t.Tuple[str, str]]) -> t.List[bigquery.Schem fields.append(bigquery.SchemaField(DATA_URI_COLUMN, 'STRING', mode='NULLABLE')) fields.append(bigquery.SchemaField(DATA_FIRST_STEP, 'TIMESTAMP', mode='NULLABLE')) fields.append(bigquery.SchemaField(GEO_POINT_COLUMN, 'GEOGRAPHY', mode='NULLABLE')) - fields.append(bigquery.SchemaField(GEO_POLYGON_COLUMN, 'STRING', mode='NULLABLE')) + fields.append(bigquery.SchemaField(GEO_POLYGON_COLUMN, 'GEOGRAPHY', mode='NULLABLE')) return fields @@ -392,13 +392,13 @@ def fetch_geo_polygon(latitude: float, longitude: float, lat_grid_resolution: fl The `get_lat_lon_range` function gives the `.` point and `bound_point` gives the `*` point. """ lat_lon_bound = bound_point(latitude, longitude, lat_grid_resolution, lon_grid_resolution) - polygon = geojson.dumps(geojson.Polygon([ + polygon = geojson.dumps(geojson.Polygon([[ (lat_lon_bound[0][0], lat_lon_bound[0][1]), # lower_left (lat_lon_bound[1][0], lat_lon_bound[1][1]), # upper_left (lat_lon_bound[2][0], lat_lon_bound[2][1]), # upper_right (lat_lon_bound[3][0], lat_lon_bound[3][1]), # lower_right (lat_lon_bound[0][0], lat_lon_bound[0][1]), # lower_left - ])) + ]])) return polygon diff --git a/weather_mv/loader_pipeline/bq_test.py b/weather_mv/loader_pipeline/bq_test.py index 224a7a93..174c6cd4 100644 --- a/weather_mv/loader_pipeline/bq_test.py +++ b/weather_mv/loader_pipeline/bq_test.py @@ -78,7 +78,7 @@ def test_schema_generation(self): SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None), SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None), SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None), - SchemaField('geo_polygon', 'STRING', 'NULLABLE', None, (), None) + SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None) ] self.assertListEqual(schema, expected_schema) @@ -94,7 +94,7 @@ def test_schema_generation__with_schema_normalization(self): SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None), SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None), SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None), - SchemaField('geo_polygon', 'STRING', 'NULLABLE', None, (), None) + SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None) ] self.assertListEqual(schema, expected_schema) @@ -110,7 +110,7 @@ def test_schema_generation__with_target_columns(self): SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None), SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None), SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None), - SchemaField('geo_polygon', 'STRING', 'NULLABLE', None, (), None) + SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None) ] self.assertListEqual(schema, expected_schema) @@ -126,7 +126,7 @@ def test_schema_generation__with_target_columns__with_schema_normalization(self) SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None), SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None), SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None), - SchemaField('geo_polygon', 'STRING', 'NULLABLE', None, (), None) + SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None) ] self.assertListEqual(schema, expected_schema) @@ -143,7 +143,7 @@ def 
test_schema_generation__no_targets_specified(self): SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None), SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None), SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None), - SchemaField('geo_polygon', 'STRING', 'NULLABLE', None, (), None) + SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None) ] self.assertListEqual(schema, expected_schema) @@ -160,7 +160,7 @@ def test_schema_generation__no_targets_specified__with_schema_normalization(self SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None), SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None), SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None), - SchemaField('geo_polygon', 'STRING', 'NULLABLE', None, (), None) + SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None) ] self.assertListEqual(schema, expected_schema) @@ -194,7 +194,7 @@ def test_schema_generation__non_index_coords(self): SchemaField('data_uri', 'STRING', 'NULLABLE', None, (), None), SchemaField('data_first_step', 'TIMESTAMP', 'NULLABLE', None, (), None), SchemaField('geo_point', 'GEOGRAPHY', 'NULLABLE', None, (), None), - SchemaField('geo_polygon', 'STRING', 'NULLABLE', None, (), None) + SchemaField('geo_polygon', 'GEOGRAPHY', 'NULLABLE', None, (), None) ] self.assertListEqual(schema, expected_schema) @@ -401,18 +401,18 @@ def test_extract_rows__with_valid_lat_long_with_polygon(self): valid_lat_long = [[-90, 0], [-90, -180], [-45, -180], [-45, 180], [0, 0], [90, 180], [45, -180], [-90, 180], [90, 1], [0, 180], [1, -180], [90, -180]] actual_val = [ - '{"type": "Polygon", "coordinates": [[-1, 89], [-1, -89], [1, -89], [1, 89], [-1, 89]]}', - '{"type": "Polygon", "coordinates": [[179, 89], [179, -89], [-179, -89], [-179, 89], [179, 89]]}', - '{"type": "Polygon", "coordinates": [[179, -46], [179, -44], [-179, -44], [-179, -46], [179, -46]]}', - '{"type": "Polygon", "coordinates": [[179, -46], [179, -44], [-179, -44], [-179, -46], [179, -46]]}', - '{"type": "Polygon", "coordinates": [[-1, -1], [-1, 1], [1, 1], [1, -1], [-1, -1]]}', - '{"type": "Polygon", "coordinates": [[179, 89], [179, -89], [-179, -89], [-179, 89], [179, 89]]}', - '{"type": "Polygon", "coordinates": [[179, 44], [179, 46], [-179, 46], [-179, 44], [179, 44]]}', - '{"type": "Polygon", "coordinates": [[179, 89], [179, -89], [-179, -89], [-179, 89], [179, 89]]}', - '{"type": "Polygon", "coordinates": [[0, 89], [0, -89], [2, -89], [2, 89], [0, 89]]}', - '{"type": "Polygon", "coordinates": [[179, -1], [179, 1], [-179, 1], [-179, -1], [179, -1]]}', - '{"type": "Polygon", "coordinates": [[179, 0], [179, 2], [-179, 2], [-179, 0], [179, 0]]}', - '{"type": "Polygon", "coordinates": [[179, 89], [179, -89], [-179, -89], [-179, 89], [179, 89]]}' + '{"type": "Polygon", "coordinates": [[[-1, 89], [-1, -89], [1, -89], [1, 89], [-1, 89]]]}', + '{"type": "Polygon", "coordinates": [[[179, 89], [179, -89], [-179, -89], [-179, 89], [179, 89]]]}', + '{"type": "Polygon", "coordinates": [[[179, -46], [179, -44], [-179, -44], [-179, -46], [179, -46]]]}', + '{"type": "Polygon", "coordinates": [[[179, -46], [179, -44], [-179, -44], [-179, -46], [179, -46]]]}', + '{"type": "Polygon", "coordinates": [[[-1, -1], [-1, 1], [1, 1], [1, -1], [-1, -1]]]}', + '{"type": "Polygon", "coordinates": [[[179, 89], [179, -89], [-179, -89], [-179, 89], [179, 89]]]}', + '{"type": "Polygon", "coordinates": [[[179, 44], [179, 46], [-179, 46], [-179, 44], [179, 44]]]}', + '{"type": "Polygon", "coordinates": 
[[[179, 89], [179, -89], [-179, -89], [-179, 89], [179, 89]]]}', + '{"type": "Polygon", "coordinates": [[[0, 89], [0, -89], [2, -89], [2, 89], [0, 89]]]}', + '{"type": "Polygon", "coordinates": [[[179, -1], [179, 1], [-179, 1], [-179, -1], [179, -1]]]}', + '{"type": "Polygon", "coordinates": [[[179, 0], [179, 2], [-179, 2], [-179, 0], [179, 0]]]}', + '{"type": "Polygon", "coordinates": [[[179, 89], [179, -89], [-179, -89], [-179, 89], [179, 89]]]}' ] lat_grid_resolution = 1 lon_grid_resolution = 1 From efec4cf3f92de0da36ad901245dccac52b85207e Mon Sep 17 00:00:00 2001 From: Piyush-Ingale <122958815+Piyush-Ingale@users.noreply.github.com> Date: Mon, 18 Sep 2023 10:06:16 +0000 Subject: [PATCH 07/16] weather-mv: Added a flag for preprocessing tiff file (#387) * added valid time as coordinate * minor test case changes * updated readme according to new flag * made the new flag optional * fixed lint errors * Added test case * removed test file * Addressed comments * weather-mv version bump --- weather_dl/download_pipeline/util.py | 2 +- weather_mv/README.md | 9 ++-- weather_mv/loader_pipeline/bq.py | 34 +++++++----- weather_mv/loader_pipeline/bq_test.py | 39 +++++++++++--- weather_mv/loader_pipeline/pipeline_test.py | 8 +-- weather_mv/loader_pipeline/sinks.py | 49 ++++++++++++++---- weather_mv/loader_pipeline/sinks_test.py | 5 +- weather_mv/loader_pipeline/streaming.py | 2 +- weather_mv/setup.py | 2 +- .../test_data/test_data_tif_start_time.tif | Bin 24206 -> 0 bytes weather_mv/test_data/test_data_tif_time.tif | Bin 0 -> 28051 bytes 11 files changed, 109 insertions(+), 41 deletions(-) delete mode 100644 weather_mv/test_data/test_data_tif_start_time.tif create mode 100644 weather_mv/test_data/test_data_tif_time.tif diff --git a/weather_dl/download_pipeline/util.py b/weather_dl/download_pipeline/util.py index 1ee9e24e..3e92b8d8 100644 --- a/weather_dl/download_pipeline/util.py +++ b/weather_dl/download_pipeline/util.py @@ -105,7 +105,7 @@ def to_json_serializable_type(value: t.Any) -> t.Any: elif type(value) == np.ndarray: # Will return a scaler if array is of size 1, else will return a list. return value.tolist() - elif type(value) == datetime.datetime or type(value) == str or type(value) == np.datetime64: + elif isinstance(value, datetime.datetime) or isinstance(value, str) or isinstance(value, np.datetime64): # Assume strings are ISO format timestamps... try: value = datetime.datetime.fromisoformat(value) diff --git a/weather_mv/README.md b/weather_mv/README.md index eff4717d..ee32b103 100644 --- a/weather_mv/README.md +++ b/weather_mv/README.md @@ -61,7 +61,8 @@ usage: weather-mv bigquery [-h] -i URIS [--topic TOPIC] [--window_size WINDOW_SI -o OUTPUT_TABLE [-v variables [variables ...]] [-a area [area ...]] [--import_time IMPORT_TIME] [--infer_schema] [--xarray_open_dataset_kwargs XARRAY_OPEN_DATASET_KWARGS] - [--tif_metadata_for_datetime TIF_METADATA_FOR_DATETIME] [-s] + [--tif_metadata_for_start_time TIF_METADATA_FOR_START_TIME] + [--tif_metadata_for_end_time TIF_METADATA_FOR_END_TIME] [-s] [--coordinate_chunk_size COORDINATE_CHUNK_SIZE] ['--skip_creating_polygon'] ``` @@ -80,7 +81,8 @@ _Command options_: * `--xarray_open_dataset_kwargs`: Keyword-args to pass into `xarray.open_dataset()` in the form of a JSON string. * `--coordinate_chunk_size`: The size of the chunk of coordinates used for extracting vector data into BigQuery. Used to tune parallel uploads. -* `--tif_metadata_for_datetime` : Metadata that contains tif file's timestamp. Applicable only for tif files. 
+* `--tif_metadata_for_start_time` : Metadata that contains tif file's start/initialization time. Applicable only for tif files. +* `--tif_metadata_for_end_time` : Metadata that contains tif file's end/forecast time. Applicable only for tif files (optional). * `-s, --skip-region-validation` : Skip validation of regions for data migration. Default: off. * `--disable_grib_schema_normalization` : To disable grib's schema normalization. Default: off. * `--skip_creating_polygon` : Not ingest grid points as polygons in BigQuery. Default: Ingest grid points as Polygon in @@ -139,7 +141,8 @@ weather-mv bq --uris "gs://your-bucket/*.tif" \ --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ --temp_location "gs://$BUCKET/tmp" \ # Needed for batch writes to BigQuery --direct_num_workers 2 \ - --tif_metadata_for_datetime start_time + --tif_metadata_for_start_time start_time \ + --tif_metadata_for_end_time end_time ``` Upload only a subset of variables: diff --git a/weather_mv/loader_pipeline/bq.py b/weather_mv/loader_pipeline/bq.py index f62f81e1..e1b39a22 100644 --- a/weather_mv/loader_pipeline/bq.py +++ b/weather_mv/loader_pipeline/bq.py @@ -77,8 +77,10 @@ class ToBigQuery(ToDataSink): infer_schema: If true, this sink will attempt to read in an example data file read all its variables, and generate a BigQuery schema. xarray_open_dataset_kwargs: A dictionary of kwargs to pass to xr.open_dataset(). - tif_metadata_for_datetime: If the input is a .tif file, parse the tif metadata at - this location for a timestamp. + tif_metadata_for_start_time: If the input is a .tif file, parse the tif metadata at + this location for a start time / initialization time. + tif_metadata_for_end_time: If the input is a .tif file, parse the tif metadata at + this location for a end/forecast time. skip_region_validation: Turn off validation that checks if all Cloud resources are in the same region. disable_grib_schema_normalization: Turn off grib's schema normalization; Default: normalization enabled. @@ -94,7 +96,8 @@ class ToBigQuery(ToDataSink): import_time: t.Optional[datetime.datetime] infer_schema: bool xarray_open_dataset_kwargs: t.Dict - tif_metadata_for_datetime: t.Optional[str] + tif_metadata_for_start_time: t.Optional[str] + tif_metadata_for_end_time: t.Optional[str] skip_region_validation: bool disable_grib_schema_normalization: bool coordinate_chunk_size: int = 10_000 @@ -125,8 +128,11 @@ def add_parser_arguments(cls, subparser: argparse.ArgumentParser): 'off') subparser.add_argument('--xarray_open_dataset_kwargs', type=json.loads, default='{}', help='Keyword-args to pass into `xarray.open_dataset()` in the form of a JSON string.') - subparser.add_argument('--tif_metadata_for_datetime', type=str, default=None, - help='Metadata that contains tif file\'s timestamp. ' + subparser.add_argument('--tif_metadata_for_start_time', type=str, default=None, + help='Metadata that contains tif file\'s start/initialization time. ' + 'Applicable only for tif files.') + subparser.add_argument('--tif_metadata_for_end_time', type=str, default=None, + help='Metadata that contains tif file\'s end/forecast time. ' 'Applicable only for tif files.') subparser.add_argument('-s', '--skip-region-validation', action='store_true', default=False, help='Skip validation of regions for data migration. Default: off') @@ -146,10 +152,12 @@ def validate_arguments(cls, known_args: argparse.Namespace, pipeline_args: t.Lis # Check that all arguments are supplied for COG input. 
_, uri_extension = os.path.splitext(known_args.uris) - if uri_extension == '.tif' and not known_args.tif_metadata_for_datetime: - raise RuntimeError("'--tif_metadata_for_datetime' is required for tif files.") - elif uri_extension != '.tif' and known_args.tif_metadata_for_datetime: - raise RuntimeError("'--tif_metadata_for_datetime' can be specified only for tif files.") + if (uri_extension in ['.tif', '.tiff'] and not known_args.tif_metadata_for_start_time): + raise RuntimeError("'--tif_metadata_for_start_time' is required for tif files.") + elif (uri_extension not in ['.tif', '.tiff'] and (known_args.tif_metadata_for_start_time + or known_args.tif_metadata_for_end_time)): + raise RuntimeError("'--tif_metadata_for_start_time' and " + "'--tif_metadata_for_end_time' can be specified only for tif files.") # Check that Cloud resource regions are consistent. if not (known_args.dry_run or known_args.skip_region_validation): @@ -164,8 +172,8 @@ def __post_init__(self): if self.zarr: self.xarray_open_dataset_kwargs = self.zarr_kwargs with open_dataset(self.first_uri, self.xarray_open_dataset_kwargs, - self.disable_grib_schema_normalization, self.tif_metadata_for_datetime, - is_zarr=self.zarr) as open_ds: + self.disable_grib_schema_normalization, self.tif_metadata_for_start_time, + self.tif_metadata_for_end_time, is_zarr=self.zarr) as open_ds: if not self.skip_creating_polygon: logger.warning("Assumes that equal distance between consecutive points of latitude " @@ -217,7 +225,7 @@ def prepare_coordinates(self, uri: str) -> t.Iterator[t.Tuple[str, t.List[t.Dict logger.info(f'Preparing coordinates for: {uri!r}.') with open_dataset(uri, self.xarray_open_dataset_kwargs, self.disable_grib_schema_normalization, - self.tif_metadata_for_datetime, is_zarr=self.zarr) as ds: + self.tif_metadata_for_start_time, self.tif_metadata_for_end_time, is_zarr=self.zarr) as ds: data_ds: xr.Dataset = _only_target_vars(ds, self.variables) if self.area: n, w, s, e = self.area @@ -236,7 +244,7 @@ def extract_rows(self, uri: str, coordinates: t.List[t.Dict]) -> t.Iterator[t.Di self.import_time = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc) with open_dataset(uri, self.xarray_open_dataset_kwargs, self.disable_grib_schema_normalization, - self.tif_metadata_for_datetime, is_zarr=self.zarr) as ds: + self.tif_metadata_for_start_time, self.tif_metadata_for_end_time, is_zarr=self.zarr) as ds: data_ds: xr.Dataset = _only_target_vars(ds, self.variables) yield from self.to_rows(coordinates, data_ds, uri) diff --git a/weather_mv/loader_pipeline/bq_test.py b/weather_mv/loader_pipeline/bq_test.py index 174c6cd4..0faa567c 100644 --- a/weather_mv/loader_pipeline/bq_test.py +++ b/weather_mv/loader_pipeline/bq_test.py @@ -204,7 +204,7 @@ class ExtractRowsTestBase(TestDataBase): def extract(self, data_path, *, variables=None, area=None, open_dataset_kwargs=None, import_time=DEFAULT_IMPORT_TIME, disable_grib_schema_normalization=False, - tif_metadata_for_datetime=None, zarr: bool = False, zarr_kwargs=None, + tif_metadata_for_start_time=None, tif_metadata_for_end_time=None, zarr: bool = False, zarr_kwargs=None, skip_creating_polygon: bool = False) -> t.Iterator[t.Dict]: if zarr_kwargs is None: zarr_kwargs = {} @@ -212,7 +212,8 @@ def extract(self, data_path, *, variables=None, area=None, open_dataset_kwargs=N first_uri=data_path, dry_run=True, zarr=zarr, zarr_kwargs=zarr_kwargs, output_table='foo.bar.baz', variables=variables, area=area, xarray_open_dataset_kwargs=open_dataset_kwargs, import_time=import_time, 
infer_schema=False, - tif_metadata_for_datetime=tif_metadata_for_datetime, skip_region_validation=True, + tif_metadata_for_start_time=tif_metadata_for_start_time, + tif_metadata_for_end_time=tif_metadata_for_end_time, skip_region_validation=True, disable_grib_schema_normalization=disable_grib_schema_normalization, coordinate_chunk_size=1000, skip_creating_polygon=skip_creating_polygon) coords = op.prepare_coordinates(data_path) @@ -472,10 +473,35 @@ class ExtractRowsTifSupportTest(ExtractRowsTestBase): def setUp(self) -> None: super().setUp() - self.test_data_path = f'{self.test_data_folder}/test_data_tif_start_time.tif' + self.test_data_path = f'{self.test_data_folder}/test_data_tif_time.tif' - def test_extract_rows(self): - actual = next(self.extract(self.test_data_path, tif_metadata_for_datetime='start_time')) + def test_extract_rows_with_end_time(self): + actual = next( + self.extract(self.test_data_path, tif_metadata_for_start_time='start_time', + tif_metadata_for_end_time='end_time') + ) + expected = { + 'dewpoint_temperature_2m': 281.09349060058594, + 'temperature_2m': 296.8329772949219, + 'data_import_time': '1970-01-01T00:00:00+00:00', + 'data_first_step': '2020-07-01T00:00:00+00:00', + 'data_uri': self.test_data_path, + 'latitude': 42.09783344918844, + 'longitude': -123.66686981141397, + 'time': '2020-07-01T00:00:00+00:00', + 'valid_time': '2020-07-01T00:00:00+00:00', + 'geo_point': geojson.dumps(geojson.Point((-123.66687, 42.097833))), + 'geo_polygon': geojson.dumps(geojson.Polygon([ + (-123.669853, 42.095605), (-123.669853, 42.100066), + (-123.663885, 42.100066), (-123.663885, 42.095605), + (-123.669853, 42.095605)])) + } + self.assertRowsEqual(actual, expected) + + def test_extract_rows_without_end_time(self): + actual = next( + self.extract(self.test_data_path, tif_metadata_for_start_time='start_time') + ) expected = { 'dewpoint_temperature_2m': 281.09349060058594, 'temperature_2m': 296.8329772949219, @@ -763,7 +789,8 @@ def test_extracts_rows(self): op = ToBigQuery.from_kwargs( first_uri=input_zarr, zarr_kwargs=dict(), dry_run=True, zarr=True, output_table='foo.bar.baz', variables=list(), area=list(), xarray_open_dataset_kwargs=dict(), import_time=None, infer_schema=False, - tif_metadata_for_datetime=None, skip_region_validation=True, disable_grib_schema_normalization=False, + tif_metadata_for_start_time=None, tif_metadata_for_end_time=None, skip_region_validation=True, + disable_grib_schema_normalization=False, ) with TestPipeline() as p: diff --git a/weather_mv/loader_pipeline/pipeline_test.py b/weather_mv/loader_pipeline/pipeline_test.py index 4d546192..3834b537 100644 --- a/weather_mv/loader_pipeline/pipeline_test.py +++ b/weather_mv/loader_pipeline/pipeline_test.py @@ -30,7 +30,7 @@ def setUp(self) -> None: ).split() self.tif_base_cli_args = ( 'weather-mv bq ' - f'-i {self.test_data_folder}/test_data_tif_start_time.tif ' + f'-i {self.test_data_folder}/test_data_tif_time.tif ' '-o myproject.mydataset.mytable ' '--import_time 2022-02-04T22:22:12.125893 ' '-s' @@ -62,7 +62,8 @@ def setUp(self) -> None: 'xarray_open_dataset_kwargs': {}, 'coordinate_chunk_size': 10_000, 'disable_grib_schema_normalization': False, - 'tif_metadata_for_datetime': None, + 'tif_metadata_for_start_time': None, + 'tif_metadata_for_end_time': None, 'zarr': False, 'zarr_kwargs': {}, 'log_level': 2, @@ -83,7 +84,8 @@ def test_log_level_arg(self): def test_tif_metadata_for_datetime_raise_error_for_non_tif_file(self): with self.assertRaisesRegex(RuntimeError, 'can be specified only for tif files.'): - 
run(self.base_cli_args + '--tif_metadata_for_datetime start_time'.split()) + run(self.base_cli_args + '--tif_metadata_for_start_time start_time ' + '--tif_metadata_for_end_time end_time'.split()) def test_tif_metadata_for_datetime_raise_error_if_flag_is_absent(self): with self.assertRaisesRegex(RuntimeError, 'is required for tif files.'): diff --git a/weather_mv/loader_pipeline/sinks.py b/weather_mv/loader_pipeline/sinks.py index d123ce23..332e1aac 100644 --- a/weather_mv/loader_pipeline/sinks.py +++ b/weather_mv/loader_pipeline/sinks.py @@ -138,8 +138,9 @@ def rearrange_time_list(order_list: t.List, time_list: t.List) -> t.List: return datetime.datetime(*time_list) -def _preprocess_tif(ds: xr.Dataset, filename: str, tif_metadata_for_datetime: str, uri: str, - band_names_dict: t.Dict, initialization_time_regex: str, forecast_time_regex: str) -> xr.Dataset: +def _preprocess_tif(ds: xr.Dataset, filename: str, tif_metadata_for_start_time: str, + tif_metadata_for_end_time: str, uri: str, band_names_dict: t.Dict, + initialization_time_regex: str, forecast_time_regex: str) -> xr.Dataset: """Transforms (y, x) coordinates into (lat, long) and adds bands data in data variables. This also retrieves datetime from tif's metadata and stores it into dataset. @@ -162,6 +163,7 @@ def _replace_dataarray_names_with_long_names(ds: xr.Dataset): ds = _replace_dataarray_names_with_long_names(ds) end_time = None + start_time = None if initialization_time_regex and forecast_time_regex: try: start_time = match_datetime(uri, initialization_time_regex) @@ -174,15 +176,38 @@ def _replace_dataarray_names_with_long_names(ds: xr.Dataset): ds.attrs['start_time'] = start_time ds.attrs['end_time'] = end_time - datetime_value_ms = None + init_time = None + forecast_time = None + coords = {} try: - datetime_value_s = (int(end_time.timestamp()) if end_time is not None - else int(ds.attrs[tif_metadata_for_datetime]) / 1000.0) - ds = ds.assign_coords({'time': datetime.datetime.utcfromtimestamp(datetime_value_s)}) - except KeyError: - raise RuntimeError(f"Invalid datetime metadata of tif: {tif_metadata_for_datetime}.") + # if start_time/end_time is in integer milliseconds + init_time = (int(start_time.timestamp()) if start_time is not None + else int(ds.attrs[tif_metadata_for_start_time]) / 1000.0) + coords['time'] = datetime.datetime.utcfromtimestamp(init_time) + + if tif_metadata_for_end_time: + forecast_time = (int(end_time.timestamp()) if end_time is not None + else int(ds.attrs[tif_metadata_for_end_time]) / 1000.0) + coords['valid_time'] = datetime.datetime.utcfromtimestamp(forecast_time) + + ds = ds.assign_coords(coords) + except KeyError as e: + raise RuntimeError(f"Invalid datetime metadata of tif: {e}.") except ValueError: - raise RuntimeError(f"Invalid datetime value in tif's metadata: {datetime_value_ms}.") + try: + # if start_time/end_time is in UTC string format + init_time = (int(start_time.timestamp()) if start_time is not None + else datetime.datetime.strptime(ds.attrs[tif_metadata_for_start_time], '%Y-%m-%dT%H:%M:%SZ')) + coords['time'] = init_time + + if tif_metadata_for_end_time: + forecast_time = (int(end_time.timestamp()) if end_time is not None + else datetime.datetime.strptime(ds.attrs[tif_metadata_for_end_time], '%Y-%m-%dT%H:%M:%SZ')) + coords['valid_time'] = forecast_time + + ds = ds.assign_coords(coords) + except ValueError as e: + raise RuntimeError(f"Invalid datetime value in tif's metadata: {e}.") return ds @@ -372,7 +397,8 @@ def open_local(uri: str) -> t.Iterator[str]: def open_dataset(uri: str, 
open_dataset_kwargs: t.Optional[t.Dict] = None, disable_grib_schema_normalization: bool = False, - tif_metadata_for_datetime: t.Optional[str] = None, + tif_metadata_for_start_time: t.Optional[str] = None, + tif_metadata_for_end_time: t.Optional[str] = None, band_names_dict: t.Optional[t.Dict] = None, initialization_time_regex: t.Optional[str] = None, forecast_time_regex: t.Optional[str] = None, @@ -394,7 +420,8 @@ def open_dataset(uri: str, if uri_extension in ['.tif', '.tiff']: xr_dataset = _preprocess_tif(xr_dataset, local_path, - tif_metadata_for_datetime, + tif_metadata_for_start_time, + tif_metadata_for_end_time, uri, band_names_dict, initialization_time_regex, diff --git a/weather_mv/loader_pipeline/sinks_test.py b/weather_mv/loader_pipeline/sinks_test.py index a759c586..fde060ad 100644 --- a/weather_mv/loader_pipeline/sinks_test.py +++ b/weather_mv/loader_pipeline/sinks_test.py @@ -84,7 +84,7 @@ def setUp(self) -> None: super().setUp() self.test_data_path = os.path.join(self.test_data_folder, 'test_data_20180101.nc') self.test_grib_path = os.path.join(self.test_data_folder, 'test_data_grib_single_timestep') - self.test_tif_path = os.path.join(self.test_data_folder, 'test_data_tif_start_time.tif') + self.test_tif_path = os.path.join(self.test_data_folder, 'test_data_tif_time.tif') self.test_zarr_path = os.path.join(self.test_data_folder, 'test_data.zarr') def test_opens_grib_files(self): @@ -104,7 +104,8 @@ def test_accepts_xarray_kwargs(self): self.assertDictContainsSubset({'is_normalized': False}, ds2.attrs) def test_opens_tif_files(self): - with open_dataset(self.test_tif_path, tif_metadata_for_datetime='start_time') as ds: + with open_dataset(self.test_tif_path, tif_metadata_for_start_time='start_time', + tif_metadata_for_end_time='end_time') as ds: self.assertIsNotNone(ds) self.assertDictContainsSubset({'is_normalized': False}, ds.attrs) diff --git a/weather_mv/loader_pipeline/streaming.py b/weather_mv/loader_pipeline/streaming.py index 7210b2e7..3a7a8f49 100644 --- a/weather_mv/loader_pipeline/streaming.py +++ b/weather_mv/loader_pipeline/streaming.py @@ -84,7 +84,7 @@ def try_parse_message(cls, message_body: t.Union[str, t.Dict]) -> t.Dict: try: return json.loads(message_body) except (json.JSONDecodeError, TypeError): - if type(message_body) is dict: + if isinstance(message_body, dict): return message_body raise diff --git a/weather_mv/setup.py b/weather_mv/setup.py index f200a822..b46121a5 100644 --- a/weather_mv/setup.py +++ b/weather_mv/setup.py @@ -65,7 +65,7 @@ packages=find_packages(), author='Anthromets', author_email='anthromets-ecmwf@google.com', - version='0.2.17', + version='0.2.18', url='https://weather-tools.readthedocs.io/en/latest/weather_mv/', description='A tool to load weather data into BigQuery.', install_requires=beam_gcp_requirements + base_requirements, diff --git a/weather_mv/test_data/test_data_tif_start_time.tif b/weather_mv/test_data/test_data_tif_start_time.tif deleted file mode 100644 index 82f32dd7bc06870da54ae382979c2d1e6488d693..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24206 zcma&NWmFtdvo;DLgy0Ur-QC^YWoB>*G7#L|5;O#NhY&)5;O_1&gTpYm1b2tad%knl zI`^*c$L+nk>*?y+)!x0!l$4m^77gLx@ZjL!5#ZqA;NUFZX88Yw-@V~?Z#>DjhVb8f z#JBYi|Be6C2minEf8etDb!Gl?q5N~x-W-OVh`+|9&MC@Dpi+`$f%j%E&EF=|;kfQ+`1CQwO3 zoto0k%)#0Ijmb$(>FV?!U(a@Bd5HrDO$pC^;x)%;`dd$#)2ayv(b1%U6Cl_dwwT=Y5T>;5~c7P=bm-I z7&l$P>Vj!3*IF=hqUdr2?EygZYxIljU)7N~u|mK>ZHr}14sR)l2|v@=(ZRS_6uy5% 
[... base85-encoded GIT binary patch data omitted for readability: the binary payloads for the deleted weather_mv/test_data/test_data_tif_start_time.tif and the newly added weather_mv/test_data/test_data_tif_time.tif carry no human-readable content ...]
zz^ZPPu>Tu^0goZ(bT>=LVLg-=(6jVldg0ac@0oWzT+(#x?&nD_~5yxTUTyoHQmt97b198{Il%q<7ZHhSSLVoKZNJ5$e2A8n;o&UqLZ qM)-_=@Eh%1Frr^H<`po$DrkJ`lChqUv5l~?w}|2VqeuVe`F{YKZyUe> literal 0 HcmV?d00001 From 2c5ea579e45fb18e4051215a00689d4755936782 Mon Sep 17 00:00:00 2001 From: Alex Merose Date: Mon, 18 Sep 2023 06:07:22 -0400 Subject: [PATCH 08/16] Removing test data after clone in Docker image. (#351) I believe that our docker images are larger than they should be since they also include checked-in test data. This change adds a short-term fix to delete all `test_data` folders in each weather_tool before building the rest of the image. In small experiments, this can save ~190 MiBs of disk space. --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 057442f5..30a72f57 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,6 +29,7 @@ ARG weather_tools_git_rev=main RUN git clone https://github.com/google/weather-tools.git /weather WORKDIR /weather RUN git checkout "${weather_tools_git_rev}" +RUN rm -r /weather/weather_*/test_data/ RUN conda env create -f environment.yml --debug # Activate the conda env and update the PATH From 85799070805ac5149d9c576f4ca13c2381d3fc89 Mon Sep 17 00:00:00 2001 From: dabhi_cusp <123355381+dabhicusp@users.noreply.github.com> Date: Sat, 23 Sep 2023 11:01:29 +0530 Subject: [PATCH 09/16] Added feature in weather-mv to extract specific date's data from zarr files. (#396) * Function added: Access specific data from zarr. * Nit changes done. * Nit changes done. * Pandas removed in time-range. * example added in the readme.md file. * Zarr_kwargs added. * testcase updated. * nit changes. --- weather_mv/README.md | 23 +++++++++++++++++++++++ weather_mv/loader_pipeline/bq.py | 4 +++- weather_mv/loader_pipeline/bq_test.py | 5 +++-- weather_mv/loader_pipeline/pipeline.py | 4 ++++ weather_mv/loader_pipeline/sinks.py | 15 +++++++++++---- 5 files changed, 44 insertions(+), 7 deletions(-) diff --git a/weather_mv/README.md b/weather_mv/README.md index ee32b103..7374cd0b 100644 --- a/weather_mv/README.md +++ b/weather_mv/README.md @@ -165,6 +165,29 @@ weather-mv bq --uris "gs://your-bucket/*.nc" \ --direct_num_workers 2 ``` +Upload a zarr file: + +```bash +weather-mv bq --uris "gs://your-bucket/*.zarr" \ + --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ + --temp_location "gs://$BUCKET/tmp" \ + --use-local-code \ + --zarr \ + --direct_num_workers 2 +``` + +Upload a specific date range's data from the zarr file: + +```bash +weather-mv bq --uris "gs://your-bucket/*.zarr" \ + --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ + --temp_location "gs://$BUCKET/tmp" \ + --use-local-code \ + --zarr \ + --zarr_kwargs '{"start_date": "2021-07-18", "end_date": "2021-07-19"}' \ + --direct_num_workers 2 +``` + Control how weather data is opened with XArray: ```bash diff --git a/weather_mv/loader_pipeline/bq.py b/weather_mv/loader_pipeline/bq.py index e1b39a22..a1cac9f9 100644 --- a/weather_mv/loader_pipeline/bq.py +++ b/weather_mv/loader_pipeline/bq.py @@ -308,7 +308,9 @@ def expand(self, paths): | 'ExtractRows' >> beam.FlatMapTuple(self.extract_rows) ) else: - ds, chunks = xbeam.open_zarr(self.first_uri, **self.xarray_open_dataset_kwargs) + xarray_open_dataset_kwargs = self.xarray_open_dataset_kwargs.copy() + xarray_open_dataset_kwargs.pop('chunks') + ds, chunks = xbeam.open_zarr(self.first_uri, **xarray_open_dataset_kwargs) ds.attrs[DATA_URI_COLUMN] = self.first_uri extracted_rows = ( paths diff --git a/weather_mv/loader_pipeline/bq_test.py b/weather_mv/loader_pipeline/bq_test.py 
index 0faa567c..fae7ab31 100644 --- a/weather_mv/loader_pipeline/bq_test.py +++ b/weather_mv/loader_pipeline/bq_test.py @@ -478,7 +478,7 @@ def setUp(self) -> None: def test_extract_rows_with_end_time(self): actual = next( self.extract(self.test_data_path, tif_metadata_for_start_time='start_time', - tif_metadata_for_end_time='end_time') + tif_metadata_for_end_time='end_time') ) expected = { 'dewpoint_temperature_2m': 281.09349060058594, @@ -787,7 +787,8 @@ def test_extracts_rows(self): ds.to_zarr(input_zarr) op = ToBigQuery.from_kwargs( - first_uri=input_zarr, zarr_kwargs=dict(), dry_run=True, zarr=True, output_table='foo.bar.baz', + first_uri=input_zarr, zarr_kwargs=dict(chunks=None, consolidated=True), dry_run=True, zarr=True, + output_table='foo.bar.baz', variables=list(), area=list(), xarray_open_dataset_kwargs=dict(), import_time=None, infer_schema=False, tif_metadata_for_start_time=None, tif_metadata_for_end_time=None, skip_region_validation=True, disable_grib_schema_normalization=False, diff --git a/weather_mv/loader_pipeline/pipeline.py b/weather_mv/loader_pipeline/pipeline.py index f6d40c41..842d28c1 100644 --- a/weather_mv/loader_pipeline/pipeline.py +++ b/weather_mv/loader_pipeline/pipeline.py @@ -145,6 +145,10 @@ def run(argv: t.List[str]) -> t.Tuple[argparse.Namespace, t.List[str]]: if known_args.zarr_kwargs and not known_args.zarr: raise ValueError('`--zarr_kwargs` argument is only allowed with valid Zarr input URI.') + if known_args.zarr: + known_args.zarr_kwargs['chunks'] = known_args.zarr_kwargs.get('chunks', None) + known_args.zarr_kwargs['consolidated'] = known_args.zarr_kwargs.get('consolidated', True) + # Validate subcommand if known_args.subcommand == 'bigquery' or known_args.subcommand == 'bq': ToBigQuery.validate_arguments(known_args, pipeline_args) diff --git a/weather_mv/loader_pipeline/sinks.py b/weather_mv/loader_pipeline/sinks.py index 332e1aac..cd882b48 100644 --- a/weather_mv/loader_pipeline/sinks.py +++ b/weather_mv/loader_pipeline/sinks.py @@ -182,12 +182,12 @@ def _replace_dataarray_names_with_long_names(ds: xr.Dataset): try: # if start_time/end_time is in integer milliseconds init_time = (int(start_time.timestamp()) if start_time is not None - else int(ds.attrs[tif_metadata_for_start_time]) / 1000.0) + else int(ds.attrs[tif_metadata_for_start_time]) / 1000.0) coords['time'] = datetime.datetime.utcfromtimestamp(init_time) if tif_metadata_for_end_time: forecast_time = (int(end_time.timestamp()) if end_time is not None - else int(ds.attrs[tif_metadata_for_end_time]) / 1000.0) + else int(ds.attrs[tif_metadata_for_end_time]) / 1000.0) coords['valid_time'] = datetime.datetime.utcfromtimestamp(forecast_time) ds = ds.assign_coords(coords) @@ -197,12 +197,14 @@ def _replace_dataarray_names_with_long_names(ds: xr.Dataset): try: # if start_time/end_time is in UTC string format init_time = (int(start_time.timestamp()) if start_time is not None - else datetime.datetime.strptime(ds.attrs[tif_metadata_for_start_time], '%Y-%m-%dT%H:%M:%SZ')) + else datetime.datetime.strptime(ds.attrs[tif_metadata_for_start_time], + '%Y-%m-%dT%H:%M:%SZ')) coords['time'] = init_time if tif_metadata_for_end_time: forecast_time = (int(end_time.timestamp()) if end_time is not None - else datetime.datetime.strptime(ds.attrs[tif_metadata_for_end_time], '%Y-%m-%dT%H:%M:%SZ')) + else datetime.datetime.strptime(ds.attrs[tif_metadata_for_end_time], + '%Y-%m-%dT%H:%M:%SZ')) coords['valid_time'] = forecast_time ds = ds.assign_coords(coords) @@ -406,7 +408,12 @@ def open_dataset(uri: str, """Open the 
dataset at 'uri' and return a xarray.Dataset.""" try: if is_zarr: + if open_dataset_kwargs is not None: + start_date = open_dataset_kwargs.pop('start_date', None) + end_date = open_dataset_kwargs.pop('end_date', None) ds: xr.Dataset = _add_is_normalized_attr(xr.open_dataset(uri, engine='zarr', **open_dataset_kwargs), False) + if start_date is not None and end_date is not None: + ds = ds.sel(time=slice(start_date, end_date)) beam.metrics.Metrics.counter('Success', 'ReadNetcdfData').inc() yield ds ds.close() From dcfc11afa932378120fc1a3b10a3007a0cdfd177 Mon Sep 17 00:00:00 2001 From: DeepGabani <60647051+deepgabani8@users.noreply.github.com> Date: Tue, 3 Oct 2023 12:25:18 +0530 Subject: [PATCH 10/16] Not re-initializing ee if it's already initialized in the same worker. (#399) * Not initializing ee for every worker if it has already been initialized before. * Bumped version number --- weather_mv/loader_pipeline/ee.py | 7 ++++++- weather_mv/setup.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/weather_mv/loader_pipeline/ee.py b/weather_mv/loader_pipeline/ee.py index 094b3a17..e117c4de 100644 --- a/weather_mv/loader_pipeline/ee.py +++ b/weather_mv/loader_pipeline/ee.py @@ -157,7 +157,12 @@ def setup(self): def check_setup(self): """Ensures that setup has been called.""" if not self._has_setup: - self.setup() + try: + # This throws an exception if ee is not initialized. + ee.data.getAlgorithms() + self._has_setup = True + except ee.EEException: + self.setup() def process(self, *args, **kwargs): """Checks that setup has been called then call the process implementation.""" diff --git a/weather_mv/setup.py b/weather_mv/setup.py index b46121a5..4bdb4a0b 100644 --- a/weather_mv/setup.py +++ b/weather_mv/setup.py @@ -65,7 +65,7 @@ packages=find_packages(), author='Anthromets', author_email='anthromets-ecmwf@google.com', - version='0.2.18', + version='0.2.19', url='https://weather-tools.readthedocs.io/en/latest/weather_mv/', description='A tool to load weather data into BigQuery.', install_requires=beam_gcp_requirements + base_requirements, From 765a40c7906feed5435bf2cb6799e49bb79fe9e6 Mon Sep 17 00:00:00 2001 From: dabhi_cusp <123355381+dabhicusp@users.noreply.github.com> Date: Thu, 5 Oct 2023 11:51:19 +0530 Subject: [PATCH 11/16] Fix : for a added feature in weather-mv to extract specific date's data from zarr files. (#400) * Start_date & end_date of zarr_kwargs are added. * Pytype checked. * Nit changes done. * Warning message added. 
* Warning.warn added in pipeline.py --- weather_mv/loader_pipeline/bq.py | 14 ++++++++++++-- weather_mv/loader_pipeline/pipeline.py | 6 ++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/weather_mv/loader_pipeline/bq.py b/weather_mv/loader_pipeline/bq.py index a1cac9f9..68850d6a 100644 --- a/weather_mv/loader_pipeline/bq.py +++ b/weather_mv/loader_pipeline/bq.py @@ -104,6 +104,8 @@ class ToBigQuery(ToDataSink): skip_creating_polygon: bool = False lat_grid_resolution: t.Optional[float] = None lon_grid_resolution: t.Optional[float] = None + start_date: t.Optional[str] = None + end_date: t.Optional[str] = None @classmethod def add_parser_arguments(cls, subparser: argparse.ArgumentParser): @@ -154,8 +156,10 @@ def validate_arguments(cls, known_args: argparse.Namespace, pipeline_args: t.Lis _, uri_extension = os.path.splitext(known_args.uris) if (uri_extension in ['.tif', '.tiff'] and not known_args.tif_metadata_for_start_time): raise RuntimeError("'--tif_metadata_for_start_time' is required for tif files.") - elif (uri_extension not in ['.tif', '.tiff'] and (known_args.tif_metadata_for_start_time - or known_args.tif_metadata_for_end_time)): + elif uri_extension not in ['.tif', '.tiff'] and ( + known_args.tif_metadata_for_start_time + or known_args.tif_metadata_for_end_time + ): raise RuntimeError("'--tif_metadata_for_start_time' and " "'--tif_metadata_for_end_time' can be specified only for tif files.") @@ -171,6 +175,8 @@ def __post_init__(self): """Initializes Sink by creating a BigQuery table based on user input.""" if self.zarr: self.xarray_open_dataset_kwargs = self.zarr_kwargs + self.start_date = self.zarr_kwargs.get('start_date') + self.end_date = self.zarr_kwargs.get('end_date') with open_dataset(self.first_uri, self.xarray_open_dataset_kwargs, self.disable_grib_schema_normalization, self.tif_metadata_for_start_time, self.tif_metadata_for_end_time, is_zarr=self.zarr) as open_ds: @@ -311,6 +317,10 @@ def expand(self, paths): xarray_open_dataset_kwargs = self.xarray_open_dataset_kwargs.copy() xarray_open_dataset_kwargs.pop('chunks') ds, chunks = xbeam.open_zarr(self.first_uri, **xarray_open_dataset_kwargs) + + if self.start_date is not None and self.end_date is not None: + ds = ds.sel(time=slice(self.start_date, self.end_date)) + ds.attrs[DATA_URI_COLUMN] = self.first_uri extracted_rows = ( paths diff --git a/weather_mv/loader_pipeline/pipeline.py b/weather_mv/loader_pipeline/pipeline.py index 842d28c1..ef685473 100644 --- a/weather_mv/loader_pipeline/pipeline.py +++ b/weather_mv/loader_pipeline/pipeline.py @@ -17,6 +17,7 @@ import json import logging import typing as t +import warnings import apache_beam as beam from apache_beam.io.filesystems import FileSystems @@ -145,6 +146,11 @@ def run(argv: t.List[str]) -> t.Tuple[argparse.Namespace, t.List[str]]: if known_args.zarr_kwargs and not known_args.zarr: raise ValueError('`--zarr_kwargs` argument is only allowed with valid Zarr input URI.') + if known_args.zarr_kwargs: + if not known_args.zarr_kwargs.get('start_date') or not known_args.zarr_kwargs.get('end_date'): + warnings.warn('`--zarr_kwargs` not contains both `start_date` and `end_date`' + 'so whole zarr-dataset will ingested.') + if known_args.zarr: known_args.zarr_kwargs['chunks'] = known_args.zarr_kwargs.get('chunks', None) known_args.zarr_kwargs['consolidated'] = known_args.zarr_kwargs.get('consolidated', True) From 3bd6183c6c3e247159d8b014d461207a6f4302ea Mon Sep 17 00:00:00 2001 From: dabhi_cusp <123355381+dabhicusp@users.noreply.github.com> Date: Tue, 10 
Oct 2023 12:38:14 +0530 Subject: [PATCH 12/16] Add: extract specific date's data from any files. (#402) --- weather_mv/README.md | 10 ++++++++++ weather_mv/loader_pipeline/bq.py | 10 ++++------ weather_mv/loader_pipeline/sinks.py | 16 +++++++++++----- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/weather_mv/README.md b/weather_mv/README.md index 7374cd0b..4e77346b 100644 --- a/weather_mv/README.md +++ b/weather_mv/README.md @@ -188,6 +188,16 @@ weather-mv bq --uris "gs://your-bucket/*.zarr" \ --direct_num_workers 2 ``` +Upload a specific date range's data from the file: + +```bash +weather-mv bq --uris "gs://your-bucket/*.nc" \ + --output_table $PROJECT.$DATASET_ID.$TABLE_ID \ + --temp_location "gs://$BUCKET/tmp" \ + --use-local-code \ + --xarray_open_dataset_kwargs '{"start_date": "2021-07-18", "end_date": "2021-07-19"}' \ +``` + Control how weather data is opened with XArray: ```bash diff --git a/weather_mv/loader_pipeline/bq.py b/weather_mv/loader_pipeline/bq.py index 68850d6a..14221218 100644 --- a/weather_mv/loader_pipeline/bq.py +++ b/weather_mv/loader_pipeline/bq.py @@ -104,8 +104,6 @@ class ToBigQuery(ToDataSink): skip_creating_polygon: bool = False lat_grid_resolution: t.Optional[float] = None lon_grid_resolution: t.Optional[float] = None - start_date: t.Optional[str] = None - end_date: t.Optional[str] = None @classmethod def add_parser_arguments(cls, subparser: argparse.ArgumentParser): @@ -175,8 +173,6 @@ def __post_init__(self): """Initializes Sink by creating a BigQuery table based on user input.""" if self.zarr: self.xarray_open_dataset_kwargs = self.zarr_kwargs - self.start_date = self.zarr_kwargs.get('start_date') - self.end_date = self.zarr_kwargs.get('end_date') with open_dataset(self.first_uri, self.xarray_open_dataset_kwargs, self.disable_grib_schema_normalization, self.tif_metadata_for_start_time, self.tif_metadata_for_end_time, is_zarr=self.zarr) as open_ds: @@ -316,10 +312,12 @@ def expand(self, paths): else: xarray_open_dataset_kwargs = self.xarray_open_dataset_kwargs.copy() xarray_open_dataset_kwargs.pop('chunks') + start_date = xarray_open_dataset_kwargs.pop('start_date', None) + end_date = xarray_open_dataset_kwargs.pop('end_date', None) ds, chunks = xbeam.open_zarr(self.first_uri, **xarray_open_dataset_kwargs) - if self.start_date is not None and self.end_date is not None: - ds = ds.sel(time=slice(self.start_date, self.end_date)) + if start_date is not None and end_date is not None: + ds = ds.sel(time=slice(start_date, end_date)) ds.attrs[DATA_URI_COLUMN] = self.first_uri extracted_rows = ( diff --git a/weather_mv/loader_pipeline/sinks.py b/weather_mv/loader_pipeline/sinks.py index cd882b48..bba1bea5 100644 --- a/weather_mv/loader_pipeline/sinks.py +++ b/weather_mv/loader_pipeline/sinks.py @@ -407,11 +407,15 @@ def open_dataset(uri: str, is_zarr: bool = False) -> t.Iterator[xr.Dataset]: """Open the dataset at 'uri' and return a xarray.Dataset.""" try: + local_open_dataset_kwargs = start_date = end_date = None + if open_dataset_kwargs is not None: + local_open_dataset_kwargs = open_dataset_kwargs.copy() + start_date = local_open_dataset_kwargs.pop('start_date', None) + end_date = local_open_dataset_kwargs.pop('end_date', None) + if is_zarr: - if open_dataset_kwargs is not None: - start_date = open_dataset_kwargs.pop('start_date', None) - end_date = open_dataset_kwargs.pop('end_date', None) - ds: xr.Dataset = _add_is_normalized_attr(xr.open_dataset(uri, engine='zarr', **open_dataset_kwargs), False) + ds: xr.Dataset = 
_add_is_normalized_attr(xr.open_dataset(uri, engine='zarr', + **local_open_dataset_kwargs), False) if start_date is not None and end_date is not None: ds = ds.sel(time=slice(start_date, end_date)) beam.metrics.Metrics.counter('Success', 'ReadNetcdfData').inc() @@ -423,7 +427,9 @@ def open_dataset(uri: str, xr_dataset: xr.Dataset = __open_dataset_file(local_path, uri_extension, disable_grib_schema_normalization, - open_dataset_kwargs) + local_open_dataset_kwargs) + if start_date is not None and end_date is not None: + xr_dataset = xr_dataset.sel(time=slice(start_date, end_date)) if uri_extension in ['.tif', '.tiff']: xr_dataset = _preprocess_tif(xr_dataset, local_path, From b423d8a18f742a99c85e71da5bb8176d10007b59 Mon Sep 17 00:00:00 2001 From: dabhi_cusp <123355381+dabhicusp@users.noreply.github.com> Date: Thu, 26 Oct 2023 19:54:26 +0530 Subject: [PATCH 13/16] Ruff version fixed to 0.1.2. (#410) * Ruff version unfixed. * ruff version updated. --- .github/workflows/ci.yml | 2 +- ci3.8.yml | 2 +- ci3.9.yml | 2 +- setup.py | 2 +- weather_dl/download_pipeline/util.py | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ac86adc1..bf28a5e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -84,7 +84,7 @@ jobs: echo "::set-output name=dir::$(pip cache dir)" - name: Install linter run: | - pip install ruff==0.0.280 + pip install ruff==0.1.2 - name: Lint project run: ruff check . type-check: diff --git a/ci3.8.yml b/ci3.8.yml index 803e59c4..211d36be 100644 --- a/ci3.8.yml +++ b/ci3.8.yml @@ -30,7 +30,7 @@ dependencies: - pip=22.3 - pygrib=2.1.4 - xarray==2023.1.0 - - ruff==0.0.260 + - ruff==0.1.2 - google-cloud-sdk=410.0.0 - aria2=1.36.0 - zarr=2.15.0 diff --git a/ci3.9.yml b/ci3.9.yml index e9e0671f..86f0968d 100644 --- a/ci3.9.yml +++ b/ci3.9.yml @@ -32,7 +32,7 @@ dependencies: - google-cloud-sdk=410.0.0 - aria2=1.36.0 - xarray==2023.1.0 - - ruff==0.0.260 + - ruff==0.1.2 - zarr=2.15.0 - pip: - cython==0.29.34 diff --git a/setup.py b/setup.py index dedb552e..233e5193 100644 --- a/setup.py +++ b/setup.py @@ -71,7 +71,7 @@ test_requirements = [ "pytype==2021.11.29", - "ruff", + "ruff==0.1.2", "pytest", "pytest-subtests", "netcdf4", diff --git a/weather_dl/download_pipeline/util.py b/weather_dl/download_pipeline/util.py index 3e92b8d8..dd09ba29 100644 --- a/weather_dl/download_pipeline/util.py +++ b/weather_dl/download_pipeline/util.py @@ -102,7 +102,7 @@ def to_json_serializable_type(value: t.Any) -> t.Any: return None elif np.issubdtype(type(value), np.floating): return float(value) - elif type(value) == np.ndarray: + elif isinstance(value, np.ndarray): # Will return a scaler if array is of size 1, else will return a list. return value.tolist() elif isinstance(value, datetime.datetime) or isinstance(value, str) or isinstance(value, np.datetime64): @@ -126,7 +126,7 @@ def to_json_serializable_type(value: t.Any) -> t.Any: # We assume here that naive timestamps are in UTC timezone. return value.replace(tzinfo=datetime.timezone.utc).isoformat() - elif type(value) == np.timedelta64: + elif isinstance(value, np.timedelta64): # Return time delta in seconds. return float(value / np.timedelta64(1, 's')) # This check must happen after processing np.timedelta64 and np.datetime64. From 440dfa696a64b5d2cdd2d978fe68000c5c68dca2 Mon Sep 17 00:00:00 2001 From: Rahul Mahrsee <86819420+mahrsee1997@users.noreply.github.com> Date: Wed, 1 Nov 2023 21:53:52 +0530 Subject: [PATCH 14/16] weather-dl v2 (#408) * Initial Commit. 
* Replaced in-memory db with db layer (FirestoreClient()). * Updated api_endpoints & added server k8s dep. conf * Updated 'manifest' system to uses 'database' layer * Updated Dockerfile to create server image. * Added nginx server. * fix linter issues. * Inital commit for license deployment. * Fix license deployment * Made changes to deploy license from fastapi server. * Minor directory restructuring. * Minor fixes related to license-deployment. * weather-dl-v2 `cli` (#346) * Cli Init * Added network service. * Added Download subcommand and service. * Added Queue subcommand and service. * Added License subcommand and service. * Updated readme and dockerfile. * Update README.md * Update Dockerfile * updated readme * added missing logger * added logs before exiting in network service * Made use of google-secret-manager for storing license keys. * Necessary code changes in weather-dl-v2 cli. * Added CLI & API Interaction doc. & Implemented absolute priority. * Removed nginx-server -- not required. * dl-v2 server refactor no tests (#358) * Refactored DB layer. * added logger * added logger to new db layer * added mock databases * added mock databases * Added Integration tests for download. * Fixed license handler and router after mc. * Added license integration tests * Added integrations tests for queue. * fixed lint issues * ignore ruff for a test * removed old db_service * refactor license deployment * updated queue modify priority * removed integration tests * fastapi server minor fixes * updated logger * Update main.py * revert pyproject.toml * Update pyproject.toml * Update pyproject.toml * fixes in license deployment * updated license handler for secret manager * `dl-v2` server integration tests (#359) * added tests for fast-api server * added test for absolute license priority * minor fix * updated cli create-instance command & lint fixes (#364) * updated vm creation command * cli lint fixes * revert pyproj * seperated startup script * updated readme * fixed typo * Pending deployment bootup (#366) * Added pending deployments on server bootup. * minor refactoring in db layer * using create license deployment from license route * updated args in downloader * minor fix * Updated README.md. * formatting by pyink for dl_v2 (#367) * added pyink to env ymls * formatting changes * updated readme * lint fixes * remove pyink from env * passing license handler to update license handler (#369) * `dl-v2` server config (#370) * added server config * added global state for config and db session * seperated server config json * minor fix * minor nit: updated var name * Added config status to dl-v2-server (#371) * parallel queury for aggregating config stats and manifest handler * get download routes * mock manifest_handler and updated download tests * updated manifest collection in config * lint fixes * updated config stat response * removed extra db call for get download by config * updated manifest collection name * lint fixes * `dl-v2` server firestore async client (#375) * added firestore async client * minor fixes * updated database handlers * Readme updates. * .gitkeep -> config_files folder. * fixed download add call. * queue handler fix * Fix _update_queues_on_stop_download db call. * Fix file upload on fastapi server. * Improved logging. * Fix download add functionality. * cli condition fix * adding additional configs from queue priority * Added a check in priority_queue list route. * lint fixes * print -> logger.info in queue_handler.py. 
* removed print statements --------- Co-authored-by: Rahul Mahrsee * Implemented force-download. * await create_deployment call in main.py fast-api server. * Improved config partitioning -- maintain status. * refetch config (#378) * added config refetching * added cli command for refetch * updated docs * added license check * loader for cli * Assign download jobs to its dedicated node-pool. * `dl-v2` improvements (#380) * added deployment config * added cli config * command to see and update server ip address. * added basic tools to docker container * lint fixes * minor fix, removed pring stmt * minor fix, cli * minor fix, cli * minor fix, license dep * minor fix, server * `dl-v2` imporvements 2 (#381) * added gsutil to cli container * updated license id as user input * added license_id to cli license add command * minor fix * updated cli readme * minor fix * added validation for license id (#382) * `dl-v2` additional download filters (#383) * added additonal filter params to download * added additional filter to cli validation * minor fixes * minor nit and updated cli docs * minor doc fix * allow multiple filters fix * updated inprogress filter * updated help string cli * Update cli docs * `dl-v2` improvements 3 (#384) * added downloader config * added downloader k8 image to deployment_config * added license deployment image to server_config * added license id to error msg for failed manifest * lint fixes * minor nit * minor nit * `dl-v2` improvements 4 (#386) * minor fixes and upgrades * updated network service in cli * `weather-dl-v2` cli show table (#388) * added option to view data as table * updated dependencies * minor fix * set table default response * `dl-v2` server queue client name bug (#389) * Updating queue client_name when license is updated. 
* nit: updated method name * `dl-v2` license dep logger (#390) * updated logger * fixed logger * `dl-v2` cli minor updates (#393) * minor changes * lint fix * `dl-v2` added gcs storage and updated retry partition (#394) * added GCS storage capabltiy to server * updated refetch partition to use config from gcs storage * lint fixes * added cli command to show config and updated docs * nit fixes * updated comments for dl-v2 * `dl-v2` cli doc update (#395) * updated cli docs * updated cli doc * `dl v2` config map (#397) * updated config files for all services * added config maps to all services * added config.json * updated readmes * fixed downloader yml * updated config.json(s) * removed config folder * updated get_config * updated readmes * `dl-v2` pre merge (#403) * lint fixes in downloader_k8s * removed comment * `dl v2` added license to dl-v2 (#405) * lint fixes in downloader_k8s * removed comment * added license to dl-v2 * `dl-v2` todos (#407) * added todos * nits * removed old todos * nit * `dl v2` lint fixes (#409) * lint fixes * minor fix * `dl-v2` server test fix (#412) * fixed license tests * fixed download tests * fixed license tests * updated name for missing config condition * updated readme * skipping dl-v2 tests in CI * lint fixes (#413) --------- Co-authored-by: aniketinfocusp <122869307+aniketinfocusp@users.noreply.github.com> --- .github/workflows/ci.yml | 2 +- pyproject.toml | 2 +- weather_dl_v2/README.md | 12 + weather_dl_v2/__init__.py | 13 + weather_dl_v2/cli/CLI-Documentation.md | 306 +++++++++++ weather_dl_v2/cli/Dockerfile | 43 ++ weather_dl_v2/cli/README.md | 58 ++ weather_dl_v2/cli/app/__init__.py | 13 + weather_dl_v2/cli/app/cli_config.py | 62 +++ weather_dl_v2/cli/app/main.py | 48 ++ weather_dl_v2/cli/app/services/__init__.py | 13 + .../cli/app/services/download_service.py | 128 +++++ .../cli/app/services/license_service.py | 107 ++++ .../cli/app/services/network_service.py | 85 +++ .../cli/app/services/queue_service.py | 101 ++++ weather_dl_v2/cli/app/subcommands/__init__.py | 13 + weather_dl_v2/cli/app/subcommands/config.py | 60 ++ weather_dl_v2/cli/app/subcommands/download.py | 102 ++++ weather_dl_v2/cli/app/subcommands/license.py | 104 ++++ weather_dl_v2/cli/app/subcommands/queue.py | 111 ++++ weather_dl_v2/cli/app/utils.py | 168 ++++++ weather_dl_v2/cli/cli_config.json | 4 + weather_dl_v2/cli/environment.yml | 14 + weather_dl_v2/cli/setup.py | 30 + weather_dl_v2/cli/vm-startup.sh | 4 + weather_dl_v2/config.json | 11 + .../downloader_kubernetes/Dockerfile | 46 ++ weather_dl_v2/downloader_kubernetes/README.md | 23 + .../downloader_kubernetes/downloader.py | 73 +++ .../downloader_config.py | 65 +++ .../downloader_kubernetes/environment.yml | 17 + .../downloader_kubernetes/manifest.py | 503 +++++++++++++++++ weather_dl_v2/downloader_kubernetes/util.py | 226 ++++++++ .../fastapi-server/API-Interactions.md | 25 + weather_dl_v2/fastapi-server/Dockerfile | 40 ++ weather_dl_v2/fastapi-server/README.md | 91 +++ weather_dl_v2/fastapi-server/__init__.py | 13 + .../config_processing/config.py | 120 ++++ .../config_processing/manifest.py | 513 +++++++++++++++++ .../config_processing/parsers.py | 507 +++++++++++++++++ .../config_processing/partition.py | 129 +++++ .../config_processing/pipeline.py | 69 +++ .../config_processing/stores.py | 122 ++++ .../fastapi-server/config_processing/util.py | 229 ++++++++ .../fastapi-server/database/__init__.py | 13 + .../database/download_handler.py | 160 ++++++ .../database/license_handler.py | 200 +++++++ 
.../database/manifest_handler.py | 181 ++++++ .../fastapi-server/database/queue_handler.py | 247 +++++++++ .../fastapi-server/database/session.py | 79 +++ .../database/storage_handler.py | 77 +++ weather_dl_v2/fastapi-server/environment.yml | 18 + weather_dl_v2/fastapi-server/example.cfg | 32 ++ .../license_dep/deployment_creator.py | 67 +++ .../license_dep/license_deployment.yaml | 35 ++ weather_dl_v2/fastapi-server/logging.conf | 36 ++ weather_dl_v2/fastapi-server/main.py | 70 +++ .../fastapi-server/routers/download.py | 386 +++++++++++++ .../fastapi-server/routers/license.py | 202 +++++++ .../fastapi-server/routers/queues.py | 124 +++++ weather_dl_v2/fastapi-server/server.yaml | 93 ++++ weather_dl_v2/fastapi-server/server_config.py | 72 +++ .../fastapi-server/tests/__init__.py | 13 + .../tests/integration/__init__.py | 13 + .../tests/integration/test_download.py | 175 ++++++ .../tests/integration/test_license.py | 207 +++++++ .../tests/integration/test_queues.py | 148 +++++ .../tests/test_data/example.cfg | 32 ++ .../tests/test_data/not_exist.cfg | 32 ++ weather_dl_v2/license_deployment/Dockerfile | 34 ++ weather_dl_v2/license_deployment/README.md | 21 + weather_dl_v2/license_deployment/__init__.py | 13 + weather_dl_v2/license_deployment/clients.py | 417 ++++++++++++++ weather_dl_v2/license_deployment/config.py | 120 ++++ weather_dl_v2/license_deployment/database.py | 161 ++++++ .../license_deployment/deployment_config.py | 69 +++ .../license_deployment/downloader.yaml | 33 ++ .../license_deployment/environment.yml | 17 + weather_dl_v2/license_deployment/fetch.py | 139 +++++ .../license_deployment/job_creator.py | 58 ++ weather_dl_v2/license_deployment/manifest.py | 520 ++++++++++++++++++ weather_dl_v2/license_deployment/util.py | 239 ++++++++ 82 files changed, 8976 insertions(+), 2 deletions(-) create mode 100644 weather_dl_v2/README.md create mode 100644 weather_dl_v2/__init__.py create mode 100644 weather_dl_v2/cli/CLI-Documentation.md create mode 100644 weather_dl_v2/cli/Dockerfile create mode 100644 weather_dl_v2/cli/README.md create mode 100644 weather_dl_v2/cli/app/__init__.py create mode 100644 weather_dl_v2/cli/app/cli_config.py create mode 100644 weather_dl_v2/cli/app/main.py create mode 100644 weather_dl_v2/cli/app/services/__init__.py create mode 100644 weather_dl_v2/cli/app/services/download_service.py create mode 100644 weather_dl_v2/cli/app/services/license_service.py create mode 100644 weather_dl_v2/cli/app/services/network_service.py create mode 100644 weather_dl_v2/cli/app/services/queue_service.py create mode 100644 weather_dl_v2/cli/app/subcommands/__init__.py create mode 100644 weather_dl_v2/cli/app/subcommands/config.py create mode 100644 weather_dl_v2/cli/app/subcommands/download.py create mode 100644 weather_dl_v2/cli/app/subcommands/license.py create mode 100644 weather_dl_v2/cli/app/subcommands/queue.py create mode 100644 weather_dl_v2/cli/app/utils.py create mode 100644 weather_dl_v2/cli/cli_config.json create mode 100644 weather_dl_v2/cli/environment.yml create mode 100644 weather_dl_v2/cli/setup.py create mode 100644 weather_dl_v2/cli/vm-startup.sh create mode 100644 weather_dl_v2/config.json create mode 100644 weather_dl_v2/downloader_kubernetes/Dockerfile create mode 100644 weather_dl_v2/downloader_kubernetes/README.md create mode 100644 weather_dl_v2/downloader_kubernetes/downloader.py create mode 100644 weather_dl_v2/downloader_kubernetes/downloader_config.py create mode 100644 weather_dl_v2/downloader_kubernetes/environment.yml create mode 100644 
weather_dl_v2/downloader_kubernetes/manifest.py create mode 100644 weather_dl_v2/downloader_kubernetes/util.py create mode 100644 weather_dl_v2/fastapi-server/API-Interactions.md create mode 100644 weather_dl_v2/fastapi-server/Dockerfile create mode 100644 weather_dl_v2/fastapi-server/README.md create mode 100644 weather_dl_v2/fastapi-server/__init__.py create mode 100644 weather_dl_v2/fastapi-server/config_processing/config.py create mode 100644 weather_dl_v2/fastapi-server/config_processing/manifest.py create mode 100644 weather_dl_v2/fastapi-server/config_processing/parsers.py create mode 100644 weather_dl_v2/fastapi-server/config_processing/partition.py create mode 100644 weather_dl_v2/fastapi-server/config_processing/pipeline.py create mode 100644 weather_dl_v2/fastapi-server/config_processing/stores.py create mode 100644 weather_dl_v2/fastapi-server/config_processing/util.py create mode 100644 weather_dl_v2/fastapi-server/database/__init__.py create mode 100644 weather_dl_v2/fastapi-server/database/download_handler.py create mode 100644 weather_dl_v2/fastapi-server/database/license_handler.py create mode 100644 weather_dl_v2/fastapi-server/database/manifest_handler.py create mode 100644 weather_dl_v2/fastapi-server/database/queue_handler.py create mode 100644 weather_dl_v2/fastapi-server/database/session.py create mode 100644 weather_dl_v2/fastapi-server/database/storage_handler.py create mode 100644 weather_dl_v2/fastapi-server/environment.yml create mode 100644 weather_dl_v2/fastapi-server/example.cfg create mode 100644 weather_dl_v2/fastapi-server/license_dep/deployment_creator.py create mode 100644 weather_dl_v2/fastapi-server/license_dep/license_deployment.yaml create mode 100644 weather_dl_v2/fastapi-server/logging.conf create mode 100644 weather_dl_v2/fastapi-server/main.py create mode 100644 weather_dl_v2/fastapi-server/routers/download.py create mode 100644 weather_dl_v2/fastapi-server/routers/license.py create mode 100644 weather_dl_v2/fastapi-server/routers/queues.py create mode 100644 weather_dl_v2/fastapi-server/server.yaml create mode 100644 weather_dl_v2/fastapi-server/server_config.py create mode 100644 weather_dl_v2/fastapi-server/tests/__init__.py create mode 100644 weather_dl_v2/fastapi-server/tests/integration/__init__.py create mode 100644 weather_dl_v2/fastapi-server/tests/integration/test_download.py create mode 100644 weather_dl_v2/fastapi-server/tests/integration/test_license.py create mode 100644 weather_dl_v2/fastapi-server/tests/integration/test_queues.py create mode 100644 weather_dl_v2/fastapi-server/tests/test_data/example.cfg create mode 100644 weather_dl_v2/fastapi-server/tests/test_data/not_exist.cfg create mode 100644 weather_dl_v2/license_deployment/Dockerfile create mode 100644 weather_dl_v2/license_deployment/README.md create mode 100644 weather_dl_v2/license_deployment/__init__.py create mode 100644 weather_dl_v2/license_deployment/clients.py create mode 100644 weather_dl_v2/license_deployment/config.py create mode 100644 weather_dl_v2/license_deployment/database.py create mode 100644 weather_dl_v2/license_deployment/deployment_config.py create mode 100644 weather_dl_v2/license_deployment/downloader.yaml create mode 100644 weather_dl_v2/license_deployment/environment.yml create mode 100644 weather_dl_v2/license_deployment/fetch.py create mode 100644 weather_dl_v2/license_deployment/job_creator.py create mode 100644 weather_dl_v2/license_deployment/manifest.py create mode 100644 weather_dl_v2/license_deployment/util.py diff --git 
a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bf28a5e4..b8d0839f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,7 +61,7 @@ jobs: run: python -m metview selfcheck - name: Run unit tests shell: bash -l {0} - run: pytest --memray + run: pytest --memray --ignore=weather_dl_v2 # Ignoring dl-v2 as it only supports py3.10 lint: runs-on: ubuntu-latest strategy: diff --git a/pyproject.toml b/pyproject.toml index 8e782e8d..2eaadb0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,4 +42,4 @@ target-version = "py310" [tool.ruff.mccabe] # Unlike Flake8, default to a complexity level of 10. -max-complexity = 10 \ No newline at end of file +max-complexity = 10 diff --git a/weather_dl_v2/README.md b/weather_dl_v2/README.md new file mode 100644 index 00000000..ea7b7bb5 --- /dev/null +++ b/weather_dl_v2/README.md @@ -0,0 +1,12 @@ +## weather-dl-v2 + + + +> **_NOTE:_** weather-dl-v2 only supports python 3.10 + +### Sequence of steps: +1) Refer to downloader_kubernetes/README.md +2) Refer to license_deployment/README.md +3) Refer to fastapi-server/README.md +4) Refer to cli/README.md + diff --git a/weather_dl_v2/__init__.py b/weather_dl_v2/__init__.py new file mode 100644 index 00000000..5678014c --- /dev/null +++ b/weather_dl_v2/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/weather_dl_v2/cli/CLI-Documentation.md b/weather_dl_v2/cli/CLI-Documentation.md new file mode 100644 index 00000000..ea16bb6b --- /dev/null +++ b/weather_dl_v2/cli/CLI-Documentation.md @@ -0,0 +1,306 @@ +# CLI Documentation +The following doc provides cli commands and their various arguments and options. + +Base Command: +``` +weather-dl-v2 +``` + +## Ping +Ping the FastAPI server and check if it’s live and reachable. + + weather-dl-v2 ping + +##### Usage +``` +weather-dl-v2 ping +``` + + +
+ +## Download +Manage download configs. + +### Add Downloads + `weather-dl-v2 download add <FILE_PATH> -l <LICENSE> [--force-download]` + +Adds a new download config to specific licenses. + +##### Arguments +> `FILE_PATH` : Path to config file. + +##### Options +> `-l/--license` (Required): License ID to which this download has to be added. +> `-f/--force-download` : Force redownload of partitions that were previously downloaded. + +##### Usage +``` +weather-dl-v2 download add /path/to/example.cfg -l L1 -l L2 [--force-download] +``` +
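+The config passed to `download add` is a weather-dl `*.cfg` file. The sketch below is only
+illustrative: the client, dataset, target path, and selection values are placeholders rather than
+a tested configuration, and the full set of supported fields is described in the weather-dl docs.
+
+```
+[parameters]
+client=cds
+dataset=reanalysis-era5-single-levels
+target_path=gs://my-bucket/era5/{year}-{month}.nc
+partition_keys=
+    year
+    month
+
+[selection]
+product_type=reanalysis
+variable=2m_temperature
+year=2021
+month=07
+time=00:00
+format=netcdf
+```
+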
+### List Downloads + `weather-dl-v2 download list [--filter <filter_key=filter_value>]` + +List all the active downloads. + +The list can also be filtered by client name or status. Available filters: +``` +Filter Key: client_name +Values: cds, mars, ecpublic + +Filter Key: status +Values: completed, failed, in-progress +``` + +##### Options +> `--filter` : Filter the list by some key and value. Format: filter_key=filter_value + +##### Usage +``` +weather-dl-v2 download list +weather-dl-v2 download list --filter client_name=cds +weather-dl-v2 download list --filter status=completed +weather-dl-v2 download list --filter status=failed +weather-dl-v2 download list --filter status=in-progress +weather-dl-v2 download list --filter client_name=cds --filter status=completed +``` + +### Download Get + `weather-dl-v2 download get <CONFIG_NAME>`
+ +Get a particular download by config name. + +##### Arguments +> `CONFIG_NAME` : Name of the download config. + +##### Usage +``` +weather-dl-v2 download get example.cfg +``` + +### Download Show + `weather-dl-v2 download show <CONFIG_NAME>`
+ +Get contents of a particular config by config name. + +##### Arguments +> `CONFIG_NAME` : Name of the download config. + +##### Usage +``` +weather-dl-v2 download show example.cfg +``` + +### Download Remove + `weather-dl-v2 download remove <CONFIG_NAME>`
+ +Remove a download by config name. + +##### Arguments +> `CONFIG_NAME` : Name of the download config. + +##### Usage +``` +weather-dl-v2 download remove example.cfg +``` + +### Download Refetch + `weather-dl-v2 download refetch <CONFIG_NAME> -l <LICENSE>`
+ +Refetch all non-successful partitions of a config. + +##### Arguments +> `CONFIG_NAME` : Name of the download config. + +##### Options +> `-l/--license` (Required): License ID to which this download has to be added. + +##### Usage +``` +weather-dl-v2 download refetch example.cfg -l L1 -l L2 +``` + +
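+A typical end-to-end flow chains the commands above; the config name, license IDs, and filter
+value here are illustrative:
+
+```
+# Submit a config against two licenses.
+weather-dl-v2 download add /path/to/example.cfg -l L1 -l L2
+
+# Check the config and its partition status.
+weather-dl-v2 download get example.cfg
+
+# Reschedule anything that did not succeed.
+weather-dl-v2 download list --filter status=failed
+weather-dl-v2 download refetch example.cfg -l L1
+```
+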
+ +## License +Manage licenses. + +### License Add + `weather-dl-v2 license add <FILE_PATH>` + +Add a new license. New licenses are added using a json file. + +The json file should be in this format: +``` +{ + "license_id": <license_id>, + "client_name": <client_name>, + "number_of_requests": <number_of_requests>, + "secret_id": <secret_id> +} +``` +NOTE: `license_id` is case insensitive and has to be unique for each license. + +##### Arguments +> `FILE_PATH` : Path to the license json. + +##### Usage +``` +weather-dl-v2 license add /path/to/new-license.json +``` +
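+For concreteness, a filled-in license file might look like the sketch below. The license ID,
+client name, request quota, and Secret Manager resource name are placeholder values, not a
+working configuration:
+
+```
+{
+  "license_id": "L1",
+  "client_name": "cds",
+  "number_of_requests": 5,
+  "secret_id": "projects/my-project/secrets/cds-license-key/versions/1"
+}
+```
+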
+### License Get + `weather-dl-v2 license get <LICENSE>` + +Get a particular license by license ID. + +##### Arguments +> `LICENSE` : License ID of the license to be fetched. + +##### Usage +``` +weather-dl-v2 license get L1 +``` + +### License Remove + `weather-dl-v2 license remove <LICENSE>`
+ +Remove a particular license by license ID. + +##### Arguments +> `LICENSE` : License ID of the license to be removed. + +##### Usage +``` +weather-dl-v2 license remove L1 +``` + +### License List + `weather-dl-v2 license list [--filter client_name=<client_name>]`
+ +List all the licenses available. + +The list can also be filtered by client name. + +##### Options +> `--filter` : Filter the list by some key and value. Format: filter_key=filter_value. + +##### Usage +``` +weather-dl-v2 license list +weather-dl-v2 license list --filter client_name=cds +``` + +### License Update + `weather-dl-v2 license update <LICENSE> <FILE_PATH>`
+ +Update an existing license using a license ID and a license json. + +The json should be of the same format used to add a new license. + +##### Arguments +> `LICENSE` : License ID of the license to be edited. +> `FILE_PATH` : Path to the license json. + +##### Usage +``` +weather-dl-v2 license update L1 /path/to/license.json +``` + +
+ +## Queue +Manage all the license queues. + +### Queue List + `weather-dl-v2 queue list [--filter client_name=<client_name>]` + +List all the queues. + +The list can also be filtered by client name. + +##### Options +> `--filter` : Filter the list by some key and value. Format: filter_key=filter_value. + +##### Usage +``` +weather-dl-v2 queue list +weather-dl-v2 queue list --filter client_name=cds +``` + +### Queue Get + `weather-dl-v2 queue get <LICENSE>`
+ +Get a queue by license ID. + +##### Arguments +> `LICENSE` : License ID of the queue to be fetched. + +##### Usage +``` +weather-dl-v2 queue get L1 +``` + +### Queue Edit + `weather-dl-v2 queue edit <LICENSE> [--file <FILE_PATH> | --config <CONFIG_NAME> --priority <PRIORITY>]`
+ +Edit the priority of configs inside a license queue. + +Priority can be edited in two ways: +1. The new priority order is passed using a priority json file that follows this format: +``` +{ + "priority": ["c1.cfg", "c3.cfg", "c2.cfg"] +} +``` +2. A config file name and its absolute priority can be passed, which updates the priority of that particular config file in the mentioned license queue (see the worked sketch after this section). + +##### Arguments +> `LICENSE` : License ID of the queue to be edited. + +##### Options +> `-f/--file` : Path of the new priority json file. +> `-c/--config` : Config name for absolute priority. +> `-p/--priority` : Absolute priority for the config in a license queue. Priority increases in ascending order, with 0 being the highest priority. + +##### Usage +``` +weather-dl-v2 queue edit L1 --file /path/to/priority.json +weather-dl-v2 queue edit L1 --config example.cfg --priority 0 +``` + +
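+As a rough illustration of the two edit modes (the starting queue order and config names are
+hypothetical, and this assumes priority N corresponds to position N in the queue):
+
+```
+# Suppose the L1 queue is currently: a.cfg, b.cfg, c.cfg
+weather-dl-v2 queue edit L1 --config c.cfg --priority 0
+# c.cfg moves to the front: c.cfg, a.cfg, b.cfg
+
+# The same order could be set in one shot with a priority file,
+# e.g. priority.json containing {"priority": ["c.cfg", "a.cfg", "b.cfg"]}:
+weather-dl-v2 queue edit L1 --file /path/to/priority.json
+```
+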
+ +## Config +Configurations for cli. + +### Config Show IP + `weather-dl-v2 config show-ip` + +See the current server IP address. +
+ +##### Usage +``` +weather-dl-v2 config show-ip +``` + +### Config Set IP + `weather-dl-v2 config set-ip <NEW_IP>` + +Update the server IP address stored in the cli's local `cli_config.json` (a sketch of that file follows below). +
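+The cli reads the server address from a `cli_config.json` in its working directory; per
+`app/cli_config.py` and the validator in `app/subcommands/config.py`, the file carries exactly
+the keys `pod_ip` and `port`, from which the base URI `http://<pod_ip>:<port>` is built. A
+minimal sketch with placeholder values:
+
+```
+{
+  "pod_ip": "127.0.0.1",
+  "port": "8080"
+}
+```
+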
+ +##### Arguments +> `NEW_IP` : New IP address. (Do not add port or protocol). + +##### Usage +``` +weather-dl-v2 config set-ip 127.0.0.1 +``` + diff --git a/weather_dl_v2/cli/Dockerfile b/weather_dl_v2/cli/Dockerfile new file mode 100644 index 00000000..ec3536be --- /dev/null +++ b/weather_dl_v2/cli/Dockerfile @@ -0,0 +1,43 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +FROM continuumio/miniconda3:latest + +COPY . . + +# Add the mamba solver for faster builds +RUN conda install -n base conda-libmamba-solver +RUN conda config --set solver libmamba + +# Create conda env using environment.yml +RUN conda update conda -y +RUN conda env create --name weather-dl-v2-cli --file=environment.yml + +# Activate the conda env and update the PATH +ARG CONDA_ENV_NAME=weather-dl-v2-cli +RUN echo "source activate ${CONDA_ENV_NAME}" >> ~/.bashrc +ENV PATH /opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH + +RUN apt-get update -y +RUN apt-get install nano -y +RUN apt-get install vim -y +RUN apt-get install curl -y + +# Install gsutil +RUN curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-443.0.0-linux-arm.tar.gz +RUN tar -xf google-cloud-cli-443.0.0-linux-arm.tar.gz +RUN ./google-cloud-sdk/install.sh --quiet +RUN echo "if [ -f '/google-cloud-sdk/path.bash.inc' ]; then . '/google-cloud-sdk/path.bash.inc'; fi" >> /root/.bashrc +RUN echo "if [ -f '/google-cloud-sdk/completion.bash.inc' ]; then . '/google-cloud-sdk/completion.bash.inc'; fi" >> /root/.bashrc diff --git a/weather_dl_v2/cli/README.md b/weather_dl_v2/cli/README.md new file mode 100644 index 00000000..a4f4932f --- /dev/null +++ b/weather_dl_v2/cli/README.md @@ -0,0 +1,58 @@ +# weather-dl-cli +This is a command line interface for talking to the weather-dl-v2 FastAPI server. + +- Due to our org level policy we can't expose external-ip using LoadBalancer Service +while deploying our FastAPI server. Hence we need to deploy the CLI on a VM to interact +through our fastapi server. + +Replace the FastAPI server pod's IP in cli_config.json. +``` +Please make approriate changes in cli_config.json, if required. +``` +> Note: Command to get the Pod IP : `kubectl get pods -o wide`. +> +> Though note that in case of Pod restart IP might get change. So we need to look +> for better solution for the same. + +## Create docker image for weather-dl-cli + +``` +export PROJECT_ID= +export REPO= eg:weather-tools + +gcloud builds submit . 
--tag "gcr.io/$PROJECT_ID/$REPO:weather-dl-v2-cli" --timeout=79200 --machine-type=e2-highcpu-32 +``` + +## Create a VM using above created docker-image +``` +export ZONE= eg: us-west1-a +export SERVICE_ACCOUNT= # Let's keep this as Compute Engine Default Service Account +export IMAGE_PATH= # The above created image-path + +gcloud compute instances create-with-container weather-dl-v2-cli \ + --project=$PROJECT_ID \ + --zone=$ZONE \ + --machine-type=e2-medium \ + --network-interface=network-tier=PREMIUM,subnet=default \ + --maintenance-policy=MIGRATE \ + --provisioning-model=STANDARD \ + --service-account=$SERVICE_ACCOUNT \ + --scopes=https://www.googleapis.com/auth/cloud-platform \ + --tags=http-server,https-server \ + --image=projects/cos-cloud/global/images/cos-stable-105-17412-101-24 \ + --boot-disk-size=10GB \ + --boot-disk-type=pd-balanced \ + --boot-disk-device-name=weather-dl-v2-cli \ + --container-image=$IMAGE_PATH \ + --container-restart-policy=on-failure \ + --container-tty \ + --no-shielded-secure-boot \ + --shielded-vtpm \ + --labels=goog-ec-src=vm_add-gcloud,container-vm=cos-stable-105-17412-101-24 \ + --metadata-from-file=startup-script=vm-startup.sh +``` + +## Use the cli after doing ssh in the above created VM +``` +weather-dl-v2 --help +``` diff --git a/weather_dl_v2/cli/app/__init__.py b/weather_dl_v2/cli/app/__init__.py new file mode 100644 index 00000000..5678014c --- /dev/null +++ b/weather_dl_v2/cli/app/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/weather_dl_v2/cli/app/cli_config.py b/weather_dl_v2/cli/app/cli_config.py new file mode 100644 index 00000000..9bfeb1de --- /dev/null +++ b/weather_dl_v2/cli/app/cli_config.py @@ -0,0 +1,62 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import dataclasses +import typing as t +import json +import os + +Values = t.Union[t.List["Values"], t.Dict[str, "Values"], bool, int, float, str] # pytype: disable=not-supported-yet + + +@dataclasses.dataclass +class CliConfig: + pod_ip: str = "" + port: str = "" + + @property + def BASE_URI(self) -> str: + return f"http://{self.pod_ip}:{self.port}" + + kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict) + + @classmethod + def from_dict(cls, config: t.Dict): + config_instance = cls() + + for key, value in config.items(): + if hasattr(config_instance, key): + setattr(config_instance, key, value) + else: + config_instance.kwargs[key] = value + + return config_instance + + +cli_config = None + + +def get_config(): + global cli_config + # TODO: Update this so cli can work from any folder level. + # Right now it only works in folder where cli_config.json is present. + cli_config_json = os.path.join(os.getcwd(), "cli_config.json") + + if cli_config is None: + with open(cli_config_json) as file: + firestore_dict = json.load(file) + cli_config = CliConfig.from_dict(firestore_dict) + + return cli_config diff --git a/weather_dl_v2/cli/app/main.py b/weather_dl_v2/cli/app/main.py new file mode 100644 index 00000000..03a52577 --- /dev/null +++ b/weather_dl_v2/cli/app/main.py @@ -0,0 +1,48 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import typer +import logging +from app.cli_config import get_config +import requests +from app.subcommands import download, queue, license, config +from app.utils import Loader + +logger = logging.getLogger(__name__) + +app = typer.Typer( + help="weather-dl-v2 is a cli tool for communicating with FastAPI server." +) + +app.add_typer(download.app, name="download", help="Manage downloads.") +app.add_typer(queue.app, name="queue", help="Manage queues.") +app.add_typer(license.app, name="license", help="Manage licenses.") +app.add_typer(config.app, name="config", help="Configurations for cli.") + + +@app.command("ping", help="Check if FastAPI server is live and rechable.") +def ping(): + uri = f"{get_config().BASE_URI}/" + try: + with Loader("Sending request..."): + x = requests.get(uri) + except Exception as e: + print(f"error {e}") + return + print(x.text) + + +if __name__ == "__main__": + app() diff --git a/weather_dl_v2/cli/app/services/__init__.py b/weather_dl_v2/cli/app/services/__init__.py new file mode 100644 index 00000000..5678014c --- /dev/null +++ b/weather_dl_v2/cli/app/services/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/weather_dl_v2/cli/app/services/download_service.py b/weather_dl_v2/cli/app/services/download_service.py new file mode 100644 index 00000000..4d467271 --- /dev/null +++ b/weather_dl_v2/cli/app/services/download_service.py @@ -0,0 +1,128 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import abc +import logging +import json +import typing as t +from app.services.network_service import network_service +from app.cli_config import get_config + +logger = logging.getLogger(__name__) + + +class DownloadService(abc.ABC): + + @abc.abstractmethod + def _list_all_downloads(self): + pass + + @abc.abstractmethod + def _list_all_downloads_by_filter(self, filter_dict: dict): + pass + + @abc.abstractmethod + def _get_download_by_config(self, config_name: str): + pass + + @abc.abstractmethod + def _show_config_content(self, config_name: str): + pass + + @abc.abstractmethod + def _add_new_download( + self, file_path: str, licenses: t.List[str], force_download: bool + ): + pass + + @abc.abstractmethod + def _remove_download(self, config_name: str): + pass + + @abc.abstractmethod + def _refetch_config_partitions(self, config_name: str, licenses: t.List[str]): + pass + + +class DownloadServiceNetwork(DownloadService): + + def __init__(self): + self.endpoint = f"{get_config().BASE_URI}/download" + + def _list_all_downloads(self): + return network_service.get( + uri=self.endpoint, header={"accept": "application/json"} + ) + + def _list_all_downloads_by_filter(self, filter_dict: dict): + return network_service.get( + uri=self.endpoint, + header={"accept": "application/json"}, + query=filter_dict, + ) + + def _get_download_by_config(self, config_name: str): + return network_service.get( + uri=f"{self.endpoint}/{config_name}", + header={"accept": "application/json"}, + ) + + def _show_config_content(self, config_name: str): + return network_service.get( + uri=f"{self.endpoint}/show/{config_name}", + header={"accept": "application/json"}, + ) + + def _add_new_download( + self, file_path: str, licenses: t.List[str], force_download: bool + ): + try: + file = {"file": open(file_path, "rb")} + except FileNotFoundError: + return "File not found." 
+ + return network_service.post( + uri=self.endpoint, + header={"accept": "application/json"}, + file=file, + payload={"licenses": licenses}, + query={"force_download": force_download}, + ) + + def _remove_download(self, config_name: str): + return network_service.delete( + uri=f"{self.endpoint}/{config_name}", header={"accept": "application/json"} + ) + + def _refetch_config_partitions(self, config_name: str, licenses: t.List[str]): + return network_service.post( + uri=f"{self.endpoint}/retry/{config_name}", + header={"accept": "application/json"}, + payload=json.dumps({"licenses": licenses}), + ) + + +class DownloadServiceMock(DownloadService): + pass + + +def get_download_service(test: bool = False): + if test: + return DownloadServiceMock() + else: + return DownloadServiceNetwork() + + +download_service = get_download_service() diff --git a/weather_dl_v2/cli/app/services/license_service.py b/weather_dl_v2/cli/app/services/license_service.py new file mode 100644 index 00000000..09ff4f3c --- /dev/null +++ b/weather_dl_v2/cli/app/services/license_service.py @@ -0,0 +1,107 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import abc +import logging +import json +from app.services.network_service import network_service +from app.cli_config import get_config + +logger = logging.getLogger(__name__) + + +class LicenseService(abc.ABC): + + @abc.abstractmethod + def _get_all_license(self): + pass + + @abc.abstractmethod + def _get_all_license_by_client_name(self, client_name: str): + pass + + @abc.abstractmethod + def _get_license_by_license_id(self, license_id: str): + pass + + @abc.abstractmethod + def _add_license(self, license_dict: dict): + pass + + @abc.abstractmethod + def _remove_license(self, license_id: str): + pass + + @abc.abstractmethod + def _update_license(self, license_id: str, license_dict: dict): + pass + + +class LicenseServiceNetwork(LicenseService): + + def __init__(self): + self.endpoint = f"{get_config().BASE_URI}/license" + + def _get_all_license(self): + return network_service.get( + uri=self.endpoint, header={"accept": "application/json"} + ) + + def _get_all_license_by_client_name(self, client_name: str): + return network_service.get( + uri=self.endpoint, + header={"accept": "application/json"}, + query={"client_name": client_name}, + ) + + def _get_license_by_license_id(self, license_id: str): + return network_service.get( + uri=f"{self.endpoint}/{license_id}", + header={"accept": "application/json"}, + ) + + def _add_license(self, license_dict: dict): + return network_service.post( + uri=self.endpoint, + header={"accept": "application/json"}, + payload=json.dumps(license_dict), + ) + + def _remove_license(self, license_id: str): + return network_service.delete( + uri=f"{self.endpoint}/{license_id}", + header={"accept": "application/json"}, + ) + + def _update_license(self, license_id: str, license_dict: dict): + return network_service.put( + uri=f"{self.endpoint}/{license_id}", + header={"accept": "application/json"}, + 
payload=json.dumps(license_dict), + ) + + +class LicenseServiceMock(LicenseService): + pass + + +def get_license_service(test: bool = False): + if test: + return LicenseServiceMock() + else: + return LicenseServiceNetwork() + + +license_service = get_license_service() diff --git a/weather_dl_v2/cli/app/services/network_service.py b/weather_dl_v2/cli/app/services/network_service.py new file mode 100644 index 00000000..4406d91b --- /dev/null +++ b/weather_dl_v2/cli/app/services/network_service.py @@ -0,0 +1,85 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import requests +import json +import logging +from app.utils import Loader, timeit + +logger = logging.getLogger(__name__) + + +class NetworkService: + + def parse_response(self, response: requests.Response): + try: + parsed = json.loads(response.text) + except Exception as e: + logger.info(f"Parsing error: {e}.") + logger.info(f"Status code {response.status_code}") + logger.info(f"Response {response.text}") + return + + if isinstance(parsed, list): + print(f"[Total {len(parsed)} items.]") + + return json.dumps(parsed, indent=3) + + @timeit + def get(self, uri, header, query=None, payload=None): + try: + with Loader("Sending request..."): + x = requests.get(uri, params=query, headers=header, data=payload) + return self.parse_response(x) + except requests.exceptions.RequestException as e: + logger.error(f"request error: {e}") + raise SystemExit(e) + + @timeit + def post(self, uri, header, query=None, payload=None, file=None): + try: + with Loader("Sending request..."): + x = requests.post( + uri, params=query, headers=header, data=payload, files=file + ) + return self.parse_response(x) + except requests.exceptions.RequestException as e: + logger.error(f"request error: {e}") + raise SystemExit(e) + + @timeit + def put(self, uri, header, query=None, payload=None, file=None): + try: + with Loader("Sending request..."): + x = requests.put( + uri, params=query, headers=header, data=payload, files=file + ) + return self.parse_response(x) + except requests.exceptions.RequestException as e: + logger.error(f"request error: {e}") + raise SystemExit(e) + + @timeit + def delete(self, uri, header, query=None): + try: + with Loader("Sending request..."): + x = requests.delete(uri, params=query, headers=header) + return self.parse_response(x) + except requests.exceptions.RequestException as e: + logger.error(f"request error: {e}") + raise SystemExit(e) + + +network_service = NetworkService() diff --git a/weather_dl_v2/cli/app/services/queue_service.py b/weather_dl_v2/cli/app/services/queue_service.py new file mode 100644 index 00000000..f6824934 --- /dev/null +++ b/weather_dl_v2/cli/app/services/queue_service.py @@ -0,0 +1,101 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import abc +import logging +import json +import typing as t +from app.services.network_service import network_service +from app.cli_config import get_config + +logger = logging.getLogger(__name__) + + +class QueueService(abc.ABC): + + @abc.abstractmethod + def _get_all_license_queues(self): + pass + + @abc.abstractmethod + def _get_license_queue_by_client_name(self, client_name: str): + pass + + @abc.abstractmethod + def _get_queue_by_license(self, license_id: str): + pass + + @abc.abstractmethod + def _edit_license_queue(self, license_id: str, priority_list: t.List[str]): + pass + + @abc.abstractmethod + def _edit_config_absolute_priority( + self, license_id: str, config_name: str, priority: int + ): + pass + + +class QueueServiceNetwork(QueueService): + + def __init__(self): + self.endpoint = f"{get_config().BASE_URI}/queues" + + def _get_all_license_queues(self): + return network_service.get( + uri=self.endpoint, header={"accept": "application/json"} + ) + + def _get_license_queue_by_client_name(self, client_name: str): + return network_service.get( + uri=self.endpoint, + header={"accept": "application/json"}, + query={"client_name": client_name}, + ) + + def _get_queue_by_license(self, license_id: str): + return network_service.get( + uri=f"{self.endpoint}/{license_id}", header={"accept": "application/json"} + ) + + def _edit_license_queue(self, license_id: str, priority_list: t.List[str]): + return network_service.post( + uri=f"{self.endpoint}/{license_id}", + header={"accept": "application/json", "Content-Type": "application/json"}, + payload=json.dumps(priority_list), + ) + + def _edit_config_absolute_priority( + self, license_id: str, config_name: str, priority: int + ): + return network_service.put( + uri=f"{self.endpoint}/priority/{license_id}", + header={"accept": "application/json"}, + query={"config_name": config_name, "priority": priority}, + ) + + +class QueueServiceMock(QueueService): + pass + + +def get_queue_service(test: bool = False): + if test: + return QueueServiceMock() + else: + return QueueServiceNetwork() + + +queue_service = get_queue_service() diff --git a/weather_dl_v2/cli/app/subcommands/__init__.py b/weather_dl_v2/cli/app/subcommands/__init__.py new file mode 100644 index 00000000..5678014c --- /dev/null +++ b/weather_dl_v2/cli/app/subcommands/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/weather_dl_v2/cli/app/subcommands/config.py b/weather_dl_v2/cli/app/subcommands/config.py new file mode 100644 index 00000000..b2a03aaf --- /dev/null +++ b/weather_dl_v2/cli/app/subcommands/config.py @@ -0,0 +1,60 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import typer +import json +import os +from typing_extensions import Annotated +from app.cli_config import get_config +from app.utils import Validator + +app = typer.Typer() + + +class ConfigValidator(Validator): + pass + + +@app.command("show-ip", help="See the current server IP address.") +def show_server_ip(): + print(f"Current pod IP: {get_config().pod_ip}") + + +@app.command("set-ip", help="Update the server IP address.") +def update_server_ip( + new_ip: Annotated[ + str, typer.Argument(help="New IP address. (Do not add port or protocol).") + ], +): + file_path = os.path.join(os.getcwd(), "cli_config.json") + cli_config = {} + with open(file_path, "r") as file: + cli_config = json.load(file) + + old_ip = cli_config["pod_ip"] + cli_config["pod_ip"] = new_ip + + with open(file_path, "w") as file: + json.dump(cli_config, file) + + validator = ConfigValidator(valid_keys=["pod_ip", "port"]) + + try: + cli_config = validator.validate_json(file_path=file_path) + except Exception as e: + print(f"payload error: {e}") + return + + print(f"Pod IP Updated {old_ip} -> {new_ip} .") diff --git a/weather_dl_v2/cli/app/subcommands/download.py b/weather_dl_v2/cli/app/subcommands/download.py new file mode 100644 index 00000000..b16a26e8 --- /dev/null +++ b/weather_dl_v2/cli/app/subcommands/download.py @@ -0,0 +1,102 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import typer +from typing_extensions import Annotated +from app.services.download_service import download_service +from app.utils import Validator, as_table +from typing import List + +app = typer.Typer(rich_markup_mode="markdown") + + +class DowloadFilterValidator(Validator): + pass + + +@app.command("list", help="List out all the configs.") +def get_downloads( + filter: Annotated[ + List[str], + typer.Option( + help="""Filter by some value. Format: filter_key=filter_value. 
Available filters """ + """[key: client_name, values: cds, mars, ecpublic] """ + """[key: status, values: completed, failed, in-progress]""" + ), + ] = [] +): + if len(filter) > 0: + validator = DowloadFilterValidator(valid_keys=["client_name", "status"]) + + try: + filter_dict = validator.validate(filters=filter, allow_missing=True) + except Exception as e: + print(f"filter error: {e}") + return + + print(as_table(download_service._list_all_downloads_by_filter(filter_dict))) + return + + print(as_table(download_service._list_all_downloads())) + + +# TODO: Add support for submitting multiple configs using *.cfg notation. +@app.command("add", help="Submit new config to download.") +def submit_download( + file_path: Annotated[ + str, typer.Argument(help="File path of config to be uploaded.") + ], + license: Annotated[List[str], typer.Option("--license", "-l", help="License ID.")], + force_download: Annotated[ + bool, + typer.Option( + "-f", + "--force-download", + help="Force redownload of partitions that were previously downloaded.", + ), + ] = False, +): + print(download_service._add_new_download(file_path, license, force_download)) + + +@app.command("get", help="Get a particular config.") +def get_download_by_config( + config_name: Annotated[str, typer.Argument(help="Config file name.")] +): + print(as_table(download_service._get_download_by_config(config_name))) + + +@app.command("show", help="Show contents of a particular config.") +def show_config( + config_name: Annotated[str, typer.Argument(help="Config file name.")] +): + print(download_service._show_config_content(config_name)) + + +@app.command("remove", help="Remove existing config.") +def remove_download( + config_name: Annotated[str, typer.Argument(help="Config file name.")] +): + print(download_service._remove_download(config_name)) + + +@app.command( + "refetch", help="Reschedule all partitions of a config that are not successful." +) +def refetch_config( + config_name: Annotated[str, typer.Argument(help="Config file name.")], + license: Annotated[List[str], typer.Option("--license", "-l", help="License ID.")], +): + print(download_service._refetch_config_partitions(config_name, license)) diff --git a/weather_dl_v2/cli/app/subcommands/license.py b/weather_dl_v2/cli/app/subcommands/license.py new file mode 100644 index 00000000..68dccd1d --- /dev/null +++ b/weather_dl_v2/cli/app/subcommands/license.py @@ -0,0 +1,104 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import typer +from typing_extensions import Annotated +from app.services.license_service import license_service +from app.utils import Validator, as_table + +app = typer.Typer() + + +class LicenseValidator(Validator): + pass + + +@app.command("list", help="List all licenses.") +def get_all_license( + filter: Annotated[ + str, typer.Option(help="Filter by some value. 
Format: filter_key=filter_value") + ] = None +): + if filter: + validator = LicenseValidator(valid_keys=["client_name"]) + + try: + data = validator.validate(filters=[filter]) + client_name = data["client_name"] + except Exception as e: + print(f"filter error: {e}") + return + + print(as_table(license_service._get_all_license_by_client_name(client_name))) + return + + print(as_table(license_service._get_all_license())) + + +@app.command("get", help="Get a particular license by ID.") +def get_license(license: Annotated[str, typer.Argument(help="License ID.")]): + print(as_table(license_service._get_license_by_license_id(license))) + + +@app.command("add", help="Add new license.") +def add_license( + file_path: Annotated[ + str, + typer.Argument( + help="""Input json file. Example json for new license-""" + """{"license_id" : , "client_name" : , "number_of_requests" : , "secret_id" : }""" + """\nNOTE: license_id is case insensitive and has to be unique for each license.""" + ), + ], +): + validator = LicenseValidator( + valid_keys=["license_id", "client_name", "number_of_requests", "secret_id"] + ) + + try: + license_dict = validator.validate_json(file_path=file_path) + except Exception as e: + print(f"payload error: {e}") + return + + print(license_service._add_license(license_dict)) + + +@app.command("remove", help="Remove a license.") +def remove_license(license: Annotated[str, typer.Argument(help="License ID.")]): + print(license_service._remove_license(license)) + + +@app.command("update", help="Update existing license.") +def update_license( + license: Annotated[str, typer.Argument(help="License ID.")], + file_path: Annotated[ + str, + typer.Argument( + help="""Input json file. Example json for updated license- """ + """{"client_id": , "client_name" : , "number_of_requests" : , "secret_id" : }""" + ), + ], # noqa +): + validator = LicenseValidator( + valid_keys=["client_id", "client_name", "number_of_requests", "secret_id"] + ) + try: + license_dict = validator.validate_json(file_path=file_path) + except Exception as e: + print(f"payload error: {e}") + return + + print(license_service._update_license(license, license_dict)) diff --git a/weather_dl_v2/cli/app/subcommands/queue.py b/weather_dl_v2/cli/app/subcommands/queue.py new file mode 100644 index 00000000..816564ca --- /dev/null +++ b/weather_dl_v2/cli/app/subcommands/queue.py @@ -0,0 +1,111 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import typer +from typing_extensions import Annotated +from app.services.queue_service import queue_service +from app.utils import Validator, as_table + +app = typer.Typer() + + +class QueueValidator(Validator): + pass + + +@app.command("list", help="List all the license queues.") +def get_all_license_queue( + filter: Annotated[ + str, typer.Option(help="Filter by some value. 
Format: filter_key=filter_value")
+    ] = None
+):
+    if filter:
+        validator = QueueValidator(valid_keys=["client_name"])
+
+        try:
+            data = validator.validate(filters=[filter])
+            client_name = data["client_name"]
+        except Exception as e:
+            print(f"filter error: {e}")
+            return
+
+        print(as_table(queue_service._get_license_queue_by_client_name(client_name)))
+        return
+
+    print(as_table(queue_service._get_all_license_queues()))
+
+
+@app.command("get", help="Get queue of particular license.")
+def get_license_queue(license: Annotated[str, typer.Argument(help="License ID")]):
+    print(as_table(queue_service._get_queue_by_license(license)))
+
+
+@app.command(
+    "edit",
+    help="Edit existing license queue. Queue can be edited via a priority "
+    "file or by moving a single config to a given priority.",
+)  # noqa
+def modify_license_queue(
+    license: Annotated[str, typer.Argument(help="License ID.")],
+    file: Annotated[
+        str,
+        typer.Option(
+            "--file",
+            "-f",
+            help="""File path of priority json file. Example json: {"priority": ["c1.cfg", "c2.cfg",...]}""",
+        ),
+    ] = None,  # noqa
+    config: Annotated[
+        str, typer.Option("--config", "-c", help="Config name for absolute priority.")
+    ] = None,
+    priority: Annotated[
+        int,
+        typer.Option(
+            "--priority",
+            "-p",
+            help="Absolute priority for the config in a license queue. "
+            "Priority increases in ascending order with 0 having highest priority.",
+        ),
+    ] = None,  # noqa
+):
+    if file is None and (config is None and priority is None):
+        print("Priority file or config name with absolute priority must be passed.")
+        return
+
+    if file is not None and (config is not None or priority is not None):
+        print("--config & --priority can't be used along with --file argument.")
+        return
+
+    if file is not None:
+        validator = QueueValidator(valid_keys=["priority"])
+
+        try:
+            data = validator.validate_json(file_path=file)
+            priority_list = data["priority"]
+        except Exception as e:
+            print(f"key error: {e}")
+            return
+        print(queue_service._edit_license_queue(license, priority_list))
+        return
+    elif config is not None and priority is not None:
+        if priority < 0:
+            print("Priority cannot be negative.")
+            return
+
+        print(queue_service._edit_config_absolute_priority(license, config, priority))
+        return
+    else:
+        print("--config & --priority arguments should be used together.")
+        return
diff --git a/weather_dl_v2/cli/app/utils.py b/weather_dl_v2/cli/app/utils.py
new file mode 100644
index 00000000..1ced5c7b
--- /dev/null
+++ b/weather_dl_v2/cli/app/utils.py
@@ -0,0 +1,168 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ + +import abc +import logging +import dataclasses +import typing as t +import json +from time import time +from itertools import cycle +from shutil import get_terminal_size +from threading import Thread +from time import sleep +from tabulate import tabulate + +logger = logging.getLogger(__name__) + + +def timeit(func): + def wrap_func(*args, **kwargs): + t1 = time() + result = func(*args, **kwargs) + t2 = time() + print(f"[executed in {(t2-t1):.4f}s.]") + return result + + return wrap_func + + +# TODO: Add a flag (may be -j/--json) to support raw response. +def as_table(response: str): + data = json.loads(response) + + if not isinstance(data, list): + # convert response to list if not a list. + data = [data] + + if len(data) == 0: + return "" + + header = data[0].keys() + # if any column has lists, convert that to a string. + rows = [ + [ + ",\n".join([f"{i} {ele}" for i, ele in enumerate(val)]) + if isinstance(val, list) + else val + for val in x.values() + ] + for x in data + ] + rows.insert(0, list(header)) + return tabulate( + rows, showindex=True, tablefmt="grid", maxcolwidths=[16] * len(header) + ) + + +class Loader: + + def __init__(self, desc="Loading...", end="", timeout=0.1): + """ + A loader-like context manager + + Args: + desc (str, optional): The loader's description. Defaults to "Loading...". + end (str, optional): Final print. Defaults to "Done!". + timeout (float, optional): Sleep time between prints. Defaults to 0.1. + """ + self.desc = desc + self.end = end + self.timeout = timeout + + self._thread = Thread(target=self._animate, daemon=True) + self.steps = ["⢿", "⣻", "⣽", "⣾", "⣷", "⣯", "⣟", "⡿"] + self.done = False + + def start(self): + self._thread.start() + return self + + def _animate(self): + for c in cycle(self.steps): + if self.done: + break + print(f"\r{self.desc} {c}", flush=True, end="") + sleep(self.timeout) + + def __enter__(self): + self.start() + + def stop(self): + self.done = True + cols = get_terminal_size((80, 20)).columns + print("\r" + " " * cols, end="", flush=True) + + def __exit__(self, exc_type, exc_value, tb): + # handle exceptions with those variables ^ + self.stop() + + +@dataclasses.dataclass +class Validator(abc.ABC): + valid_keys: t.List[str] + + def validate( + self, filters: t.List[str], show_valid_filters=True, allow_missing: bool = False + ): + filter_dict = {} + + for filter in filters: + _filter = filter.split("=") + + if len(_filter) != 2: + if show_valid_filters: + logger.info(f"valid filters are: {self.valid_keys}.") + raise ValueError("Incorrect Filter. 
Please Try again.") + + key, value = _filter + filter_dict[key] = value + + data_set = set(filter_dict.keys()) + valid_set = set(self.valid_keys) + + if self._validate_keys(data_set, valid_set, allow_missing): + return filter_dict + + def validate_json(self, file_path, allow_missing: bool = False): + try: + with open(file_path) as f: + data: dict = json.load(f) + data_keys = data.keys() + + data_set = set(data_keys) + valid_set = set(self.valid_keys) + + if self._validate_keys(data_set, valid_set, allow_missing): + return data + + except FileNotFoundError: + logger.info("file not found.") + raise FileNotFoundError + + def _validate_keys(self, data_set: set, valid_set: set, allow_missing: bool): + missing_keys = valid_set.difference(data_set) + invalid_keys = data_set.difference(valid_set) + + if not allow_missing and len(missing_keys) > 0: + raise ValueError(f"keys {missing_keys} are missing in file.") + + if len(invalid_keys) > 0: + raise ValueError(f"keys {invalid_keys} are invalid keys.") + + if allow_missing or data_set == valid_set: + return True + + return False diff --git a/weather_dl_v2/cli/cli_config.json b/weather_dl_v2/cli/cli_config.json new file mode 100644 index 00000000..076ed641 --- /dev/null +++ b/weather_dl_v2/cli/cli_config.json @@ -0,0 +1,4 @@ +{ + "pod_ip": "", + "port": 8080 +} \ No newline at end of file diff --git a/weather_dl_v2/cli/environment.yml b/weather_dl_v2/cli/environment.yml new file mode 100644 index 00000000..f2ffec62 --- /dev/null +++ b/weather_dl_v2/cli/environment.yml @@ -0,0 +1,14 @@ +name: weather-dl-v2-cli +channels: + - conda-forge +dependencies: + - python=3.10 + - pip=23.0.1 + - typer=0.9.0 + - tabulate=0.9.0 + - pip: + - requests + - ruff + - pytype + - pytest + - . diff --git a/weather_dl_v2/cli/setup.py b/weather_dl_v2/cli/setup.py new file mode 100644 index 00000000..509f42fc --- /dev/null +++ b/weather_dl_v2/cli/setup.py @@ -0,0 +1,30 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from setuptools import setup + +requirements = ["typer", "requests", "tabulate"] + +setup( + name="weather-dl-v2", + packages=["app", "app.subcommands", "app.services"], + install_requires=requirements, + version="0.0.1", + author="aniket", + description=( + "This cli tools helps in interacting with weather dl v2 fast API server." + ), + entry_points={"console_scripts": ["weather-dl-v2=app.main:app"]}, +) diff --git a/weather_dl_v2/cli/vm-startup.sh b/weather_dl_v2/cli/vm-startup.sh new file mode 100644 index 00000000..e36f6edc --- /dev/null +++ b/weather_dl_v2/cli/vm-startup.sh @@ -0,0 +1,4 @@ +#! 
/bin/bash + +command="docker exec -it \\\$(docker ps -qf name=weather-dl-v2-cli) /bin/bash" +sudo sh -c "echo \"$command\" >> /etc/profile" \ No newline at end of file diff --git a/weather_dl_v2/config.json b/weather_dl_v2/config.json new file mode 100644 index 00000000..f5afae8b --- /dev/null +++ b/weather_dl_v2/config.json @@ -0,0 +1,11 @@ +{ + "download_collection": "download", + "queues_collection": "queues", + "license_collection": "license", + "manifest_collection": "manifest", + "storage_bucket": "XXXXXXX", + "gcs_project": "XXXXXXX", + "license_deployment_image": "XXXXXXX", + "downloader_k8_image": "XXXXXXX", + "welcome_message": "Greetings from weather-dl v2!" +} \ No newline at end of file diff --git a/weather_dl_v2/downloader_kubernetes/Dockerfile b/weather_dl_v2/downloader_kubernetes/Dockerfile new file mode 100644 index 00000000..74084030 --- /dev/null +++ b/weather_dl_v2/downloader_kubernetes/Dockerfile @@ -0,0 +1,46 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +FROM continuumio/miniconda3:latest + +# Update miniconda +RUN conda update conda -y + +# Add the mamba solver for faster builds +RUN conda install -n base conda-libmamba-solver +RUN conda config --set solver libmamba + +# Create conda env using environment.yml +COPY . . 
+RUN conda env create -f environment.yml --debug
+
+# Activate the conda env and update the PATH
+ARG CONDA_ENV_NAME=weather-dl-v2-downloader
+RUN echo "source activate ${CONDA_ENV_NAME}" >> ~/.bashrc
+ENV PATH /opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH
diff --git a/weather_dl_v2/downloader_kubernetes/README.md b/weather_dl_v2/downloader_kubernetes/README.md
new file mode 100644
index 00000000..b0d865f8
--- /dev/null
+++ b/weather_dl_v2/downloader_kubernetes/README.md
@@ -0,0 +1,23 @@
+# Deployment / Usage Instructions
+
+### User authorization required to set up the environment:
+* roles/container.admin
+
+### Authorization needed for the tool to operate:
+We are not configuring any service account here, so make sure that the Compute Engine default service account has the following roles:
+* roles/storage.admin
+* roles/bigquery.dataEditor
+* roles/bigquery.jobUser
+
+### Make changes in weather_dl_v2/config.json, if required [for running locally]
+```
+export CONFIG_PATH=/path/to/weather_dl_v2/config.json
+```
+
+### Create docker image for downloader:
+```
+export PROJECT_ID=
+export REPO= eg:weather-tools
+
+gcloud builds submit . --tag "gcr.io/$PROJECT_ID/$REPO:weather-dl-v2-downloader" --timeout=79200 --machine-type=e2-highcpu-32
+```
diff --git a/weather_dl_v2/downloader_kubernetes/downloader.py b/weather_dl_v2/downloader_kubernetes/downloader.py
new file mode 100644
index 00000000..c8a5c7dc
--- /dev/null
+++ b/weather_dl_v2/downloader_kubernetes/downloader.py
@@ -0,0 +1,73 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This program downloads ECMWF data & uploads it into GCS.
+"""
+import tempfile
+import os
+import sys
+from manifest import FirestoreManifest, Stage
+from util import copy, download_with_aria2
+import datetime
+
+
+def download(url: str, path: str) -> None:
+    """Download data from client, with retries."""
+    if path:
+        if os.path.exists(path):
+            # Empty the target file, if it already exists, otherwise the
+            # transfer below might be fooled into thinking we're resuming
+            # an interrupted download.
+ open(path, "w").close() + download_with_aria2(url, path) + + +def main( + config_name, dataset, selection, user_id, url, target_path, license_id +) -> None: + """Download data from a client to a temp file.""" + + manifest = FirestoreManifest(license_id=license_id) + temp_name = "" + with manifest.transact(config_name, dataset, selection, target_path, user_id): + with tempfile.NamedTemporaryFile(delete=False) as temp: + temp_name = temp.name + manifest.set_stage(Stage.DOWNLOAD) + precise_download_start_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + manifest.prev_stage_precise_start_time = precise_download_start_time + print(f"Downloading data for {target_path!r}.") + download(url, temp_name) + print(f"Download completed for {target_path!r}.") + + manifest.set_stage(Stage.UPLOAD) + precise_upload_start_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + manifest.prev_stage_precise_start_time = precise_upload_start_time + print(f"Uploading to store for {target_path!r}.") + copy(temp_name, target_path) + print(f"Upload to store complete for {target_path!r}.") + os.unlink(temp_name) + + +if __name__ == "__main__": + main(*sys.argv[1:]) diff --git a/weather_dl_v2/downloader_kubernetes/downloader_config.py b/weather_dl_v2/downloader_kubernetes/downloader_config.py new file mode 100644 index 00000000..247ae664 --- /dev/null +++ b/weather_dl_v2/downloader_kubernetes/downloader_config.py @@ -0,0 +1,65 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import dataclasses +import typing as t +import json +import os +import logging + +logger = logging.getLogger(__name__) + +Values = t.Union[t.List["Values"], t.Dict[str, "Values"], bool, int, float, str] # pytype: disable=not-supported-yet + + +@dataclasses.dataclass +class DownloaderConfig: + manifest_collection: str = "" + kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict) + + @classmethod + def from_dict(cls, config: t.Dict): + config_instance = cls() + + for key, value in config.items(): + if hasattr(config_instance, key): + setattr(config_instance, key, value) + else: + config_instance.kwargs[key] = value + + return config_instance + + +downloader_config = None + + +def get_config(): + global downloader_config + if downloader_config: + return downloader_config + + downloader_config_json = "config/config.json" + if not os.path.exists(downloader_config_json): + downloader_config_json = os.environ.get("CONFIG_PATH", None) + + if downloader_config_json is None: + logger.error("Couldn't load config file for downloader.") + raise FileNotFoundError("Couldn't load config file for downloader.") + + with open(downloader_config_json) as file: + config_dict = json.load(file) + downloader_config = DownloaderConfig.from_dict(config_dict) + + return downloader_config diff --git a/weather_dl_v2/downloader_kubernetes/environment.yml b/weather_dl_v2/downloader_kubernetes/environment.yml new file mode 100644 index 00000000..79e75565 --- /dev/null +++ b/weather_dl_v2/downloader_kubernetes/environment.yml @@ -0,0 +1,17 @@ +name: weather-dl-v2-downloader +channels: + - conda-forge +dependencies: + - python=3.10 + - google-cloud-sdk=410.0.0 + - aria2=1.36.0 + - geojson=2.5.0=py_0 + - xarray=2022.11.0 + - google-apitools + - pip=22.3 + - pip: + - apache_beam[gcp]==2.40.0 + - firebase-admin + - google-cloud-pubsub + - kubernetes + - psutil diff --git a/weather_dl_v2/downloader_kubernetes/manifest.py b/weather_dl_v2/downloader_kubernetes/manifest.py new file mode 100644 index 00000000..0bc82264 --- /dev/null +++ b/weather_dl_v2/downloader_kubernetes/manifest.py @@ -0,0 +1,503 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +"""Client interface for connecting to a manifest.""" + +import abc +import dataclasses +import datetime +import enum +import json +import pandas as pd +import time +import traceback +import typing as t + +from util import ( + to_json_serializable_type, + fetch_geo_polygon, + get_file_size, + get_wait_interval, + generate_md5_hash, + GLOBAL_COVERAGE_AREA, +) + +import firebase_admin +from firebase_admin import credentials +from firebase_admin import firestore +from google.cloud.firestore_v1 import DocumentReference +from google.cloud.firestore_v1.types import WriteResult +from downloader_config import get_config + +"""An implementation-dependent Manifest URI.""" +Location = t.NewType("Location", str) + + +class ManifestException(Exception): + """Errors that occur in Manifest Clients.""" + + pass + + +class Stage(enum.Enum): + """A request can be either in one of the following stages at a time: + + fetch : This represents request is currently in fetch stage i.e. request placed on the client's server + & waiting for some result before starting download (eg. MARS client). + download : This represents request is currently in download stage i.e. data is being downloading from client's + server to the worker's local file system. + upload : This represents request is currently in upload stage i.e. data is getting uploaded from worker's local + file system to target location (GCS path). + retrieve : In case of clients where there is no proper separation of fetch & download stages (eg. CDS client), + request will be in the retrieve stage i.e. fetch + download. + """ + + RETRIEVE = "retrieve" + FETCH = "fetch" + DOWNLOAD = "download" + UPLOAD = "upload" + + +class Status(enum.Enum): + """Depicts the request's state status: + + scheduled : A request partition is created & scheduled for processing. + Note: Its corresponding state can be None only. + in-progress : This represents the request state is currently in-progress (i.e. running). + The next status would be "success" or "failure". + success : This represents the request state execution completed successfully without any error. + failure : This represents the request state execution failed. 
+ """ + + SCHEDULED = "scheduled" + IN_PROGRESS = "in-progress" + SUCCESS = "success" + FAILURE = "failure" + + +@dataclasses.dataclass +class DownloadStatus: + """Data recorded in `Manifest`s reflecting the status of a download.""" + + """The name of the config file associated with the request.""" + config_name: str = "" + + """Represents the dataset field of the configuration.""" + dataset: t.Optional[str] = "" + + """Copy of selection section of the configuration.""" + selection: t.Dict = dataclasses.field(default_factory=dict) + + """Location of the downloaded data.""" + location: str = "" + + """Represents area covered by the shard.""" + area: str = "" + + """Current stage of request : 'fetch', 'download', 'retrieve', 'upload' or None.""" + stage: t.Optional[Stage] = None + + """Download status: 'scheduled', 'in-progress', 'success', or 'failure'.""" + status: t.Optional[Status] = None + + """Cause of error, if any.""" + error: t.Optional[str] = "" + + """Identifier for the user running the download.""" + username: str = "" + + """Shard size in GB.""" + size: t.Optional[float] = 0 + + """A UTC datetime when download was scheduled.""" + scheduled_time: t.Optional[str] = "" + + """A UTC datetime when the retrieve stage starts.""" + retrieve_start_time: t.Optional[str] = "" + + """A UTC datetime when the retrieve state ends.""" + retrieve_end_time: t.Optional[str] = "" + + """A UTC datetime when the fetch state starts.""" + fetch_start_time: t.Optional[str] = "" + + """A UTC datetime when the fetch state ends.""" + fetch_end_time: t.Optional[str] = "" + + """A UTC datetime when the download state starts.""" + download_start_time: t.Optional[str] = "" + + """A UTC datetime when the download state ends.""" + download_end_time: t.Optional[str] = "" + + """A UTC datetime when the upload state starts.""" + upload_start_time: t.Optional[str] = "" + + """A UTC datetime when the upload state ends.""" + upload_end_time: t.Optional[str] = "" + + @classmethod + def from_dict(cls, download_status: t.Dict) -> "DownloadStatus": + """Instantiate DownloadStatus dataclass from dict.""" + download_status_instance = cls() + for key, value in download_status.items(): + if key == "status": + setattr(download_status_instance, key, Status(value)) + elif key == "stage" and value is not None: + setattr(download_status_instance, key, Stage(value)) + else: + setattr(download_status_instance, key, value) + return download_status_instance + + @classmethod + def to_dict(cls, instance) -> t.Dict: + """Return the fields of a dataclass instance as a manifest ingestible + dictionary mapping of field names to field values.""" + download_status_dict = {} + for field in dataclasses.fields(instance): + key = field.name + value = getattr(instance, field.name) + if isinstance(value, Status) or isinstance(value, Stage): + download_status_dict[key] = value.value + elif isinstance(value, pd.Timestamp): + download_status_dict[key] = value.isoformat() + elif key == "selection" and value is not None: + download_status_dict[key] = json.dumps(value) + else: + download_status_dict[key] = value + return download_status_dict + + +@dataclasses.dataclass +class Manifest(abc.ABC): + """Abstract manifest of download statuses. + + Update download statuses to some storage medium. + + This class lets one indicate that a download is `scheduled` or in a transaction process. + In the event of a transaction, a download will be updated with an `in-progress`, `success` + or `failure` status (with accompanying metadata). 
+ + Example: + ``` + my_manifest = parse_manifest_location(Location('fs://some-firestore-collection')) + + # Schedule data for download + my_manifest.schedule({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username') + + # ... + + # Initiate a transaction – it will record that the download is `in-progess` + with my_manifest.transact({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username') as tx: + # download logic here + pass + + # ... + + # on error, will record the download as a `failure` before propagating the error. By default, it will + # record download as a `success`. + ``` + + Attributes: + status: The current `DownloadStatus` of the Manifest. + """ + + # To reduce the impact of _read() and _update() calls + # on the start time of the stage. + license_id: str = "" + prev_stage_precise_start_time: t.Optional[str] = None + status: t.Optional[DownloadStatus] = None + + # This is overridden in subclass. + def __post_init__(self): + """Initialize the manifest.""" + pass + + def schedule( + self, + config_name: str, + dataset: str, + selection: t.Dict, + location: str, + user: str, + ) -> None: + """Indicate that a job has been scheduled for download. + + 'scheduled' jobs occur before 'in-progress', 'success' or 'finished'. + """ + scheduled_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + self.status = DownloadStatus( + config_name=config_name, + dataset=dataset if dataset else None, + selection=selection, + location=location, + area=fetch_geo_polygon(selection.get("area", GLOBAL_COVERAGE_AREA)), + username=user, + stage=None, + status=Status.SCHEDULED, + error=None, + size=None, + scheduled_time=scheduled_time, + retrieve_start_time=None, + retrieve_end_time=None, + fetch_start_time=None, + fetch_end_time=None, + download_start_time=None, + download_end_time=None, + upload_start_time=None, + upload_end_time=None, + ) + self._update(self.status) + + def skip( + self, + config_name: str, + dataset: str, + selection: t.Dict, + location: str, + user: str, + ) -> None: + """Updates the manifest to mark the shards that were skipped in the current job + as 'upload' stage and 'success' status, indicating that they have already been downloaded. + """ + old_status = self._read(location) + # The manifest needs to be updated for a skipped shard if its entry is not present, or + # if the stage is not 'upload', or if the stage is 'upload' but the status is not 'success'. + if ( + old_status.location != location + or old_status.stage != Stage.UPLOAD + or old_status.status != Status.SUCCESS + ): + current_utc_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + + size = get_file_size(location) + + status = DownloadStatus( + config_name=config_name, + dataset=dataset if dataset else None, + selection=selection, + location=location, + area=fetch_geo_polygon(selection.get("area", GLOBAL_COVERAGE_AREA)), + username=user, + stage=Stage.UPLOAD, + status=Status.SUCCESS, + error=None, + size=size, + scheduled_time=None, + retrieve_start_time=None, + retrieve_end_time=None, + fetch_start_time=None, + fetch_end_time=None, + download_start_time=None, + download_end_time=None, + upload_start_time=current_utc_time, + upload_end_time=current_utc_time, + ) + self._update(status) + print( + f"Manifest updated for skipped shard: {location!r} -- {DownloadStatus.to_dict(status)!r}." 
+ ) + + def _set_for_transaction( + self, + config_name: str, + dataset: str, + selection: t.Dict, + location: str, + user: str, + ) -> None: + """Reset Manifest state in preparation for a new transaction.""" + self.status = dataclasses.replace(self._read(location)) + self.status.config_name = config_name + self.status.dataset = dataset if dataset else None + self.status.selection = selection + self.status.location = location + self.status.username = user + + def __enter__(self) -> None: + pass + + def __exit__(self, exc_type, exc_inst, exc_tb) -> None: + """Record end status of a transaction as either 'success' or 'failure'.""" + if exc_type is None: + status = Status.SUCCESS + error = None + else: + status = Status.FAILURE + # For explanation, see https://docs.python.org/3/library/traceback.html#traceback.format_exception + error = f"license_id: {self.license_id} " + error += "\n".join(traceback.format_exception(exc_type, exc_inst, exc_tb)) + + new_status = dataclasses.replace(self.status) + new_status.error = error + new_status.status = status + current_utc_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + + # This is necessary for setting the precise start time of the previous stage + # and end time of the final stage, as well as handling the case of Status.FAILURE. + if new_status.stage == Stage.FETCH: + new_status.fetch_start_time = self.prev_stage_precise_start_time + new_status.fetch_end_time = current_utc_time + elif new_status.stage == Stage.RETRIEVE: + new_status.retrieve_start_time = self.prev_stage_precise_start_time + new_status.retrieve_end_time = current_utc_time + elif new_status.stage == Stage.DOWNLOAD: + new_status.download_start_time = self.prev_stage_precise_start_time + new_status.download_end_time = current_utc_time + else: + new_status.upload_start_time = self.prev_stage_precise_start_time + new_status.upload_end_time = current_utc_time + + new_status.size = get_file_size(new_status.location) + + self.status = new_status + + self._update(self.status) + + def transact( + self, + config_name: str, + dataset: str, + selection: t.Dict, + location: str, + user: str, + ) -> "Manifest": + """Create a download transaction.""" + self._set_for_transaction(config_name, dataset, selection, location, user) + return self + + def set_stage(self, stage: Stage) -> None: + """Sets the current stage in manifest.""" + new_status = dataclasses.replace(self.status) + new_status.stage = stage + new_status.status = Status.IN_PROGRESS + current_utc_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + + if stage == Stage.DOWNLOAD: + new_status.download_start_time = current_utc_time + else: + new_status.download_start_time = self.prev_stage_precise_start_time + new_status.download_end_time = current_utc_time + new_status.upload_start_time = current_utc_time + + self.status = new_status + self._update(self.status) + + @abc.abstractmethod + def _read(self, location: str) -> DownloadStatus: + pass + + @abc.abstractmethod + def _update(self, download_status: DownloadStatus) -> None: + pass + + +class FirestoreManifest(Manifest): + """A Firestore Manifest. + This Manifest implementation stores DownloadStatuses in a Firebase document store. + The document hierarchy for the manifest is as follows: + [manifest ] + ├── doc_id (md5 hash of the path) { 'selection': {...}, 'location': ..., 'username': ... } + └── etc... 
+ Where `[]` indicates a collection and ` {...}` indicates a document. + """ + + def _get_db(self) -> firestore.firestore.Client: + """Acquire a firestore client, initializing the firebase app if necessary. + Will attempt to get the db client five times. If it's still unsuccessful, a + `ManifestException` will be raised. + """ + db = None + attempts = 0 + + while db is None: + try: + db = firestore.client() + except ValueError as e: + # The above call will fail with a value error when the firebase app is not initialized. + # Initialize the app here, and try again. + # Use the application default credentials. + cred = credentials.ApplicationDefault() + + firebase_admin.initialize_app(cred) + print("Initialized Firebase App.") + + if attempts > 4: + raise ManifestException( + "Exceeded number of retries to get firestore client." + ) from e + + time.sleep(get_wait_interval(attempts)) + + attempts += 1 + + return db + + def _read(self, location: str) -> DownloadStatus: + """Reads the JSON data from a manifest.""" + + doc_id = generate_md5_hash(location) + + # Update document with download status + download_doc_ref = self.root_document_for_store(doc_id) + + result = download_doc_ref.get() + row = {} + if result.exists: + records = result.to_dict() + row = {n: to_json_serializable_type(v) for n, v in records.items()} + return DownloadStatus.from_dict(row) + + def _update(self, download_status: DownloadStatus) -> None: + """Update or create a download status record.""" + print("Updating Firestore Manifest.") + + status = DownloadStatus.to_dict(download_status) + doc_id = generate_md5_hash(status["location"]) + + # Update document with download status + download_doc_ref = self.root_document_for_store(doc_id) + + result: WriteResult = download_doc_ref.set(status) + + print( + f"Firestore manifest updated. " + f"update_time={result.update_time}, " + f"filename={download_status.location}." + ) + + def root_document_for_store(self, store_scheme: str) -> DocumentReference: + """Get the root manifest document given the user's config and current document's storage location.""" + return ( + self._get_db() + .collection(get_config().manifest_collection) + .document(store_scheme) + ) diff --git a/weather_dl_v2/downloader_kubernetes/util.py b/weather_dl_v2/downloader_kubernetes/util.py new file mode 100644 index 00000000..5777234f --- /dev/null +++ b/weather_dl_v2/downloader_kubernetes/util.py @@ -0,0 +1,226 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import datetime +import geojson +import hashlib +import itertools +import os +import socket +import subprocess +import sys +import typing as t + +import numpy as np +import pandas as pd +from apache_beam.io.gcp import gcsio +from apache_beam.utils import retry +from xarray.core.utils import ensure_us_time_resolution +from urllib.parse import urlparse +from google.api_core.exceptions import BadRequest + + +LATITUDE_RANGE = (-90, 90) +LONGITUDE_RANGE = (-180, 180) +GLOBAL_COVERAGE_AREA = [90, -180, -90, 180] + + +def _retry_if_valid_input_but_server_or_socket_error_and_timeout_filter( + exception, +) -> bool: + if isinstance(exception, socket.timeout): + return True + if isinstance(exception, TimeoutError): + return True + # To handle the concurrency issue in BigQuery. + if isinstance(exception, BadRequest): + return True + return retry.retry_if_valid_input_but_server_error_and_timeout_filter(exception) + + +class _FakeClock: + + def sleep(self, value): + pass + + +def retry_with_exponential_backoff(fun): + """A retry decorator that doesn't apply during test time.""" + clock = retry.Clock() + + # Use a fake clock only during test time... + if "unittest" in sys.modules.keys(): + clock = _FakeClock() + + return retry.with_exponential_backoff( + retry_filter=_retry_if_valid_input_but_server_or_socket_error_and_timeout_filter, + clock=clock, + )(fun) + + +# TODO(#245): Group with common utilities (duplicated) +def ichunked(iterable: t.Iterable, n: int) -> t.Iterator[t.Iterable]: + """Yield evenly-sized chunks from an iterable.""" + input_ = iter(iterable) + try: + while True: + it = itertools.islice(input_, n) + # peek to check if 'it' has next item. + first = next(it) + yield itertools.chain([first], it) + except StopIteration: + pass + + +# TODO(#245): Group with common utilities (duplicated) +def copy(src: str, dst: str) -> None: + """Copy data via `gsutil cp`.""" + try: + subprocess.run(["gsutil", "cp", src, dst], check=True, capture_output=True) + except subprocess.CalledProcessError as e: + print( + f'Failed to copy file {src!r} to {dst!r} due to {e.stderr.decode("utf-8")}' + ) + raise + + +# TODO(#245): Group with common utilities (duplicated) +def to_json_serializable_type(value: t.Any) -> t.Any: + """Returns the value with a type serializable to JSON""" + # Note: The order of processing is significant. + print("Serializing to JSON") + + if pd.isna(value) or value is None: + return None + elif np.issubdtype(type(value), np.floating): + return float(value) + elif isinstance(value, np.ndarray): + # Will return a scaler if array is of size 1, else will return a list. + return value.tolist() + elif ( + isinstance(value, datetime.datetime) + or isinstance(value, str) + or isinstance(value, np.datetime64) + ): + # Assume strings are ISO format timestamps... + try: + value = datetime.datetime.fromisoformat(value) + except ValueError: + # ... if they are not, assume serialization is already correct. + return value + except TypeError: + # ... maybe value is a numpy datetime ... + try: + value = ensure_us_time_resolution(value).astype(datetime.datetime) + except AttributeError: + # ... value is a datetime object, continue. + pass + + # We use a string timestamp representation. + if value.tzname(): + return value.isoformat() + + # We assume here that naive timestamps are in UTC timezone. + return value.replace(tzinfo=datetime.timezone.utc).isoformat() + elif isinstance(value, np.timedelta64): + # Return time delta in seconds. 
+ return float(value / np.timedelta64(1, "s")) + # This check must happen after processing np.timedelta64 and np.datetime64. + elif np.issubdtype(type(value), np.integer): + return int(value) + + return value + + +def fetch_geo_polygon(area: t.Union[list, str]) -> str: + """Calculates a geography polygon from an input area.""" + # Ref: https://confluence.ecmwf.int/pages/viewpage.action?pageId=151520973 + if isinstance(area, str): + # European area + if area == "E": + area = [73.5, -27, 33, 45] + # Global area + elif area == "G": + area = GLOBAL_COVERAGE_AREA + else: + raise RuntimeError(f"Not a valid value for area in config: {area}.") + + n, w, s, e = [float(x) for x in area] + if s < LATITUDE_RANGE[0]: + raise ValueError(f"Invalid latitude value for south: '{s}'") + if n > LATITUDE_RANGE[1]: + raise ValueError(f"Invalid latitude value for north: '{n}'") + if w < LONGITUDE_RANGE[0]: + raise ValueError(f"Invalid longitude value for west: '{w}'") + if e > LONGITUDE_RANGE[1]: + raise ValueError(f"Invalid longitude value for east: '{e}'") + + # Define the coordinates of the bounding box. + coords = [[w, n], [w, s], [e, s], [e, n], [w, n]] + + # Create the GeoJSON polygon object. + polygon = geojson.dumps(geojson.Polygon([coords])) + return polygon + + +def get_file_size(path: str) -> float: + parsed_gcs_path = urlparse(path) + if parsed_gcs_path.scheme != "gs" or parsed_gcs_path.netloc == "": + return os.stat(path).st_size / (1024**3) if os.path.exists(path) else 0 + else: + return ( + gcsio.GcsIO().size(path) / (1024**3) if gcsio.GcsIO().exists(path) else 0 + ) + + +def get_wait_interval(num_retries: int = 0) -> float: + """Returns next wait interval in seconds, using an exponential backoff algorithm.""" + if 0 == num_retries: + return 0 + return 2**num_retries + + +def generate_md5_hash(input: str) -> str: + """Generates md5 hash for the input string.""" + return hashlib.md5(input.encode("utf-8")).hexdigest() + + +def download_with_aria2(url: str, path: str) -> None: + """Downloads a file from the given URL using the `aria2c` command-line utility, + with options set to improve download speed and reliability.""" + dir_path, file_name = os.path.split(path) + try: + subprocess.run( + [ + "aria2c", + "-x", + "16", + "-s", + "16", + url, + "-d", + dir_path, + "-o", + file_name, + "--allow-overwrite", + ], + check=True, + capture_output=True, + ) + except subprocess.CalledProcessError as e: + print( + f'Failed download from server {url!r} to {path!r} due to {e.stderr.decode("utf-8")}' + ) + raise diff --git a/weather_dl_v2/fastapi-server/API-Interactions.md b/weather_dl_v2/fastapi-server/API-Interactions.md new file mode 100644 index 00000000..3ea4eece --- /dev/null +++ b/weather_dl_v2/fastapi-server/API-Interactions.md @@ -0,0 +1,25 @@ +# API Interactions +| Command | Type | Endpoint | +|---|---|---| +| `weather-dl-v2 ping` | `get` | `/` +| Download | | | +| `weather-dl-v2 download add –l [--force-download]` | `post` | `/download?force_download={value}` | +| `weather-dl-v2 download list` | `get` | `/download/` | +| `weather-dl-v2 download list --filter client_name=` | `get` | `/download?client_name={name}` | +| `weather-dl-v2 download get ` | `get` | `/download/{config_name}` | +| `weather-dl-v2 download show ` | `get` | `/download/show/{config_name}` | +| `weather-dl-v2 download remove ` | `delete` | `/download/{config_name}` | +| `weather-dl-v2 download refetch -l ` | `post` | `/download/refetch/{config_name}` | +| License | | | +| `weather-dl-v2 license add ` | `post` | `/license/` | +| 
`weather-dl-v2 license get ` | `get` | `/license/{license_id}` | +| `weather-dl-v2 license remove ` | `delete` | `/license/{license_id}` | +| `weather-dl-v2 license list` | `get` | `/license/` | +| `weather-dl-v2 license list --filter client_name=` | `get` | `/license?client_name={name}` | +| `weather-dl-v2 license edit ` | `put` | `/license/{license_id}` | +| Queue | | | +| `weather-dl-v2 queue list` | `get` | `/queues/` | +| `weather-dl-v2 queue list --filter client_name=` | `get` | `/queues?client_name={name}` | +| `weather-dl-v2 queue get ` | `get` | `/queues/{license_id}` | +| `queue edit --config --priority ` | `post` | `/queues/{license_id}` | +| `queue edit --file ` | `put` | `/queues/priority/{license_id}` | \ No newline at end of file diff --git a/weather_dl_v2/fastapi-server/Dockerfile b/weather_dl_v2/fastapi-server/Dockerfile new file mode 100644 index 00000000..b54e41c0 --- /dev/null +++ b/weather_dl_v2/fastapi-server/Dockerfile @@ -0,0 +1,40 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +FROM continuumio/miniconda3:latest + +EXPOSE 8080 + +# Update miniconda +RUN conda update conda -y + +# Add the mamba solver for faster builds +RUN conda install -n base conda-libmamba-solver +RUN conda config --set solver libmamba + +COPY . . 
+# Create conda env using environment.yml +RUN conda env create -f environment.yml --debug + +# Activate the conda env and update the PATH +ARG CONDA_ENV_NAME=weather-dl-v2-server +RUN echo "source activate ${CONDA_ENV_NAME}" >> ~/.bashrc +ENV PATH /opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH + +# Use the ping endpoint as a healthcheck, +# so Docker knows if the API is still running ok or needs to be restarted +HEALTHCHECK --interval=21s --timeout=3s --start-period=10s CMD curl --fail http://localhost:8080/ping || exit 1 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/weather_dl_v2/fastapi-server/README.md b/weather_dl_v2/fastapi-server/README.md new file mode 100644 index 00000000..2debb563 --- /dev/null +++ b/weather_dl_v2/fastapi-server/README.md @@ -0,0 +1,91 @@ +# Deployment Instructions & General Notes + +### User authorization required to set up the environment: +* roles/container.admin + +### Authorization needed for the tool to operate: +We are not configuring any service account here hence make sure that compute engine default service account have roles: +* roles/pubsub.subscriber +* roles/storage.admin +* roles/bigquery.dataEditor +* roles/bigquery.jobUser + +### Install kubectl: +``` +apt-get update + +apt-get install -y kubectl +``` + +### Create cluster: +``` +export PROJECT_ID= +export REGION= eg: us-west1 +export ZONE= eg: us-west1-a +export CLUSTER_NAME= eg: weather-dl-v2-cluster +export DOWNLOAD_NODE_POOL=downloader-pool + +gcloud beta container --project $PROJECT_ID clusters create $CLUSTER_NAME --zone $ZONE --no-enable-basic-auth --cluster-version "1.27.2-gke.1200" --release-channel "regular" --machine-type "e2-standard-8" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "1100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node "16" --num-nodes "4" --logging=SYSTEM,WORKLOAD --monitoring=SYSTEM --enable-ip-alias --network "projects/$PROJECT_ID/global/networks/default" --subnetwork "projects/$PROJECT_ID/regions/$REGION/subnetworks/default" --no-enable-intra-node-visibility --default-max-pods-per-node "16" --enable-autoscaling --min-nodes "4" --max-nodes "100" --location-policy "BALANCED" --no-enable-master-authorized-networks --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --enable-managed-prometheus --enable-shielded-nodes --node-locations $ZONE --node-labels preemptible=false && gcloud beta container --project $PROJECT_ID node-pools create $DOWNLOAD_NODE_POOL --cluster $CLUSTER_NAME --zone $ZONE --machine-type "e2-standard-8" --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "1100" --metadata disable-legacy-endpoints=true --scopes "https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node "16" --num-nodes "1" --enable-autoscaling --min-nodes "1" --max-nodes "100" --location-policy "BALANCED" --enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --node-locations $ZONE --node-labels preemptible=false +``` + +### Connect to Cluster: +``` +gcloud container clusters get-credentials $CLUSTER_NAME --zone $ZONE --project $PROJECT_ID +``` + +### How to create environment: +``` +conda env create --name weather-dl-v2-server --file=environment.yml + +conda activate weather-dl-v2-server +``` + +### Make changes in weather_dl_v2/config.json, if required [for running locally] +``` 
+export CONFIG_PATH=/path/to/weather_dl_v2/config.json +``` + +### To run fastapi server: +``` +uvicorn main:app --reload +``` + +* Open your browser at http://127.0.0.1:8000. + + +### Create docker image for server: +``` +export PROJECT_ID= +export REPO= eg:weather-tools + +gcloud builds submit . --tag "gcr.io/$PROJECT_ID/$REPO:weather-dl-v2-server" --timeout=79200 --machine-type=e2-highcpu-32 +``` + +### Add path of created server image in server.yaml: +``` +Please write down the fastAPI server's docker image path at Line 42 of server.yaml. +``` + +### Create ConfigMap of common configurations for services: +Make necessary changes to weather_dl_v2/config.json and run following command. +ConfigMap is used for: +- Having a common configuration file for all services. +- Decoupling docker image and config files. +``` +kubectl create configmap dl-v2-config --from-file=/path/to/weather_dl_v2/config.json +``` + +### Deploy fastapi server on kubernetes: +``` +kubectl apply -f server.yaml --force +``` + +## General Commands +### For viewing the current pods: +``` +kubectl get pods +``` + +### For deleting existing deployment: +``` +kubectl delete -f server.yaml --force \ No newline at end of file diff --git a/weather_dl_v2/fastapi-server/__init__.py b/weather_dl_v2/fastapi-server/__init__.py new file mode 100644 index 00000000..5678014c --- /dev/null +++ b/weather_dl_v2/fastapi-server/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/weather_dl_v2/fastapi-server/config_processing/config.py b/weather_dl_v2/fastapi-server/config_processing/config.py new file mode 100644 index 00000000..fe2199b8 --- /dev/null +++ b/weather_dl_v2/fastapi-server/config_processing/config.py @@ -0,0 +1,120 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import calendar +import copy +import dataclasses +import typing as t + +Values = t.Union[t.List["Values"], t.Dict[str, "Values"], bool, int, float, str] # pytype: disable=not-supported-yet + + +@dataclasses.dataclass +class Config: + """Contains pipeline parameters. + + Attributes: + config_name: + Name of the config file. + client: + Name of the Weather-API-client. Supported clients are mentioned in the 'CLIENTS' variable. + dataset (optional): + Name of the target dataset. Allowed options are dictated by the client. + partition_keys (optional): + Choose the keys from the selection section to partition the data request. 
+ This will compute a cartesian cross product of the selected keys + and assign each as their own download. + target_path: + Download artifact filename template. Can make use of Python's standard string formatting. + It can contain format symbols to be replaced by partition keys; + if this is used, the total number of format symbols must match the number of partition keys. + subsection_name: + Name of the particular subsection. 'default' if there is no subsection. + force_download: + Force redownload of partitions that were previously downloaded. + user_id: + Username from the environment variables. + kwargs (optional): + For representing subsections or any other parameters. + selection: + Contains parameters used to select desired data. + """ + + config_name: str = "" + client: str = "" + dataset: t.Optional[str] = "" + target_path: str = "" + partition_keys: t.Optional[t.List[str]] = dataclasses.field(default_factory=list) + subsection_name: str = "default" + force_download: bool = False + user_id: str = "unknown" + kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict) + selection: t.Dict[str, Values] = dataclasses.field(default_factory=dict) + + @classmethod + def from_dict(cls, config: t.Dict) -> "Config": + config_instance = cls() + for section_key, section_value in config.items(): + if section_key == "parameters": + for key, value in section_value.items(): + if hasattr(config_instance, key): + setattr(config_instance, key, value) + else: + config_instance.kwargs[key] = value + if section_key == "selection": + config_instance.selection = section_value + return config_instance + + +def optimize_selection_partition(selection: t.Dict) -> t.Dict: + """Compute right-hand-side values for the selection section of a single partition. + + Used to support custom syntax and optimizations, such as 'all'. + """ + selection_ = copy.deepcopy(selection) + + if "day" in selection_.keys() and selection_["day"] == "all": + year, month = selection_["year"], selection_["month"] + + multiples_error = ( + "Cannot use keyword 'all' on selections with multiple '{type}'s." + ) + + if isinstance(year, list): + assert len(year) == 1, multiples_error.format(type="year") + year = year[0] + + if isinstance(month, list): + assert len(month) == 1, multiples_error.format(type="month") + month = month[0] + + if isinstance(year, str): + assert "/" not in year, multiples_error.format(type="year") + + if isinstance(month, str): + assert "/" not in month, multiples_error.format(type="month") + + year, month = int(year), int(month) + + _, n_days_in_month = calendar.monthrange(year, month) + + selection_[ + "date" + ] = f"{year:04d}-{month:02d}-01/to/{year:04d}-{month:02d}-{n_days_in_month:02d}" + del selection_["day"] + del selection_["month"] + del selection_["year"] + + return selection_ diff --git a/weather_dl_v2/fastapi-server/config_processing/manifest.py b/weather_dl_v2/fastapi-server/config_processing/manifest.py new file mode 100644 index 00000000..35a8bf7b --- /dev/null +++ b/weather_dl_v2/fastapi-server/config_processing/manifest.py @@ -0,0 +1,513 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Client interface for connecting to a manifest.""" + +import abc +import dataclasses +import logging +import datetime +import enum +import json +import pandas as pd +import time +import traceback +import typing as t + +from .util import ( + to_json_serializable_type, + fetch_geo_polygon, + get_file_size, + get_wait_interval, + generate_md5_hash, + GLOBAL_COVERAGE_AREA, +) + +import firebase_admin +from firebase_admin import credentials +from firebase_admin import firestore +from google.cloud.firestore_v1 import DocumentReference +from google.cloud.firestore_v1.types import WriteResult +from server_config import get_config +from database.session import Database + +"""An implementation-dependent Manifest URI.""" +Location = t.NewType("Location", str) + +logger = logging.getLogger(__name__) + + +class ManifestException(Exception): + """Errors that occur in Manifest Clients.""" + + pass + + +class Stage(enum.Enum): + """A request can be either in one of the following stages at a time: + + fetch : This represents request is currently in fetch stage i.e. request placed on the client's server + & waiting for some result before starting download (eg. MARS client). + download : This represents request is currently in download stage i.e. data is being downloading from client's + server to the worker's local file system. + upload : This represents request is currently in upload stage i.e. data is getting uploaded from worker's local + file system to target location (GCS path). + retrieve : In case of clients where there is no proper separation of fetch & download stages (eg. CDS client), + request will be in the retrieve stage i.e. fetch + download. + """ + + RETRIEVE = "retrieve" + FETCH = "fetch" + DOWNLOAD = "download" + UPLOAD = "upload" + + +class Status(enum.Enum): + """Depicts the request's state status: + + scheduled : A request partition is created & scheduled for processing. + Note: Its corresponding state can be None only. + in-progress : This represents the request state is currently in-progress (i.e. running). + The next status would be "success" or "failure". + success : This represents the request state execution completed successfully without any error. + failure : This represents the request state execution failed. 
+ """ + + SCHEDULED = "scheduled" + IN_PROGRESS = "in-progress" + SUCCESS = "success" + FAILURE = "failure" + + +@dataclasses.dataclass +class DownloadStatus: + """Data recorded in `Manifest`s reflecting the status of a download.""" + + """The name of the config file associated with the request.""" + config_name: str = "" + + """Represents the dataset field of the configuration.""" + dataset: t.Optional[str] = "" + + """Copy of selection section of the configuration.""" + selection: t.Dict = dataclasses.field(default_factory=dict) + + """Location of the downloaded data.""" + location: str = "" + + """Represents area covered by the shard.""" + area: str = "" + + """Current stage of request : 'fetch', 'download', 'retrieve', 'upload' or None.""" + stage: t.Optional[Stage] = None + + """Download status: 'scheduled', 'in-progress', 'success', or 'failure'.""" + status: t.Optional[Status] = None + + """Cause of error, if any.""" + error: t.Optional[str] = "" + + """Identifier for the user running the download.""" + username: str = "" + + """Shard size in GB.""" + size: t.Optional[float] = 0 + + """A UTC datetime when download was scheduled.""" + scheduled_time: t.Optional[str] = "" + + """A UTC datetime when the retrieve stage starts.""" + retrieve_start_time: t.Optional[str] = "" + + """A UTC datetime when the retrieve state ends.""" + retrieve_end_time: t.Optional[str] = "" + + """A UTC datetime when the fetch state starts.""" + fetch_start_time: t.Optional[str] = "" + + """A UTC datetime when the fetch state ends.""" + fetch_end_time: t.Optional[str] = "" + + """A UTC datetime when the download state starts.""" + download_start_time: t.Optional[str] = "" + + """A UTC datetime when the download state ends.""" + download_end_time: t.Optional[str] = "" + + """A UTC datetime when the upload state starts.""" + upload_start_time: t.Optional[str] = "" + + """A UTC datetime when the upload state ends.""" + upload_end_time: t.Optional[str] = "" + + @classmethod + def from_dict(cls, download_status: t.Dict) -> "DownloadStatus": + """Instantiate DownloadStatus dataclass from dict.""" + download_status_instance = cls() + for key, value in download_status.items(): + if key == "status": + setattr(download_status_instance, key, Status(value)) + elif key == "stage" and value is not None: + setattr(download_status_instance, key, Stage(value)) + else: + setattr(download_status_instance, key, value) + return download_status_instance + + @classmethod + def to_dict(cls, instance) -> t.Dict: + """Return the fields of a dataclass instance as a manifest ingestible + dictionary mapping of field names to field values.""" + download_status_dict = {} + for field in dataclasses.fields(instance): + key = field.name + value = getattr(instance, field.name) + if isinstance(value, Status) or isinstance(value, Stage): + download_status_dict[key] = value.value + elif isinstance(value, pd.Timestamp): + download_status_dict[key] = value.isoformat() + elif key == "selection" and value is not None: + download_status_dict[key] = json.dumps(value) + else: + download_status_dict[key] = value + return download_status_dict + + +@dataclasses.dataclass +class Manifest(abc.ABC): + """Abstract manifest of download statuses. + + Update download statuses to some storage medium. + + This class lets one indicate that a download is `scheduled` or in a transaction process. + In the event of a transaction, a download will be updated with an `in-progress`, `success` + or `failure` status (with accompanying metadata). 
+
+    Example:
+        ```
+        my_manifest = parse_manifest_location(Location('fs://some-firestore-collection'))
+
+        # Schedule data for download
+        my_manifest.schedule({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username')
+
+        # ...
+
+        # Initiate a transaction – it will record that the download is `in-progress`
+        with my_manifest.transact({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username') as tx:
+            # download logic here
+            pass
+
+        # ...
+
+        # on error, will record the download as a `failure` before propagating the error. By default, it will
+        # record the download as a `success`.
+        ```
+
+    Attributes:
+        status: The current `DownloadStatus` of the Manifest.
+    """
+
+    # To reduce the impact of _read() and _update() calls
+    # on the start time of the stage.
+    prev_stage_precise_start_time: t.Optional[str] = None
+    status: t.Optional[DownloadStatus] = None
+
+    # This is overridden in subclass.
+    def __post_init__(self):
+        """Initialize the manifest."""
+        pass
+
+    def schedule(
+        self,
+        config_name: str,
+        dataset: str,
+        selection: t.Dict,
+        location: str,
+        user: str,
+    ) -> None:
+        """Indicate that a job has been scheduled for download.
+
+        'scheduled' jobs occur before 'in-progress', 'success' or 'failure'.
+        """
+        scheduled_time = (
+            datetime.datetime.utcnow()
+            .replace(tzinfo=datetime.timezone.utc)
+            .isoformat(timespec="seconds")
+        )
+        self.status = DownloadStatus(
+            config_name=config_name,
+            dataset=dataset if dataset else None,
+            selection=selection,
+            location=location,
+            area=fetch_geo_polygon(selection.get("area", GLOBAL_COVERAGE_AREA)),
+            username=user,
+            stage=None,
+            status=Status.SCHEDULED,
+            error=None,
+            size=None,
+            scheduled_time=scheduled_time,
+            retrieve_start_time=None,
+            retrieve_end_time=None,
+            fetch_start_time=None,
+            fetch_end_time=None,
+            download_start_time=None,
+            download_end_time=None,
+            upload_start_time=None,
+            upload_end_time=None,
+        )
+        self._update(self.status)
+
+    def skip(
+        self,
+        config_name: str,
+        dataset: str,
+        selection: t.Dict,
+        location: str,
+        user: str,
+    ) -> None:
+        """Updates the manifest to mark the shards that were skipped in the current job
+        as 'upload' stage and 'success' status, indicating that they have already been downloaded.
+        """
+        old_status = self._read(location)
+        # The manifest needs to be updated for a skipped shard if its entry is not present, or
+        # if the stage is not 'upload', or if the stage is 'upload' but the status is not 'success'.
+        if (
+            old_status.location != location
+            or old_status.stage != Stage.UPLOAD
+            or old_status.status != Status.SUCCESS
+        ):
+            current_utc_time = (
+                datetime.datetime.utcnow()
+                .replace(tzinfo=datetime.timezone.utc)
+                .isoformat(timespec="seconds")
+            )
+
+            size = get_file_size(location)
+
+            status = DownloadStatus(
+                config_name=config_name,
+                dataset=dataset if dataset else None,
+                selection=selection,
+                location=location,
+                area=fetch_geo_polygon(selection.get("area", GLOBAL_COVERAGE_AREA)),
+                username=user,
+                stage=Stage.UPLOAD,
+                status=Status.SUCCESS,
+                error=None,
+                size=size,
+                scheduled_time=None,
+                retrieve_start_time=None,
+                retrieve_end_time=None,
+                fetch_start_time=None,
+                fetch_end_time=None,
+                download_start_time=None,
+                download_end_time=None,
+                upload_start_time=current_utc_time,
+                upload_end_time=current_utc_time,
+            )
+            self._update(status)
+            logger.info(
+                f"Manifest updated for skipped shard: {location!r} -- {DownloadStatus.to_dict(status)!r}."
+ ) + + def _set_for_transaction( + self, + config_name: str, + dataset: str, + selection: t.Dict, + location: str, + user: str, + ) -> None: + """Reset Manifest state in preparation for a new transaction.""" + self.status = dataclasses.replace(self._read(location)) + self.status.config_name = config_name + self.status.dataset = dataset if dataset else None + self.status.selection = selection + self.status.location = location + self.status.username = user + + def __enter__(self) -> None: + pass + + def __exit__(self, exc_type, exc_inst, exc_tb) -> None: + """Record end status of a transaction as either 'success' or 'failure'.""" + if exc_type is None: + status = Status.SUCCESS + error = None + else: + status = Status.FAILURE + # For explanation, see https://docs.python.org/3/library/traceback.html#traceback.format_exception + error = "\n".join(traceback.format_exception(exc_type, exc_inst, exc_tb)) + + new_status = dataclasses.replace(self.status) + new_status.error = error + new_status.status = status + current_utc_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + + # This is necessary for setting the precise start time of the previous stage + # and end time of the final stage, as well as handling the case of Status.FAILURE. + if new_status.stage == Stage.FETCH: + new_status.fetch_start_time = self.prev_stage_precise_start_time + new_status.fetch_end_time = current_utc_time + elif new_status.stage == Stage.RETRIEVE: + new_status.retrieve_start_time = self.prev_stage_precise_start_time + new_status.retrieve_end_time = current_utc_time + elif new_status.stage == Stage.DOWNLOAD: + new_status.download_start_time = self.prev_stage_precise_start_time + new_status.download_end_time = current_utc_time + else: + new_status.upload_start_time = self.prev_stage_precise_start_time + new_status.upload_end_time = current_utc_time + + new_status.size = get_file_size(new_status.location) + + self.status = new_status + + self._update(self.status) + + def transact( + self, + config_name: str, + dataset: str, + selection: t.Dict, + location: str, + user: str, + ) -> "Manifest": + """Create a download transaction.""" + self._set_for_transaction(config_name, dataset, selection, location, user) + return self + + def set_stage(self, stage: Stage) -> None: + """Sets the current stage in manifest.""" + prev_stage = self.status.stage + new_status = dataclasses.replace(self.status) + new_status.stage = stage + new_status.status = Status.IN_PROGRESS + current_utc_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + + if stage == Stage.FETCH: + new_status.fetch_start_time = current_utc_time + elif stage == Stage.RETRIEVE: + new_status.retrieve_start_time = current_utc_time + elif stage == Stage.DOWNLOAD: + new_status.fetch_start_time = self.prev_stage_precise_start_time + new_status.fetch_end_time = current_utc_time + new_status.download_start_time = current_utc_time + else: + if prev_stage == Stage.DOWNLOAD: + new_status.download_start_time = self.prev_stage_precise_start_time + new_status.download_end_time = current_utc_time + else: + new_status.retrieve_start_time = self.prev_stage_precise_start_time + new_status.retrieve_end_time = current_utc_time + new_status.upload_start_time = current_utc_time + + self.status = new_status + self._update(self.status) + + @abc.abstractmethod + def _read(self, location: str) -> DownloadStatus: + pass + + @abc.abstractmethod + def _update(self, 
download_status: DownloadStatus) -> None: + pass + + +class FirestoreManifest(Manifest, Database): + """A Firestore Manifest. + This Manifest implementation stores DownloadStatuses in a Firebase document store. + The document hierarchy for the manifest is as follows: + [manifest ] + ├── doc_id (md5 hash of the path) { 'selection': {...}, 'location': ..., 'username': ... } + └── etc... + Where `[]` indicates a collection and ` {...}` indicates a document. + """ + + def _get_db(self) -> firestore.firestore.Client: + """Acquire a firestore client, initializing the firebase app if necessary. + Will attempt to get the db client five times. If it's still unsuccessful, a + `ManifestException` will be raised. + """ + db = None + attempts = 0 + + while db is None: + try: + db = firestore.client() + except ValueError as e: + # The above call will fail with a value error when the firebase app is not initialized. + # Initialize the app here, and try again. + # Use the application default credentials. + cred = credentials.ApplicationDefault() + + firebase_admin.initialize_app(cred) + logger.info("Initialized Firebase App.") + + if attempts > 4: + raise ManifestException( + "Exceeded number of retries to get firestore client." + ) from e + + time.sleep(get_wait_interval(attempts)) + + attempts += 1 + + return db + + def _read(self, location: str) -> DownloadStatus: + """Reads the JSON data from a manifest.""" + + doc_id = generate_md5_hash(location) + + # Update document with download status + download_doc_ref = self.root_document_for_store(doc_id) + + result = download_doc_ref.get() + row = {} + if result.exists: + records = result.to_dict() + row = {n: to_json_serializable_type(v) for n, v in records.items()} + return DownloadStatus.from_dict(row) + + def _update(self, download_status: DownloadStatus) -> None: + """Update or create a download status record.""" + logger.info("Updating Firestore Manifest.") + + status = DownloadStatus.to_dict(download_status) + doc_id = generate_md5_hash(status["location"]) + + # Update document with download status + download_doc_ref = self.root_document_for_store(doc_id) + + result: WriteResult = download_doc_ref.set(status) + + logger.info( + f"Firestore manifest updated. " + f"update_time={result.update_time}, " + f"filename={download_status.location}." + ) + + def root_document_for_store(self, store_scheme: str) -> DocumentReference: + """Get the root manifest document given the user's config and current document's storage location.""" + root_collection = get_config().manifest_collection + return self._get_db().collection(root_collection).document(store_scheme) diff --git a/weather_dl_v2/fastapi-server/config_processing/parsers.py b/weather_dl_v2/fastapi-server/config_processing/parsers.py new file mode 100644 index 00000000..5f9e1f5c --- /dev/null +++ b/weather_dl_v2/fastapi-server/config_processing/parsers.py @@ -0,0 +1,507 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
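[Editor's note] For readers skimming the manifest code above, the following standalone sketch (not part of the patch) shows how a single shard's record is keyed and shaped before `FirestoreManifest._update()` writes it. It assumes the `fastapi-server` directory is on `PYTHONPATH`; all field values are illustrative.

```python
from config_processing.manifest import DownloadStatus, Stage, Status
from config_processing.util import generate_md5_hash

location = "gs://my-bucket/era5/2020-01.nc"  # illustrative target path
doc_id = generate_md5_hash(location)         # Firestore document id for this shard

# to_dict() converts the Stage/Status enums to their string values and
# JSON-encodes the selection, which is the form stored in the manifest collection.
record = DownloadStatus.to_dict(
    DownloadStatus(
        config_name="example.cfg",
        selection={"year": ["2020"], "month": ["01"]},
        location=location,
        username="alice",
        stage=Stage.UPLOAD,
        status=Status.SUCCESS,
    )
)
print(doc_id, record)
```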
+ + +"""Parsers for ECMWF download configuration.""" + +import ast +import configparser +import copy as cp +import datetime +import json +import string +import textwrap +import typing as t +import numpy as np +from collections import OrderedDict +from .config import Config + +CLIENTS = ["cds", "mars", "ecpublic"] + + +def date(candidate: str) -> datetime.date: + """Converts ECMWF-format date strings into a `datetime.date`. + + Accepted absolute date formats: + - YYYY-MM-DD + - YYYYMMDD + - YYYY-DDD, where DDD refers to the day of the year + + For example: + - 2021-10-31 + - 19700101 + - 1950-007 + + See https://confluence.ecmwf.int/pages/viewpage.action?pageId=118817289 for date format spec. + Note: Name of month is not supported. + """ + converted = None + + # Parse relative day value. + if candidate.startswith("-"): + return datetime.date.today() + datetime.timedelta(days=int(candidate)) + + accepted_formats = ["%Y-%m-%d", "%Y%m%d", "%Y-%j"] + + for fmt in accepted_formats: + try: + converted = datetime.datetime.strptime(candidate, fmt).date() + break + except ValueError: + pass + + if converted is None: + raise ValueError( + f"Not a valid date: '{candidate}'. Please use valid relative or absolute format." + ) + + return converted + + +def time(candidate: str) -> datetime.time: + """Converts ECMWF-format time strings into a `datetime.time`. + + Accepted time formats: + - HH:MM + - HHMM + - HH + + For example: + - 18:00 + - 1820 + - 18 + + Note: If MM is omitted it defaults to 00. + """ + converted = None + + accepted_formats = ["%H", "%H:%M", "%H%M"] + + for fmt in accepted_formats: + try: + converted = datetime.datetime.strptime(candidate, fmt).time() + break + except ValueError: + pass + + if converted is None: + raise ValueError(f"Not a valid time: '{candidate}'. Please use valid format.") + + return converted + + +def day_month_year(candidate: t.Any) -> int: + """Converts day, month and year strings into 'int'.""" + try: + if isinstance(candidate, str) or isinstance(candidate, int): + return int(candidate) + raise ValueError("must be a str or int.") + except ValueError as e: + raise ValueError( + f"Not a valid day, month, or year value: {candidate}. Please use valid value." + ) from e + + +def parse_literal(candidate: t.Any) -> t.Any: + try: + # Support parsing ints with leading zeros, e.g. '01' + if isinstance(candidate, str) and candidate.isdigit(): + return int(candidate) + return ast.literal_eval(candidate) + except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError): + return candidate + + +def validate(key: str, value: int) -> None: + """Validates value based on the key.""" + if key == "day": + assert 1 <= value <= 31, "Day value must be between 1 to 31." + if key == "month": + assert 1 <= value <= 12, "Month value must be between 1 to 12." 
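[Editor's note] As a quick reference for the parsers above, this small sketch (not part of the patch, and again assuming the `fastapi-server` directory is importable) shows the formats they accept:

```python
import datetime

from config_processing.parsers import date, parse_literal, time

assert date("19700101") == datetime.date(1970, 1, 1)
assert date("1950-007") == datetime.date(1950, 1, 7)   # YYYY-DDD, day-of-year form
assert date("-2") == datetime.date.today() - datetime.timedelta(days=2)  # relative days
assert time("18") == datetime.time(18, 0)              # minutes default to 00
assert parse_literal("01") == 1                        # leading zeros parse as ints
```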
+ + +def typecast(key: str, value: t.Any) -> t.Any: + """Type the value to its appropriate datatype.""" + SWITCHER = { + "date": date, + "time": time, + "day": day_month_year, + "month": day_month_year, + "year": day_month_year, + } + converted = SWITCHER.get(key, parse_literal)(value) + validate(key, converted) + return converted + + +def _read_config_file(file: t.IO) -> t.Dict: + """Reads `*.json` or `*.cfg` files.""" + try: + return json.load(file) + except json.JSONDecodeError: + pass + + file.seek(0) + + try: + config = configparser.ConfigParser() + config.read_file(file) + config = {s: dict(config.items(s)) for s in config.sections()} + return config + except configparser.ParsingError: + return {} + + +def parse_config(file: t.IO) -> t.Dict: + """Parses a `*.json` or `*.cfg` file into a configuration dictionary.""" + config = _read_config_file(file) + config_by_section = {s: _parse_lists(v, s) for s, v in config.items()} + config_with_nesting = parse_subsections(config_by_section) + return config_with_nesting + + +def _splitlines(block: str) -> t.List[str]: + """Converts a multi-line block into a list of strings.""" + return [line.strip() for line in block.strip().splitlines()] + + +def mars_range_value(token: str) -> t.Union[datetime.date, int, float]: + """Converts a range token into either a date, int, or float.""" + try: + return date(token) + except ValueError: + pass + + if token.isdecimal(): + return int(token) + + try: + return float(token) + except ValueError: + raise ValueError( + "Token string must be an 'int', 'float', or 'datetime.date()'." + ) + + +def mars_increment_value(token: str) -> t.Union[int, float]: + """Converts an increment token into either an int or a float.""" + try: + return int(token) + except ValueError: + pass + + try: + return float(token) + except ValueError: + raise ValueError("Token string must be an 'int' or a 'float'.") + + +def parse_mars_syntax(block: str) -> t.List[str]: + """Parses MARS list or range into a list of arguments; ranges are inclusive. + + Types for the range and value are inferred. + + Examples: + >>> parse_mars_syntax("10/to/12") + ['10', '11', '12'] + >>> parse_mars_syntax("12/to/10/by/-1") + ['12', '11', '10'] + >>> parse_mars_syntax("0.0/to/0.5/by/0.1") + ['0.0', '0.1', '0.2', '0.30000000000000004', '0.4', '0.5'] + >>> parse_mars_syntax("2020-01-07/to/2020-01-14/by/2") + ['2020-01-07', '2020-01-09', '2020-01-11', '2020-01-13'] + >>> parse_mars_syntax("2020-01-14/to/2020-01-07/by/-2") + ['2020-01-14', '2020-01-12', '2020-01-10', '2020-01-08'] + + Returns: + A list of strings representing a range from start to finish, based on the + type of the values in the range. + If all range values are integers, it will return a list of strings of integers. + If range values are floats, it will return a list of strings of floats. + If the range values are dates, it will return a list of strings of dates in + YYYY-MM-DD format. (Note: here, the increment value should be an integer). + """ + + # Split into tokens, omitting empty strings. + tokens = [b.strip() for b in block.split("/") if b != ""] + + # Return list if no range operators are present. + if "to" not in tokens and "by" not in tokens: + return tokens + + # Parse range values, honoring 'to' and 'by' operators. + try: + to_idx = tokens.index("to") + assert to_idx != 0, "There must be a start token." 
+ start_token, end_token = tokens[to_idx - 1], tokens[to_idx + 1] + start, end = mars_range_value(start_token), mars_range_value(end_token) + + # Parse increment token, or choose default increment. + increment_token = "1" + increment = 1 + if "by" in tokens: + increment_token = tokens[tokens.index("by") + 1] + increment = mars_increment_value(increment_token) + except (AssertionError, IndexError, ValueError): + raise SyntaxError(f"Improper range syntax in '{block}'.") + + # Return a range of values with appropriate data type. + if isinstance(start, datetime.date) and isinstance(end, datetime.date): + if not isinstance(increment, int): + raise ValueError( + f"Increments on a date range must be integer number of days, '{increment_token}' is invalid." + ) + return [d.strftime("%Y-%m-%d") for d in date_range(start, end, increment)] + elif (isinstance(start, float) or isinstance(end, float)) and not isinstance( + increment, datetime.date + ): + # Increment can be either an int or a float. + _round_places = 4 + return [ + str(round(x, _round_places)).zfill(len(start_token)) + for x in np.arange(start, end + increment, increment) + ] + elif isinstance(start, int) and isinstance(end, int) and isinstance(increment, int): + # Honor leading zeros. + offset = 1 if start <= end else -1 + return [ + str(x).zfill(len(start_token)) + for x in range(start, end + offset, increment) + ] + else: + raise ValueError( + f"Range tokens (start='{start_token}', end='{end_token}', increment='{increment_token}')" + f" are inconsistent types." + ) + + +def date_range( + start: datetime.date, end: datetime.date, increment: int = 1 +) -> t.Iterable[datetime.date]: + """Gets a range of dates, inclusive.""" + offset = 1 if start <= end else -1 + return ( + start + datetime.timedelta(days=x) + for x in range(0, (end - start).days + offset, increment) + ) + + +def _parse_lists(config: dict, section: str = "") -> t.Dict: + """Parses multiline blocks in *.cfg and *.json files as lists.""" + for key, val in config.items(): + # Checks str type for backward compatibility since it also support "padding": 0 in json config + if not isinstance(val, str): + continue + + if "/" in val and "parameters" not in section: + config[key] = parse_mars_syntax(val) + elif "\n" in val: + config[key] = _splitlines(val) + + return config + + +def _number_of_replacements(s: t.Text): + format_names = [v[1] for v in string.Formatter().parse(s) if v[1] is not None] + num_empty_names = len([empty for empty in format_names if empty == ""]) + if num_empty_names != 0: + num_empty_names -= 1 + return len(set(format_names)) + num_empty_names + + +def parse_subsections(config: t.Dict) -> t.Dict: + """Interprets [section.subsection] as nested dictionaries in `.cfg` files.""" + copy = cp.deepcopy(config) + for key, val in copy.items(): + path = key.split(".") + runner = copy + parent = {} + p = None + for p in path: + if p not in runner: + runner[p] = {} + parent = runner + runner = runner[p] + parent[p] = val + + for_cleanup = [key for key, _ in copy.items() if "." 
in key] + for target in for_cleanup: + del copy[target] + return copy + + +def require( + condition: bool, message: str, error_type: t.Type[Exception] = ValueError +) -> None: + """A assert-like helper that wraps text and throws an error.""" + if not condition: + raise error_type(textwrap.dedent(message)) + + +def process_config(file: t.IO, config_name: str) -> Config: + """Read the config file and prompt the user if it is improperly structured.""" + config = parse_config(file) + + require(bool(config), "Unable to parse configuration file.") + require( + "parameters" in config, + """ + 'parameters' section required in configuration file. + + The 'parameters' section specifies the 'client', 'dataset', 'target_path', and + 'partition_key' for the API client. + + Please consult the documentation for more information.""", + ) + + params = config.get("parameters", {}) + require( + "target_template" not in params, + """ + 'target_template' is deprecated, use 'target_path' instead. + + Please consult the documentation for more information.""", + ) + require( + "target_path" in params, + """ + 'parameters' section requires a 'target_path' key. + + The 'target_path' is used to format the name of the output files. It + accepts Python 3.5+ string format symbols (e.g. '{}'). The number of symbols + should match the length of the 'partition_keys', as the 'partition_keys' args + are used to create the templates.""", + ) + require( + "client" in params, + """ + 'parameters' section requires a 'client' key. + + Supported clients are {} + """.format( + str(CLIENTS) + ), + ) + require( + params.get("client") in CLIENTS, + """ + Invalid 'client' parameter. + + Supported clients are {} + """.format( + str(CLIENTS) + ), + ) + require( + "append_date_dirs" not in params, + """ + The current version of 'google-weather-tools' no longer supports 'append_date_dirs'! + + Please refer to documentation for creating date-based directory hierarchy : + https://weather-tools.readthedocs.io/en/latest/Configuration.html#""" + """creating-a-date-based-directory-hierarchy.""", + NotImplementedError, + ) + require( + "target_filename" not in params, + """ + The current version of 'google-weather-tools' no longer supports 'target_filename'! + + Please refer to documentation : + https://weather-tools.readthedocs.io/en/latest/Configuration.html#parameters-section.""", + NotImplementedError, + ) + + partition_keys = params.get("partition_keys", list()) + if isinstance(partition_keys, str): + partition_keys = [partition_keys.strip()] + + selection = config.get("selection", dict()) + require( + all((key in selection for key in partition_keys)), + """ + All 'partition_keys' must appear in the 'selection' section. + + 'partition_keys' specify how to split data for workers. Please consult + documentation for more information.""", + ) + + num_template_replacements = _number_of_replacements(params["target_path"]) + num_partition_keys = len(partition_keys) + + require( + num_template_replacements == num_partition_keys, + """ + 'target_path' has {0} replacements. Expected {1}, since there are {1} + partition keys. + """.format( + num_template_replacements, num_partition_keys + ), + ) + + if "day" in partition_keys: + require( + selection["day"] != "all", + """If 'all' is used for a selection value, it cannot appear as a partition key.""", + ) + + # Ensure consistent lookup. + config["parameters"]["partition_keys"] = partition_keys + # Add config file name. 
+ config["parameters"]["config_name"] = config_name + + # Ensure the cartesian-cross can be taken on singleton values for the partition. + for key in partition_keys: + if not isinstance(selection[key], list): + selection[key] = [selection[key]] + + return Config.from_dict(config) + + +def prepare_target_name(config: Config) -> str: + """Returns name of target location.""" + partition_dict = OrderedDict( + (key, typecast(key, config.selection[key][0])) for key in config.partition_keys + ) + target = config.target_path.format(*partition_dict.values(), **partition_dict) + + return target + + +def get_subsections(config: Config) -> t.List[t.Tuple[str, t.Dict]]: + """Collect parameter subsections from main configuration. + + If the `parameters` section contains subsections (e.g. '[parameters.1]', + '[parameters.2]'), collect the subsection key-value pairs. Otherwise, + return an empty dictionary (i.e. there are no subsections). + + This is useful for specifying multiple API keys for your configuration. + For example: + ``` + [parameters.alice] + api_key=KKKKK1 + api_url=UUUUU1 + [parameters.bob] + api_key=KKKKK2 + api_url=UUUUU2 + [parameters.eve] + api_key=KKKKK3 + api_url=UUUUU3 + ``` + """ + return [ + (name, params) + for name, params in config.kwargs.items() + if isinstance(params, dict) + ] or [("default", {})] diff --git a/weather_dl_v2/fastapi-server/config_processing/partition.py b/weather_dl_v2/fastapi-server/config_processing/partition.py new file mode 100644 index 00000000..a9f6a9e2 --- /dev/null +++ b/weather_dl_v2/fastapi-server/config_processing/partition.py @@ -0,0 +1,129 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +import copy as cp +import dataclasses +import itertools +import typing as t + +from .manifest import Manifest +from .parsers import prepare_target_name +from .config import Config +from .stores import Store, FSStore + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class PartitionConfig: + """Partition a config into multiple data requests. + + Partitioning involves four main operations: First, we fan-out shards based on + partition keys (a cross product of the values). Second, we filter out existing + downloads (unless we want to force downloads). Last, we assemble each partition + into a single Config. + + Attributes: + store: A cloud storage system, used for checking the existence of downloads. + manifest: A download manifest to register preparation state. + """ + + config: Config + store: Store + manifest: Manifest + + def _create_partition_config(self, option: t.Tuple) -> Config: + """Create a config for a single partition option. + + Output a config dictionary, overriding the range of values for + each key with the partition instance in 'selection'. + Continuing the example from prepare_partitions, the selection section + would be: + { 'foo': ..., 'year': ['2020'], 'month': ['01'], ... } + { 'foo': ..., 'year': ['2020'], 'month': ['02'], ... 
} + { 'foo': ..., 'year': ['2020'], 'month': ['03'], ... } + + Args: + option: A single item in the range of partition_keys. + config: The download config, including the parameters and selection sections. + + Returns: + A configuration with that selects a single download partition. + """ + copy = cp.deepcopy(self.config.selection) + out = cp.deepcopy(self.config) + for idx, key in enumerate(self.config.partition_keys): + copy[key] = [option[idx]] + + out.selection = copy + return out + + def skip_partition(self, config: Config) -> bool: + """Return true if partition should be skipped.""" + + if config.force_download: + return False + + target = prepare_target_name(config) + if self.store.exists(target): + logger.info(f"file {target} found, skipping.") + self.manifest.skip( + config.config_name, + config.dataset, + config.selection, + target, + config.user_id, + ) + return True + + return False + + def prepare_partitions(self) -> t.Iterator[Config]: + """Iterate over client parameters, partitioning over `partition_keys`. + + This produces a Cartesian-Cross over the range of keys. + + For example, if the keys were 'year' and 'month', it would produce + an iterable like: + ( ('2020', '01'), ('2020', '02'), ('2020', '03'), ...) + + Returns: + An iterator of `Config`s. + """ + for option in itertools.product( + *[self.config.selection[key] for key in self.config.partition_keys] + ): + yield self._create_partition_config(option) + + def new_downloads_only(self, candidate: Config) -> bool: + """Predicate function to skip already downloaded partitions.""" + if self.store is None: + self.store = FSStore() + should_skip = self.skip_partition(candidate) + + return not should_skip + + def update_manifest_collection(self, partition: Config) -> Config: + """Updates the DB.""" + location = prepare_target_name(partition) + self.manifest.schedule( + partition.config_name, + partition.dataset, + partition.selection, + location, + partition.user_id, + ) + logger.info(f"Created partition {location!r}.") diff --git a/weather_dl_v2/fastapi-server/config_processing/pipeline.py b/weather_dl_v2/fastapi-server/config_processing/pipeline.py new file mode 100644 index 00000000..175dd798 --- /dev/null +++ b/weather_dl_v2/fastapi-server/config_processing/pipeline.py @@ -0,0 +1,69 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
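[Editor's note] To make the fan-out performed by `PartitionConfig.prepare_partitions()` concrete, here is a self-contained sketch with illustrative values (not part of the patch):

```python
import itertools

selection = {"year": ["2020"], "month": ["01", "02"], "time": ["00:00"]}
partition_keys = ["year", "month"]

# Cartesian cross-product over the partition keys; each option becomes one shard
# whose selection pins those keys to a single value.
for option in itertools.product(*[selection[key] for key in partition_keys]):
    shard = dict(selection)
    for key, value in zip(partition_keys, option):
        shard[key] = [value]
    print(shard)
# {'year': ['2020'], 'month': ['01'], 'time': ['00:00']}
# {'year': ['2020'], 'month': ['02'], 'time': ['00:00']}
```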
+ + +import getpass +import logging +import os +from .parsers import process_config +from .partition import PartitionConfig +from .manifest import FirestoreManifest +from database.download_handler import get_download_handler +from database.queue_handler import get_queue_handler +from fastapi.concurrency import run_in_threadpool + +logger = logging.getLogger(__name__) + +download_handler = get_download_handler() +queue_handler = get_queue_handler() + + +def _do_partitions(partition_obj: PartitionConfig): + for partition in partition_obj.prepare_partitions(): + # Skip existing downloads + if partition_obj.new_downloads_only(partition): + partition_obj.update_manifest_collection(partition) + + +# TODO: Make partitioning faster. +async def start_processing_config(config_file, licenses, force_download): + config = {} + manifest = FirestoreManifest() + + with open(config_file, "r", encoding="utf-8") as f: + # configs/example.cfg -> example.cfg + config_name = os.path.split(config_file)[1] + config = process_config(f, config_name) + + config.force_download = force_download + config.user_id = getpass.getuser() + + partition_obj = PartitionConfig(config, None, manifest) + + # Make entry in 'download' & 'queues' collection. + await download_handler._start_download(config_name, config.client) + await download_handler._mark_partitioning_status( + config_name, "Partitioning in-progress." + ) + try: + # Prepare partitions + await run_in_threadpool(_do_partitions, partition_obj) + await download_handler._mark_partitioning_status( + config_name, "Partitioning completed." + ) + await queue_handler._update_queues_on_start_download(config_name, licenses) + except Exception as e: + error_str = f"Partitioning failed for {config_name} due to {e}." + logger.error(error_str) + await download_handler._mark_partitioning_status(config_name, error_str) diff --git a/weather_dl_v2/fastapi-server/config_processing/stores.py b/weather_dl_v2/fastapi-server/config_processing/stores.py new file mode 100644 index 00000000..4f60e337 --- /dev/null +++ b/weather_dl_v2/fastapi-server/config_processing/stores.py @@ -0,0 +1,122 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Download destinations, or `Store`s.""" + +import abc +import io +import os +import tempfile +import typing as t + +from apache_beam.io.filesystems import FileSystems + + +class Store(abc.ABC): + """A interface to represent where downloads are stored. + + Default implementation uses Apache Beam's Filesystems. 
+ """ + + @abc.abstractmethod + def open(self, filename: str, mode: str = "r") -> t.IO: + pass + + @abc.abstractmethod + def exists(self, filename: str) -> bool: + pass + + +class InMemoryStore(Store): + """Store file data in memory.""" + + def __init__(self): + self.store = {} + + def open(self, filename: str, mode: str = "r") -> t.IO: + """Create or read in-memory data.""" + if "b" in mode: + file = io.BytesIO() + else: + file = io.StringIO() + self.store[filename] = file + return file + + def exists(self, filename: str) -> bool: + """Return true if the 'file' exists in memory.""" + return filename in self.store + + +class TempFileStore(Store): + """Store data into temporary files.""" + + def __init__(self, directory: t.Optional[str] = None) -> None: + """Optionally specify the directory that contains all temporary files.""" + self.dir = directory + if self.dir and not os.path.exists(self.dir): + os.makedirs(self.dir) + + def open(self, filename: str, mode: str = "r") -> t.IO: + """Create a temporary file in the store directory.""" + return tempfile.TemporaryFile(mode, dir=self.dir) + + def exists(self, filename: str) -> bool: + """Return true if file exists.""" + return os.path.exists(filename) + + +class LocalFileStore(Store): + """Store data into local files.""" + + def __init__(self, directory: t.Optional[str] = None) -> None: + """Optionally specify the directory that contains all downloaded files.""" + self.dir = directory + if self.dir and not os.path.exists(self.dir): + os.makedirs(self.dir) + + def open(self, filename: str, mode: str = "r") -> t.IO: + """Open a local file from the store directory.""" + return open(os.sep.join([self.dir, filename]), mode) + + def exists(self, filename: str) -> bool: + """Returns true if local file exists.""" + return os.path.exists(os.sep.join([self.dir, filename])) + + +class FSStore(Store): + """Store data into any store supported by Apache Beam's FileSystems.""" + + def open(self, filename: str, mode: str = "r") -> t.IO: + """Open object in cloud bucket (or local file system) as a read or write channel. + + To work with cloud storage systems, only a read or write channel can be openend + at one time. Data will be treated as bytes, not text (equivalent to `rb` or `wb`). + + Further, append operations, or writes on existing objects, are dissallowed (the + error thrown will depend on the implementation of the underlying cloud provider). + """ + if "r" in mode and "w" not in mode: + return FileSystems().open(filename) + + if "w" in mode and "r" not in mode: + return FileSystems().create(filename) + + raise ValueError( + f"invalid mode {mode!r}: mode must have either 'r' or 'w', but not both." + ) + + def exists(self, filename: str) -> bool: + """Returns true if object exists.""" + return FileSystems().exists(filename) diff --git a/weather_dl_v2/fastapi-server/config_processing/util.py b/weather_dl_v2/fastapi-server/config_processing/util.py new file mode 100644 index 00000000..765a9c47 --- /dev/null +++ b/weather_dl_v2/fastapi-server/config_processing/util.py @@ -0,0 +1,229 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +import datetime +import geojson +import hashlib +import itertools +import os +import socket +import subprocess +import sys +import typing as t + +import numpy as np +import pandas as pd +from apache_beam.io.gcp import gcsio +from apache_beam.utils import retry +from xarray.core.utils import ensure_us_time_resolution +from urllib.parse import urlparse +from google.api_core.exceptions import BadRequest + + +LATITUDE_RANGE = (-90, 90) +LONGITUDE_RANGE = (-180, 180) +GLOBAL_COVERAGE_AREA = [90, -180, -90, 180] + +logger = logging.getLogger(__name__) + + +def _retry_if_valid_input_but_server_or_socket_error_and_timeout_filter( + exception, +) -> bool: + if isinstance(exception, socket.timeout): + return True + if isinstance(exception, TimeoutError): + return True + # To handle the concurrency issue in BigQuery. + if isinstance(exception, BadRequest): + return True + return retry.retry_if_valid_input_but_server_error_and_timeout_filter(exception) + + +class _FakeClock: + + def sleep(self, value): + pass + + +def retry_with_exponential_backoff(fun): + """A retry decorator that doesn't apply during test time.""" + clock = retry.Clock() + + # Use a fake clock only during test time... + if "unittest" in sys.modules.keys(): + clock = _FakeClock() + + return retry.with_exponential_backoff( + retry_filter=_retry_if_valid_input_but_server_or_socket_error_and_timeout_filter, + clock=clock, + )(fun) + + +# TODO(#245): Group with common utilities (duplicated) +def ichunked(iterable: t.Iterable, n: int) -> t.Iterator[t.Iterable]: + """Yield evenly-sized chunks from an iterable.""" + input_ = iter(iterable) + try: + while True: + it = itertools.islice(input_, n) + # peek to check if 'it' has next item. + first = next(it) + yield itertools.chain([first], it) + except StopIteration: + pass + + +# TODO(#245): Group with common utilities (duplicated) +def copy(src: str, dst: str) -> None: + """Copy data via `gsutil cp`.""" + try: + subprocess.run(["gsutil", "cp", src, dst], check=True, capture_output=True) + except subprocess.CalledProcessError as e: + logger.info( + f'Failed to copy file {src!r} to {dst!r} due to {e.stderr.decode("utf-8")}.' + ) + raise + + +# TODO(#245): Group with common utilities (duplicated) +def to_json_serializable_type(value: t.Any) -> t.Any: + """Returns the value with a type serializable to JSON""" + # Note: The order of processing is significant. + logger.info("Serializing to JSON.") + + if pd.isna(value) or value is None: + return None + elif np.issubdtype(type(value), np.floating): + return float(value) + elif isinstance(value, np.ndarray): + # Will return a scaler if array is of size 1, else will return a list. + return value.tolist() + elif ( + isinstance(value, datetime.datetime) + or isinstance(value, str) + or isinstance(value, np.datetime64) + ): + # Assume strings are ISO format timestamps... + try: + value = datetime.datetime.fromisoformat(value) + except ValueError: + # ... if they are not, assume serialization is already correct. + return value + except TypeError: + # ... maybe value is a numpy datetime ... + try: + value = ensure_us_time_resolution(value).astype(datetime.datetime) + except AttributeError: + # ... value is a datetime object, continue. + pass + + # We use a string timestamp representation. + if value.tzname(): + return value.isoformat() + + # We assume here that naive timestamps are in UTC timezone. 
+ return value.replace(tzinfo=datetime.timezone.utc).isoformat() + elif isinstance(value, np.timedelta64): + # Return time delta in seconds. + return float(value / np.timedelta64(1, "s")) + # This check must happen after processing np.timedelta64 and np.datetime64. + elif np.issubdtype(type(value), np.integer): + return int(value) + + return value + + +def fetch_geo_polygon(area: t.Union[list, str]) -> str: + """Calculates a geography polygon from an input area.""" + # Ref: https://confluence.ecmwf.int/pages/viewpage.action?pageId=151520973 + if isinstance(area, str): + # European area + if area == "E": + area = [73.5, -27, 33, 45] + # Global area + elif area == "G": + area = GLOBAL_COVERAGE_AREA + else: + raise RuntimeError(f"Not a valid value for area in config: {area}.") + + n, w, s, e = [float(x) for x in area] + if s < LATITUDE_RANGE[0]: + raise ValueError(f"Invalid latitude value for south: '{s}'") + if n > LATITUDE_RANGE[1]: + raise ValueError(f"Invalid latitude value for north: '{n}'") + if w < LONGITUDE_RANGE[0]: + raise ValueError(f"Invalid longitude value for west: '{w}'") + if e > LONGITUDE_RANGE[1]: + raise ValueError(f"Invalid longitude value for east: '{e}'") + + # Define the coordinates of the bounding box. + coords = [[w, n], [w, s], [e, s], [e, n], [w, n]] + + # Create the GeoJSON polygon object. + polygon = geojson.dumps(geojson.Polygon([coords])) + return polygon + + +def get_file_size(path: str) -> float: + parsed_gcs_path = urlparse(path) + if parsed_gcs_path.scheme != "gs" or parsed_gcs_path.netloc == "": + return os.stat(path).st_size / (1024**3) if os.path.exists(path) else 0 + else: + return ( + gcsio.GcsIO().size(path) / (1024**3) if gcsio.GcsIO().exists(path) else 0 + ) + + +def get_wait_interval(num_retries: int = 0) -> float: + """Returns next wait interval in seconds, using an exponential backoff algorithm.""" + if 0 == num_retries: + return 0 + return 2**num_retries + + +def generate_md5_hash(input: str) -> str: + """Generates md5 hash for the input string.""" + return hashlib.md5(input.encode("utf-8")).hexdigest() + + +def download_with_aria2(url: str, path: str) -> None: + """Downloads a file from the given URL using the `aria2c` command-line utility, + with options set to improve download speed and reliability.""" + dir_path, file_name = os.path.split(path) + try: + subprocess.run( + [ + "aria2c", + "-x", + "16", + "-s", + "16", + url, + "-d", + dir_path, + "-o", + file_name, + "--allow-overwrite", + ], + check=True, + capture_output=True, + ) + except subprocess.CalledProcessError as e: + logger.info( + f'Failed download from server {url!r} to {path!r} due to {e.stderr.decode("utf-8")}.' + ) + raise diff --git a/weather_dl_v2/fastapi-server/database/__init__.py b/weather_dl_v2/fastapi-server/database/__init__.py new file mode 100644 index 00000000..5678014c --- /dev/null +++ b/weather_dl_v2/fastapi-server/database/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
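[Editor's note] Before moving on to the database handlers, a brief illustrative check (not part of the patch) of a few helpers from the utility module above:

```python
from config_processing.util import fetch_geo_polygon, generate_md5_hash, get_wait_interval

# Exponential backoff used while acquiring the Firestore client: 0s, 2s, 4s, 8s, ...
assert [get_wait_interval(n) for n in range(4)] == [0, 2, 4, 8]

# Manifest documents are keyed by the md5 hash of the shard's target location.
print(generate_md5_hash("gs://my-bucket/era5/2020-01.nc"))

# An area given as [N, W, S, E] becomes a GeoJSON polygon string.
print(fetch_geo_polygon([90, -180, -90, 180]))
```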
diff --git a/weather_dl_v2/fastapi-server/database/download_handler.py b/weather_dl_v2/fastapi-server/database/download_handler.py new file mode 100644 index 00000000..1377e5b4 --- /dev/null +++ b/weather_dl_v2/fastapi-server/database/download_handler.py @@ -0,0 +1,160 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import abc +import logging +from firebase_admin import firestore +from google.cloud.firestore_v1 import DocumentSnapshot, FieldFilter +from google.cloud.firestore_v1.types import WriteResult +from database.session import get_async_client +from server_config import get_config + +logger = logging.getLogger(__name__) + + +def get_download_handler(): + return DownloadHandlerFirestore(db=get_async_client()) + + +def get_mock_download_handler(): + return DownloadHandlerMock() + + +class DownloadHandler(abc.ABC): + + @abc.abstractmethod + async def _start_download(self, config_name: str, client_name: str) -> None: + pass + + @abc.abstractmethod + async def _stop_download(self, config_name: str) -> None: + pass + + @abc.abstractmethod + async def _mark_partitioning_status(self, config_name: str, status: str) -> None: + pass + + @abc.abstractmethod + async def _check_download_exists(self, config_name: str) -> bool: + pass + + @abc.abstractmethod + async def _get_downloads(self, client_name: str) -> list: + pass + + @abc.abstractmethod + async def _get_download_by_config_name(self, config_name: str): + pass + + +class DownloadHandlerMock(DownloadHandler): + + def __init__(self): + pass + + async def _start_download(self, config_name: str, client_name: str) -> None: + logger.info( + f"Added {config_name} in 'download' collection. Update_time: 000000." + ) + + async def _stop_download(self, config_name: str) -> None: + logger.info( + f"Removed {config_name} in 'download' collection. Update_time: 000000." + ) + + async def _mark_partitioning_status(self, config_name: str, status: str) -> None: + logger.info( + f"Updated {config_name} in 'download' collection. Update_time: 000000." 
+ ) + + async def _check_download_exists(self, config_name: str) -> bool: + if config_name == "not_exist": + return False + elif config_name == "not_exist.cfg": + return False + else: + return True + + async def _get_downloads(self, client_name: str) -> list: + return [{"config_name": "example.cfg", "client_name": "client", "status": "partitioning completed."}] + + async def _get_download_by_config_name(self, config_name: str): + if config_name == "not_exist": + return None + return {"config_name": "example.cfg", "client_name": "client", "status": "partitioning completed."} + + +class DownloadHandlerFirestore(DownloadHandler): + + def __init__(self, db: firestore.firestore.Client): + self.db = db + self.collection = get_config().download_collection + + async def _start_download(self, config_name: str, client_name: str) -> None: + result: WriteResult = ( + await self.db.collection(self.collection) + .document(config_name) + .set({"config_name": config_name, "client_name": client_name}) + ) + + logger.info( + f"Added {config_name} in 'download' collection. Update_time: {result.update_time}." + ) + + async def _stop_download(self, config_name: str) -> None: + timestamp = ( + await self.db.collection(self.collection).document(config_name).delete() + ) + logger.info( + f"Removed {config_name} in 'download' collection. Update_time: {timestamp}." + ) + + async def _mark_partitioning_status(self, config_name: str, status: str) -> None: + timestamp = ( + await self.db.collection(self.collection) + .document(config_name) + .update({"status": status}) + ) + logger.info( + f"Updated {config_name} in 'download' collection. Update_time: {timestamp}." + ) + + async def _check_download_exists(self, config_name: str) -> bool: + result: DocumentSnapshot = ( + await self.db.collection(self.collection).document(config_name).get() + ) + return result.exists + + async def _get_downloads(self, client_name: str) -> list: + docs = [] + if client_name: + docs = ( + self.db.collection(self.collection) + .where(filter=FieldFilter("client_name", "==", client_name)) + .stream() + ) + else: + docs = self.db.collection(self.collection).stream() + + return [doc.to_dict() async for doc in docs] + + async def _get_download_by_config_name(self, config_name: str): + result: DocumentSnapshot = ( + await self.db.collection(self.collection).document(config_name).get() + ) + if result.exists: + return result.to_dict() + else: + return None diff --git a/weather_dl_v2/fastapi-server/database/license_handler.py b/weather_dl_v2/fastapi-server/database/license_handler.py new file mode 100644 index 00000000..d4878e25 --- /dev/null +++ b/weather_dl_v2/fastapi-server/database/license_handler.py @@ -0,0 +1,200 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
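[Editor's note] A minimal usage sketch for the download handler above (not part of the patch). It assumes application-default credentials and a reachable Firestore project; the config and client names are illustrative:

```python
import asyncio

from database.download_handler import get_download_handler


async def main():
    handler = get_download_handler()
    # Create the 'download' document, then record the partitioning outcome.
    await handler._start_download("example.cfg", "cds")
    await handler._mark_partitioning_status("example.cfg", "Partitioning completed.")
    print(await handler._get_download_by_config_name("example.cfg"))


asyncio.run(main())
```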
+ + +import abc +import logging +from firebase_admin import firestore +from google.cloud.firestore_v1 import DocumentSnapshot, FieldFilter +from google.cloud.firestore_v1.types import WriteResult +from database.session import get_async_client +from server_config import get_config + + +logger = logging.getLogger(__name__) + + +def get_license_handler(): + return LicenseHandlerFirestore(db=get_async_client()) + + +def get_mock_license_handler(): + return LicenseHandlerMock() + + +class LicenseHandler(abc.ABC): + + @abc.abstractmethod + async def _add_license(self, license_dict: dict) -> str: + pass + + @abc.abstractmethod + async def _delete_license(self, license_id: str) -> None: + pass + + @abc.abstractmethod + async def _check_license_exists(self, license_id: str) -> bool: + pass + + @abc.abstractmethod + async def _get_license_by_license_id(self, license_id: str) -> dict: + pass + + @abc.abstractmethod + async def _get_license_by_client_name(self, client_name: str) -> list: + pass + + @abc.abstractmethod + async def _get_licenses(self) -> list: + pass + + @abc.abstractmethod + async def _update_license(self, license_id: str, license_dict: dict) -> None: + pass + + @abc.abstractmethod + async def _get_license_without_deployment(self) -> list: + pass + + +class LicenseHandlerMock(LicenseHandler): + + def __init__(self): + pass + + async def _add_license(self, license_dict: dict) -> str: + license_id = "L1" + logger.info(f"Added {license_id} in 'license' collection. Update_time: 00000.") + return license_id + + async def _delete_license(self, license_id: str) -> None: + logger.info( + f"Removed {license_id} in 'license' collection. Update_time: 00000." + ) + + async def _update_license(self, license_id: str, license_dict: dict) -> None: + logger.info( + f"Updated {license_id} in 'license' collection. Update_time: 00000." + ) + + async def _check_license_exists(self, license_id: str) -> bool: + if license_id == "not_exist": + return False + elif license_id == "no-exists": + return False + else: + return True + + async def _get_license_by_license_id(self, license_id: str) -> dict: + if license_id == "not_exist": + return None + return { + "license_id": license_id, + "secret_id": "xxxx", + "client_name": "dummy_client", + "k8s_deployment_id": "k1", + "number_of_requets": 100, + } + + async def _get_license_by_client_name(self, client_name: str) -> list: + return [{ + "license_id": "L1", + "secret_id": "xxxx", + "client_name": client_name, + "k8s_deployment_id": "k1", + "number_of_requets": 100, + }] + + async def _get_licenses(self) -> list: + return [{ + "license_id": "L1", + "secret_id": "xxxx", + "client_name": "dummy_client", + "k8s_deployment_id": "k1", + "number_of_requets": 100, + }] + + async def _get_license_without_deployment(self) -> list: + return [] + + +class LicenseHandlerFirestore(LicenseHandler): + + def __init__(self, db: firestore.firestore.AsyncClient): + self.db = db + self.collection = get_config().license_collection + + async def _add_license(self, license_dict: dict) -> str: + license_dict["license_id"] = license_dict["license_id"].lower() + license_id = license_dict["license_id"] + + result: WriteResult = ( + await self.db.collection(self.collection) + .document(license_id) + .set(license_dict) + ) + logger.info( + f"Added {license_id} in 'license' collection. Update_time: {result.update_time}." 
+ ) + return license_id + + async def _delete_license(self, license_id: str) -> None: + timestamp = ( + await self.db.collection(self.collection).document(license_id).delete() + ) + logger.info( + f"Removed {license_id} in 'license' collection. Update_time: {timestamp}." + ) + + async def _update_license(self, license_id: str, license_dict: dict) -> None: + result: WriteResult = ( + await self.db.collection(self.collection) + .document(license_id) + .update(license_dict) + ) + logger.info( + f"Updated {license_id} in 'license' collection. Update_time: {result.update_time}." + ) + + async def _check_license_exists(self, license_id: str) -> bool: + result: DocumentSnapshot = ( + await self.db.collection(self.collection).document(license_id).get() + ) + return result.exists + + async def _get_license_by_license_id(self, license_id: str) -> dict: + result: DocumentSnapshot = ( + await self.db.collection(self.collection).document(license_id).get() + ) + return result.to_dict() + + async def _get_license_by_client_name(self, client_name: str) -> list: + docs = ( + self.db.collection(self.collection) + .where(filter=FieldFilter("client_name", "==", client_name)) + .stream() + ) + return [doc.to_dict() async for doc in docs] + + async def _get_licenses(self) -> list: + docs = self.db.collection(self.collection).stream() + return [doc.to_dict() async for doc in docs] + + async def _get_license_without_deployment(self) -> list: + docs = ( + self.db.collection(self.collection) + .where(filter=FieldFilter("k8s_deployment_id", "==", "")) + .stream() + ) + return [doc.to_dict() async for doc in docs] diff --git a/weather_dl_v2/fastapi-server/database/manifest_handler.py b/weather_dl_v2/fastapi-server/database/manifest_handler.py new file mode 100644 index 00000000..d5facfab --- /dev/null +++ b/weather_dl_v2/fastapi-server/database/manifest_handler.py @@ -0,0 +1,181 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
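+
+# Handlers for the 'manifest' collection. Shard counts are computed server-side via
+# Firestore aggregation queries (.count().get()) combined with And/Or composite
+# filters; e.g. "in progress" means status == 'in-progress' OR
+# (status == 'success' AND stage != 'upload').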
+ + +import abc +import logging +from firebase_admin import firestore +from google.cloud.firestore_v1.base_query import FieldFilter, Or, And +from server_config import get_config +from database.session import get_async_client + +logger = logging.getLogger(__name__) + + +def get_manifest_handler(): + return ManifestHandlerFirestore(db=get_async_client()) + + +def get_mock_manifest_handler(): + return ManifestHandlerMock() + + +class ManifestHandler(abc.ABC): + + @abc.abstractmethod + async def _get_download_success_count(self, config_name: str) -> int: + pass + + @abc.abstractmethod + async def _get_download_failure_count(self, config_name: str) -> int: + pass + + @abc.abstractmethod + async def _get_download_scheduled_count(self, config_name: str) -> int: + pass + + @abc.abstractmethod + async def _get_download_inprogress_count(self, config_name: str) -> int: + pass + + @abc.abstractmethod + async def _get_download_total_count(self, config_name: str) -> int: + pass + + @abc.abstractmethod + async def _get_non_successfull_downloads(self, config_name: str) -> list: + pass + + +class ManifestHandlerMock(ManifestHandler): + + async def _get_download_failure_count(self, config_name: str) -> int: + return 0 + + async def _get_download_inprogress_count(self, config_name: str) -> int: + return 0 + + async def _get_download_scheduled_count(self, config_name: str) -> int: + return 0 + + async def _get_download_success_count(self, config_name: str) -> int: + return 0 + + async def _get_download_total_count(self, config_name: str) -> int: + return 0 + + async def _get_non_successfull_downloads(self, config_name: str) -> list: + return [] + + +class ManifestHandlerFirestore(ManifestHandler): + + def __init__(self, db: firestore.firestore.Client): + self.db = db + self.collection = get_config().manifest_collection + + async def _get_download_success_count(self, config_name: str) -> int: + result = ( + await self.db.collection(self.collection) + .where(filter=FieldFilter("config_name", "==", config_name)) + .where(filter=FieldFilter("stage", "==", "upload")) + .where(filter=FieldFilter("status", "==", "success")) + .count() + .get() + ) + + count = result[0][0].value + + return count + + async def _get_download_failure_count(self, config_name: str) -> int: + result = ( + await self.db.collection(self.collection) + .where(filter=FieldFilter("config_name", "==", config_name)) + .where(filter=FieldFilter("status", "==", "failure")) + .count() + .get() + ) + + count = result[0][0].value + + return count + + async def _get_download_scheduled_count(self, config_name: str) -> int: + result = ( + await self.db.collection(self.collection) + .where(filter=FieldFilter("config_name", "==", config_name)) + .where(filter=FieldFilter("status", "==", "scheduled")) + .count() + .get() + ) + + count = result[0][0].value + + return count + + async def _get_download_inprogress_count(self, config_name: str) -> int: + and_filter = And( + filters=[ + FieldFilter("status", "==", "success"), + FieldFilter("stage", "!=", "upload"), + ] + ) + or_filter = Or(filters=[FieldFilter("status", "==", "in-progress"), and_filter]) + + result = ( + await self.db.collection(self.collection) + .where(filter=FieldFilter("config_name", "==", config_name)) + .where(filter=or_filter) + .count() + .get() + ) + + count = result[0][0].value + + return count + + async def _get_download_total_count(self, config_name: str) -> int: + result = ( + await self.db.collection(self.collection) + .where(filter=FieldFilter("config_name", "==", config_name)) + 
.count() + .get() + ) + + count = result[0][0].value + + return count + + async def _get_non_successfull_downloads(self, config_name: str) -> list: + or_filter = Or( + filters=[ + FieldFilter("stage", "==", "fetch"), + FieldFilter("stage", "==", "download"), + And( + filters=[ + FieldFilter("status", "!=", "success"), + FieldFilter("stage", "==", "upload"), + ] + ), + ] + ) + + docs = ( + self.db.collection(self.collection) + .where(filter=FieldFilter("config_name", "==", config_name)) + .where(filter=or_filter) + .stream() + ) + return [doc.to_dict() async for doc in docs] diff --git a/weather_dl_v2/fastapi-server/database/queue_handler.py b/weather_dl_v2/fastapi-server/database/queue_handler.py new file mode 100644 index 00000000..1909d583 --- /dev/null +++ b/weather_dl_v2/fastapi-server/database/queue_handler.py @@ -0,0 +1,247 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import abc +import logging +from firebase_admin import firestore +from google.cloud.firestore_v1 import DocumentSnapshot, FieldFilter +from google.cloud.firestore_v1.types import WriteResult +from database.session import get_async_client +from server_config import get_config + +logger = logging.getLogger(__name__) + + +def get_queue_handler(): + return QueueHandlerFirestore(db=get_async_client()) + + +def get_mock_queue_handler(): + return QueueHandlerMock() + + +class QueueHandler(abc.ABC): + + @abc.abstractmethod + async def _create_license_queue(self, license_id: str, client_name: str) -> None: + pass + + @abc.abstractmethod + async def _remove_license_queue(self, license_id: str) -> None: + pass + + @abc.abstractmethod + async def _get_queues(self) -> list: + pass + + @abc.abstractmethod + async def _get_queue_by_license_id(self, license_id: str) -> dict: + pass + + @abc.abstractmethod + async def _get_queue_by_client_name(self, client_name: str) -> list: + pass + + @abc.abstractmethod + async def _update_license_queue(self, license_id: str, priority_list: list) -> None: + pass + + @abc.abstractmethod + async def _update_queues_on_start_download( + self, config_name: str, licenses: list + ) -> None: + pass + + @abc.abstractmethod + async def _update_queues_on_stop_download(self, config_name: str) -> None: + pass + + @abc.abstractmethod + async def _update_config_priority_in_license( + self, license_id: str, config_name: str, priority: int + ) -> None: + pass + + @abc.abstractmethod + async def _update_client_name_in_license_queue( + self, license_id: str, client_name: str + ) -> None: + pass + + +class QueueHandlerMock(QueueHandler): + + def __init__(self): + pass + + async def _create_license_queue(self, license_id: str, client_name: str) -> None: + logger.info( + f"Added {license_id} queue in 'queues' collection. Update_time: 000000." + ) + + async def _remove_license_queue(self, license_id: str) -> None: + logger.info( + f"Removed {license_id} queue in 'queues' collection. Update_time: 000000." 
+ ) + + async def _get_queues(self) -> list: + return [{"client_name": "dummy_client", "license_id": "L1", "queue": []}] + + async def _get_queue_by_license_id(self, license_id: str) -> dict: + if license_id == "not_exist": + return None + return {"client_name": "dummy_client", "license_id": license_id, "queue": []} + + async def _get_queue_by_client_name(self, client_name: str) -> list: + return [{"client_name": client_name, "license_id": "L1", "queue": []}] + + async def _update_license_queue(self, license_id: str, priority_list: list) -> None: + logger.info( + f"Updated {license_id} queue in 'queues' collection. Update_time: 00000." + ) + + async def _update_queues_on_start_download( + self, config_name: str, licenses: list + ) -> None: + logger.info( + f"Updated {license} queue in 'queues' collection. Update_time: 00000." + ) + + async def _update_queues_on_stop_download(self, config_name: str) -> None: + logger.info( + "Updated snapshot.id queue in 'queues' collection. Update_time: 00000." + ) + + async def _update_config_priority_in_license( + self, license_id: str, config_name: str, priority: int + ) -> None: + logger.info( + "Updated snapshot.id queue in 'queues' collection. Update_time: 00000." + ) + + async def _update_client_name_in_license_queue( + self, license_id: str, client_name: str + ) -> None: + logger.info( + "Updated snapshot.id queue in 'queues' collection. Update_time: 00000." + ) + + +class QueueHandlerFirestore(QueueHandler): + + def __init__(self, db: firestore.firestore.Client): + self.db = db + self.collection = get_config().queues_collection + + async def _create_license_queue(self, license_id: str, client_name: str) -> None: + result: WriteResult = ( + await self.db.collection(self.collection) + .document(license_id) + .set({"license_id": license_id, "client_name": client_name, "queue": []}) + ) + logger.info( + f"Added {license_id} queue in 'queues' collection. Update_time: {result.update_time}." + ) + + async def _remove_license_queue(self, license_id: str) -> None: + timestamp = ( + await self.db.collection(self.collection).document(license_id).delete() + ) + logger.info( + f"Removed {license_id} queue in 'queues' collection. Update_time: {timestamp}." + ) + + async def _get_queues(self) -> list: + docs = self.db.collection(self.collection).stream() + return [doc.to_dict() async for doc in docs] + + async def _get_queue_by_license_id(self, license_id: str) -> dict: + result: DocumentSnapshot = ( + await self.db.collection(self.collection).document(license_id).get() + ) + return result.to_dict() + + async def _get_queue_by_client_name(self, client_name: str) -> list: + docs = ( + self.db.collection(self.collection) + .where(filter=FieldFilter("client_name", "==", client_name)) + .stream() + ) + return [doc.to_dict() async for doc in docs] + + async def _update_license_queue(self, license_id: str, priority_list: list) -> None: + result: WriteResult = ( + await self.db.collection(self.collection) + .document(license_id) + .update({"queue": priority_list}) + ) + logger.info( + f"Updated {license_id} queue in 'queues' collection. Update_time: {result.update_time}." + ) + + async def _update_queues_on_start_download( + self, config_name: str, licenses: list + ) -> None: + for license in licenses: + result: WriteResult = ( + await self.db.collection(self.collection) + .document(license) + .update({"queue": firestore.ArrayUnion([config_name])}) + ) + logger.info( + f"Updated {license} queue in 'queues' collection. Update_time: {result.update_time}." 
+ ) + + async def _update_queues_on_stop_download(self, config_name: str) -> None: + snapshot_list = await self.db.collection(self.collection).get() + for snapshot in snapshot_list: + result: WriteResult = ( + await self.db.collection(self.collection) + .document(snapshot.id) + .update({"queue": firestore.ArrayRemove([config_name])}) + ) + logger.info( + f"Updated {snapshot.id} queue in 'queues' collection. Update_time: {result.update_time}." + ) + + async def _update_config_priority_in_license( + self, license_id: str, config_name: str, priority: int + ) -> None: + snapshot: DocumentSnapshot = ( + await self.db.collection(self.collection).document(license_id).get() + ) + priority_list = snapshot.to_dict()["queue"] + new_priority_list = [c for c in priority_list if c != config_name] + new_priority_list.insert(priority, config_name) + result: WriteResult = ( + await self.db.collection(self.collection) + .document(license_id) + .update({"queue": new_priority_list}) + ) + logger.info( + f"Updated {snapshot.id} queue in 'queues' collection. Update_time: {result.update_time}." + ) + + async def _update_client_name_in_license_queue( + self, license_id: str, client_name: str + ) -> None: + result: WriteResult = ( + await self.db.collection(self.collection) + .document(license_id) + .update({"client_name": client_name}) + ) + logger.info( + f"Updated {license_id} queue in 'queues' collection. Update_time: {result.update_time}." + ) diff --git a/weather_dl_v2/fastapi-server/database/session.py b/weather_dl_v2/fastapi-server/database/session.py new file mode 100644 index 00000000..85dbc8be --- /dev/null +++ b/weather_dl_v2/fastapi-server/database/session.py @@ -0,0 +1,79 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import time +import abc +import logging +import firebase_admin +from google.cloud import firestore +from firebase_admin import credentials +from config_processing.util import get_wait_interval +from server_config import get_config +from gcloud import storage + +logger = logging.getLogger(__name__) + + +class Database(abc.ABC): + + @abc.abstractmethod + def _get_db(self): + pass + + +db: firestore.AsyncClient = None +gcs: storage.Client = None + + +def get_async_client() -> firestore.AsyncClient: + global db + attempts = 0 + + while db is None: + try: + db = firestore.AsyncClient() + except ValueError as e: + # The above call will fail with a value error when the firebase app is not initialized. + # Initialize the app here, and try again. + # Use the application default credentials. + cred = credentials.ApplicationDefault() + + firebase_admin.initialize_app(cred) + logger.info("Initialized Firebase App.") + + if attempts > 4: + raise RuntimeError( + "Exceeded number of retries to get firestore client." 
+ ) from e + + time.sleep(get_wait_interval(attempts)) + + attempts += 1 + + return db + + +def get_gcs_client() -> storage.Client: + global gcs + + if gcs: + return gcs + + try: + gcs = storage.Client(project=get_config().gcs_project) + except ValueError as e: + logger.error(f"Error initializing GCS client: {e}.") + + return gcs diff --git a/weather_dl_v2/fastapi-server/database/storage_handler.py b/weather_dl_v2/fastapi-server/database/storage_handler.py new file mode 100644 index 00000000..fcdf6a1a --- /dev/null +++ b/weather_dl_v2/fastapi-server/database/storage_handler.py @@ -0,0 +1,77 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import abc +import os +import logging +import tempfile +import contextlib +import typing as t +from google.cloud import storage +from database.session import get_gcs_client +from server_config import get_config + + +logger = logging.getLogger(__name__) + + +def get_storage_handler(): + return StorageHandlerGCS(client=get_gcs_client()) + + +class StorageHandler(abc.ABC): + + @abc.abstractmethod + def _upload_file(self, file_path) -> str: + pass + + @abc.abstractmethod + def _open_local(self, file_name) -> t.Iterator[str]: + pass + + +class StorageHandlerMock(StorageHandler): + + def __init__(self) -> None: + pass + + def _upload_file(self, file_path) -> None: + pass + + def _open_local(self, file_name) -> t.Iterator[str]: + pass + + +class StorageHandlerGCS(StorageHandler): + + def __init__(self, client: storage.Client) -> None: + self.client = client + self.bucket = self.client.get_bucket(get_config().storage_bucket) + + def _upload_file(self, file_path) -> str: + filename = os.path.basename(file_path).split("/")[-1] + + blob = self.bucket.blob(filename) + blob.upload_from_filename(file_path) + + logger.info(f"Uploaded {filename} to {self.bucket}.") + return blob.public_url + + @contextlib.contextmanager + def _open_local(self, file_name) -> t.Iterator[str]: + blob = self.bucket.blob(file_name) + with tempfile.NamedTemporaryFile() as dest_file: + blob.download_to_filename(dest_file.name) + yield dest_file.name diff --git a/weather_dl_v2/fastapi-server/environment.yml b/weather_dl_v2/fastapi-server/environment.yml new file mode 100644 index 00000000..a6ce07fb --- /dev/null +++ b/weather_dl_v2/fastapi-server/environment.yml @@ -0,0 +1,18 @@ +name: weather-dl-v2-server +channels: + - conda-forge +dependencies: + - python=3.10 + - xarray + - geojson + - pip=22.3 + - google-cloud-sdk=410.0.0 + - pip: + - kubernetes + - fastapi[all]==0.97.0 + - python-multipart + - numpy + - apache-beam[gcp] + - aiohttp + - firebase-admin + - gcloud diff --git a/weather_dl_v2/fastapi-server/example.cfg b/weather_dl_v2/fastapi-server/example.cfg new file mode 100644 index 00000000..6747012c --- /dev/null +++ b/weather_dl_v2/fastapi-server/example.cfg @@ -0,0 +1,32 @@ +[parameters] +client=mars + +target_path=gs:///test-weather-dl-v2/{date}T00z.gb +partition_keys= + date + # step + +# API Keys & Subsections go here... 
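+# Note: weather-dl typically expands partition_keys into one request per unique
+# combination of the listed keys (here, one request per date in the range below).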
+ +[selection] +class=od +type=pf +stream=enfo +expver=0001 +levtype=pl +levelist=100 +# params: +# (z) Geopotential 129, (t) Temperature 130, +# (u) U component of wind 131, (v) V component of wind 132, +# (q) Specific humidity 133, (w) vertical velocity 135, +# (vo) Vorticity (relative) 138, (d) Divergence 155, +# (r) Relative humidity 157 +param=129.128 +# +# next: 2019-01-01/to/existing +# +date=2019-07-18/to/2019-07-20 +time=0000 +step=0/to/2 +number=1/to/2 +grid=F640 diff --git a/weather_dl_v2/fastapi-server/license_dep/deployment_creator.py b/weather_dl_v2/fastapi-server/license_dep/deployment_creator.py new file mode 100644 index 00000000..f79521d2 --- /dev/null +++ b/weather_dl_v2/fastapi-server/license_dep/deployment_creator.py @@ -0,0 +1,67 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +from os import path +import yaml +from kubernetes import client, config +from server_config import get_config + +logger = logging.getLogger(__name__) + + +def create_license_deployment(license_id: str) -> str: + """Creates a kubernetes workflow of type Job for downloading the data.""" + config.load_config() + + with open(path.join(path.dirname(__file__), "license_deployment.yaml")) as f: + deployment_manifest = yaml.safe_load(f) + deployment_name = f"weather-dl-v2-license-dep-{license_id}".lower() + + # Update the deployment name with a unique identifier + deployment_manifest["metadata"]["name"] = deployment_name + deployment_manifest["spec"]["template"]["spec"]["containers"][0]["args"] = [ + "--license", + license_id, + ] + deployment_manifest["spec"]["template"]["spec"]["containers"][0][ + "image" + ] = get_config().license_deployment_image + + # Create an instance of the Kubernetes API client + api_instance = client.AppsV1Api() + # Create the deployment in the specified namespace + response = api_instance.create_namespaced_deployment( + body=deployment_manifest, namespace="default" + ) + + logger.info(f"Deployment created successfully: {response.metadata.name}.") + return deployment_name + + +def terminate_license_deployment(license_id: str) -> None: + # Load Kubernetes configuration + config.load_config() + + # Create an instance of the Kubernetes API client + api_instance = client.AppsV1Api() + + # Specify the name and namespace of the deployment to delete + deployment_name = f"weather-dl-v2-license-dep-{license_id}".lower() + + # Delete the deployment + api_instance.delete_namespaced_deployment(name=deployment_name, namespace="default") + + logger.info(f"Deployment '{deployment_name}' deleted successfully.") diff --git a/weather_dl_v2/fastapi-server/license_dep/license_deployment.yaml b/weather_dl_v2/fastapi-server/license_dep/license_deployment.yaml new file mode 100644 index 00000000..707e5b91 --- /dev/null +++ b/weather_dl_v2/fastapi-server/license_dep/license_deployment.yaml @@ -0,0 +1,35 @@ +# weather-dl-v2-license-dep Deployment +# Defines the deployment of the app running in a pod on any worker node +apiVersion: apps/v1 +kind: 
Deployment +metadata: + name: weather-dl-v2-license-dep + labels: + app: weather-dl-v2-license-dep +spec: + replicas: 1 + selector: + matchLabels: + app: weather-dl-v2-license-dep + template: + metadata: + labels: + app: weather-dl-v2-license-dep + spec: + containers: + - name: weather-dl-v2-license-dep + image: XXXXXXX + imagePullPolicy: Always + args: [] + volumeMounts: + - name: config-volume + mountPath: ./config + volumes: + - name: config-volume + configMap: + name: dl-v2-config + # resources: + # # You must specify requests for CPU to autoscale + # # based on CPU utilization + # requests: + # cpu: "250m" \ No newline at end of file diff --git a/weather_dl_v2/fastapi-server/logging.conf b/weather_dl_v2/fastapi-server/logging.conf new file mode 100644 index 00000000..ed0a5e29 --- /dev/null +++ b/weather_dl_v2/fastapi-server/logging.conf @@ -0,0 +1,36 @@ +[loggers] +keys=root,server + +[handlers] +keys=consoleHandler,detailedConsoleHandler + +[formatters] +keys=normalFormatter,detailedFormatter + +[logger_root] +level=INFO +handlers=consoleHandler + +[logger_server] +level=DEBUG +handlers=detailedConsoleHandler +qualname=server +propagate=0 + +[handler_consoleHandler] +class=StreamHandler +level=DEBUG +formatter=normalFormatter +args=(sys.stdout,) + +[handler_detailedConsoleHandler] +class=StreamHandler +level=DEBUG +formatter=detailedFormatter +args=(sys.stdout,) + +[formatter_normalFormatter] +format=%(asctime)s loglevel=%(levelname)-6s logger=%(name)s %(funcName)s() msg:%(message)s + +[formatter_detailedFormatter] +format=%(asctime)s loglevel=%(levelname)-6s logger=%(name)s %(funcName)s() msg:%(message)s call_trace=%(pathname)s L%(lineno)-4d \ No newline at end of file diff --git a/weather_dl_v2/fastapi-server/main.py b/weather_dl_v2/fastapi-server/main.py new file mode 100644 index 00000000..05124123 --- /dev/null +++ b/weather_dl_v2/fastapi-server/main.py @@ -0,0 +1,70 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +import os +import logging.config +from contextlib import asynccontextmanager +from fastapi import FastAPI +from routers import license, download, queues +from database.license_handler import get_license_handler +from routers.license import get_create_deployment +from server_config import get_config + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + +# set up logger. 
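+# logging.conf defines the root and 'server' loggers, both writing to stdout.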
+logging.config.fileConfig("logging.conf", disable_existing_loggers=False) +logger = logging.getLogger(__name__) + + +async def create_pending_license_deployments(): + """Creates license deployments for Licenses whose deployments does not exist.""" + license_handler = get_license_handler() + create_deployment = get_create_deployment() + license_list = await license_handler._get_license_without_deployment() + + for _license in license_list: + license_id = _license["license_id"] + try: + logger.info(f"Creating license deployment for {license_id}.") + await create_deployment(license_id, license_handler) + except Exception as e: + logger.error(f"License deployment failed for {license_id}. Exception: {e}.") + + +@asynccontextmanager +async def lifespan(app: FastAPI): + logger.info("Started FastAPI server.") + # Boot up + # Make directory to store the uploaded config files. + os.makedirs(os.path.join(os.getcwd(), "config_files"), exist_ok=True) + # Retrieve license information & create license deployment if needed. + await create_pending_license_deployments() + # TODO: Automatically create required indexes on firestore collections on server startup. + yield + # Clean up + + +app = FastAPI(lifespan=lifespan) + +app.include_router(license.router) +app.include_router(download.router) +app.include_router(queues.router) + + +@app.get("/") +async def main(): + return {"msg": get_config().welcome_message} diff --git a/weather_dl_v2/fastapi-server/routers/download.py b/weather_dl_v2/fastapi-server/routers/download.py new file mode 100644 index 00000000..e3de4b57 --- /dev/null +++ b/weather_dl_v2/fastapi-server/routers/download.py @@ -0,0 +1,386 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
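+
+# Routes in this router (mounted at /download):
+#   POST   /download                       -- upload a .cfg and start partitioning
+#   GET    /download                       -- list configs, with optional client/status filters
+#   GET    /download/show/{config_name}    -- return the parsed config contents
+#   GET    /download/{config_name}         -- shard statistics for one config
+#   DELETE /download/{config_name}         -- stop the download and clear it from queues
+#   POST   /download/retry/{config_name}   -- reschedule non-successful partitions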
+ + +import asyncio +import logging +import os +import shutil +import json + +from enum import Enum +from config_processing.parsers import parse_config, process_config +from config_processing.config import Config +from fastapi import APIRouter, HTTPException, BackgroundTasks, UploadFile, Depends, Body +from config_processing.pipeline import start_processing_config +from database.download_handler import DownloadHandler, get_download_handler +from database.queue_handler import QueueHandler, get_queue_handler +from database.license_handler import LicenseHandler, get_license_handler +from database.manifest_handler import ManifestHandler, get_manifest_handler +from database.storage_handler import StorageHandler, get_storage_handler +from config_processing.manifest import FirestoreManifest, Manifest +from fastapi.concurrency import run_in_threadpool + +logger = logging.getLogger(__name__) + +router = APIRouter( + prefix="/download", + tags=["download"], + responses={404: {"description": "Not found"}}, +) + + +async def fetch_config_stats( + config_name: str, client_name: str, status: str, manifest_handler: ManifestHandler +): + """Get all the config stats parallely.""" + + success_coroutine = manifest_handler._get_download_success_count(config_name) + scheduled_coroutine = manifest_handler._get_download_scheduled_count(config_name) + failure_coroutine = manifest_handler._get_download_failure_count(config_name) + inprogress_coroutine = manifest_handler._get_download_inprogress_count(config_name) + total_coroutine = manifest_handler._get_download_total_count(config_name) + + ( + success_count, + scheduled_count, + failure_count, + inprogress_count, + total_count, + ) = await asyncio.gather( + success_coroutine, + scheduled_coroutine, + failure_coroutine, + inprogress_coroutine, + total_coroutine, + ) + + return { + "config_name": config_name, + "client_name": client_name, + "partitioning_status": status, + "downloaded_shards": success_count, + "scheduled_shards": scheduled_count, + "failed_shards": failure_count, + "in-progress_shards": inprogress_count, + "total_shards": total_count, + } + + +def get_fetch_config_stats(): + return fetch_config_stats + + +def get_fetch_config_stats_mock(): + async def fetch_config_stats( + config_name: str, client_name: str, status: str, manifest_handler: ManifestHandler + ): + return { + "config_name": config_name, + "client_name": client_name, + "downloaded_shards": 0, + "scheduled_shards": 0, + "failed_shards": 0, + "in-progress_shards": 0, + "total_shards": 0, + } + + return fetch_config_stats + + +def get_upload(): + def upload(file: UploadFile): + dest = os.path.join(os.getcwd(), "config_files", file.filename) + with open(dest, "wb+") as dest_: + shutil.copyfileobj(file.file, dest_) + + logger.info(f"Uploading {file.filename} to gcs bucket.") + storage_handler: StorageHandler = get_storage_handler() + storage_handler._upload_file(dest) + return dest + + return upload + + +def get_upload_mock(): + def upload(file: UploadFile): + return f"{os.getcwd()}/tests/test_data/{file.filename}" + + return upload + + +def get_reschedule_partitions(): + def invoke_manifest_schedule( + partition_list: list, config: Config, manifest: Manifest + ): + for partition in partition_list: + logger.info(f"Rescheduling partition {partition}.") + manifest.schedule( + config.config_name, + config.dataset, + json.loads(partition["selection"]), + partition["location"], + partition["username"], + ) + + async def reschedule_partitions(config_name: str, licenses: list): + manifest_handler: 
ManifestHandler = get_manifest_handler() + download_handler: DownloadHandler = get_download_handler() + queue_handler: QueueHandler = get_queue_handler() + storage_handler: StorageHandler = get_storage_handler() + + partition_list = await manifest_handler._get_non_successfull_downloads( + config_name + ) + + config = None + manifest = FirestoreManifest() + + with storage_handler._open_local(config_name) as local_path: + with open(local_path, "r", encoding="utf-8") as f: + config = process_config(f, config_name) + + await download_handler._mark_partitioning_status( + config_name, "Partitioning in-progress." + ) + + try: + if config is None: + logger.error( + f"Failed reschedule_partitions. Could not open {config_name}." + ) + raise FileNotFoundError( + f"Failed reschedule_partitions. Could not open {config_name}." + ) + + await run_in_threadpool( + invoke_manifest_schedule, partition_list, config, manifest + ) + await download_handler._mark_partitioning_status( + config_name, "Partitioning completed." + ) + await queue_handler._update_queues_on_start_download(config_name, licenses) + except Exception as e: + error_str = f"Partitioning failed for {config_name} due to {e}." + logger.error(error_str) + await download_handler._mark_partitioning_status(config_name, error_str) + + return reschedule_partitions + + +def get_reschedule_partitions_mock(): + def reschedule_partitions(config_name: str, licenses: list): + pass + + return reschedule_partitions + + +# Can submit a config to the server. +@router.post("/") +async def submit_download( + file: UploadFile | None = None, + licenses: list = [], + force_download: bool = False, + background_tasks: BackgroundTasks = BackgroundTasks(), + download_handler: DownloadHandler = Depends(get_download_handler), + license_handler: LicenseHandler = Depends(get_license_handler), + upload=Depends(get_upload), +): + if not file: + logger.error("No upload file sent.") + raise HTTPException(status_code=404, detail="No upload file sent.") + else: + if await download_handler._check_download_exists(file.filename): + logger.error( + f"Please stop the ongoing download of the config file '{file.filename}' " + "before attempting to start a new download." + ) + raise HTTPException( + status_code=400, + detail=f"Please stop the ongoing download of the config file '{file.filename}' " + "before attempting to start a new download.", + ) + + for license_id in licenses: + if not await license_handler._check_license_exists(license_id): + logger.info(f"No such license {license_id}.") + raise HTTPException( + status_code=404, detail=f"No such license {license_id}." + ) + try: + dest = upload(file) + # Start processing config. + background_tasks.add_task( + start_processing_config, dest, licenses, force_download + ) + return { + "message": f"file '{file.filename}' saved at '{dest}' successfully." + } + except Exception as e: + logger.error(f"Failed to save file '{file.filename} due to {e}.") + raise HTTPException( + status_code=500, detail=f"Failed to save file '{file.filename}'." 
+ ) + + +class DownloadStatus(str, Enum): + COMPLETED = "completed" + FAILED = "failed" + IN_PROGRESS = "in-progress" + + +@router.get("/show/{config_name}") +async def show_download_config( + config_name: str, + download_handler: DownloadHandler = Depends(get_download_handler), + storage_handler: StorageHandler = Depends(get_storage_handler), +): + if not await download_handler._check_download_exists(config_name): + logger.error(f"No such download config {config_name} to show.") + raise HTTPException( + status_code=404, + detail=f"No such download config {config_name} to show.", + ) + + contents = None + + with storage_handler._open_local(config_name) as local_path: + with open(local_path, "r", encoding="utf-8") as f: + contents = parse_config(f) + logger.info(f"Contents of {config_name}: {contents}.") + + return {"config_name": config_name, "contents": contents} + + +# Can check the current status of the submitted config. +# List status for all the downloads + handle filters +@router.get("/") +async def get_downloads( + client_name: str | None = None, + status: DownloadStatus | None = None, + download_handler: DownloadHandler = Depends(get_download_handler), + manifest_handler: ManifestHandler = Depends(get_manifest_handler), + fetch_config_stats=Depends(get_fetch_config_stats), +): + downloads = await download_handler._get_downloads(client_name) + coroutines = [] + + for download in downloads: + coroutines.append( + fetch_config_stats( + download["config_name"], + download["client_name"], + download["status"], + manifest_handler, + ) + ) + + config_details = await asyncio.gather(*coroutines) + + if status is None: + return config_details + + if status.value == DownloadStatus.COMPLETED: + return list( + filter( + lambda detail: detail["downloaded_shards"] == detail["total_shards"], + config_details, + ) + ) + elif status.value == DownloadStatus.FAILED: + return list(filter(lambda detail: detail["failed_shards"] > 0, config_details)) + elif status.value == DownloadStatus.IN_PROGRESS: + return list( + filter( + lambda detail: detail["downloaded_shards"] != detail["total_shards"], + config_details, + ) + ) + else: + return config_details + + +# Get status of particular download +@router.get("/{config_name}") +async def get_download_by_config_name( + config_name: str, + download_handler: DownloadHandler = Depends(get_download_handler), + manifest_handler: ManifestHandler = Depends(get_manifest_handler), + fetch_config_stats=Depends(get_fetch_config_stats), +): + download = await download_handler._get_download_by_config_name(config_name) + + if download is None: + logger.error(f"Download config {config_name} not found in weather-dl v2.") + raise HTTPException( + status_code=404, + detail=f"Download config {config_name} not found in weather-dl v2.", + ) + + return await fetch_config_stats( + download["config_name"], + download["client_name"], + download["status"], + manifest_handler, + ) + + +# Stop & remove the execution of the config. 
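+# Deleting a config removes its 'download' document and also pops the config name
+# from every license queue (QueueHandler._update_queues_on_stop_download).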
+@router.delete("/{config_name}") +async def delete_download( + config_name: str, + download_handler: DownloadHandler = Depends(get_download_handler), + queue_handler: QueueHandler = Depends(get_queue_handler), +): + if not await download_handler._check_download_exists(config_name): + logger.error(f"No such download config {config_name} to stop & remove.") + raise HTTPException( + status_code=404, + detail=f"No such download config {config_name} to stop & remove.", + ) + + await download_handler._stop_download(config_name) + await queue_handler._update_queues_on_stop_download(config_name) + return { + "config_name": config_name, + "message": "Download config stopped & removed successfully.", + } + + +@router.post("/retry/{config_name}") +async def retry_config( + config_name: str, + licenses: list = Body(embed=True), + background_tasks: BackgroundTasks = BackgroundTasks(), + download_handler: DownloadHandler = Depends(get_download_handler), + license_handler: LicenseHandler = Depends(get_license_handler), + reschedule_partitions=Depends(get_reschedule_partitions), +): + if not await download_handler._check_download_exists(config_name): + logger.error(f"No such download config {config_name} to retry.") + raise HTTPException( + status_code=404, + detail=f"No such download config {config_name} to retry.", + ) + + for license_id in licenses: + if not await license_handler._check_license_exists(license_id): + logger.info(f"No such license {license_id}.") + raise HTTPException( + status_code=404, detail=f"No such license {license_id}." + ) + + background_tasks.add_task(reschedule_partitions, config_name, licenses) + + return {"msg": "Refetch initiated successfully."} diff --git a/weather_dl_v2/fastapi-server/routers/license.py b/weather_dl_v2/fastapi-server/routers/license.py new file mode 100644 index 00000000..05ac5139 --- /dev/null +++ b/weather_dl_v2/fastapi-server/routers/license.py @@ -0,0 +1,202 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +import re +from fastapi import APIRouter, HTTPException, BackgroundTasks, Depends +from pydantic import BaseModel +from license_dep.deployment_creator import create_license_deployment, terminate_license_deployment +from database.license_handler import LicenseHandler, get_license_handler +from database.queue_handler import QueueHandler, get_queue_handler + +logger = logging.getLogger(__name__) + + +class License(BaseModel): + license_id: str + client_name: str + number_of_requests: int + secret_id: str + + +class LicenseInternal(License): + k8s_deployment_id: str + + +# Can perform CRUD on license table -- helps in handling API KEY expiry. +router = APIRouter( + prefix="/license", + tags=["license"], + responses={404: {"description": "Not found"}}, +) + + +# Add/Update k8s deployment ID for existing license (intenally). 
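+# Not exposed as an HTTP route; called by create_deployment once the Kubernetes
+# deployment for the license has been created.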
+async def update_license_internal( + license_id: str, + k8s_deployment_id: str, + license_handler: LicenseHandler, +): + if not await license_handler._check_license_exists(license_id): + logger.info(f"No such license {license_id} to update.") + raise HTTPException( + status_code=404, detail=f"No such license {license_id} to update." + ) + license_dict = {"k8s_deployment_id": k8s_deployment_id} + + await license_handler._update_license(license_id, license_dict) + return {"license_id": license_id, "message": "License updated successfully."} + + +def get_create_deployment(): + async def create_deployment(license_id: str, license_handler: LicenseHandler): + k8s_deployment_id = create_license_deployment(license_id) + await update_license_internal(license_id, k8s_deployment_id, license_handler) + + return create_deployment + + +def get_create_deployment_mock(): + async def create_deployment_mock(license_id: str, license_handler: LicenseHandler): + logger.info("create deployment mock.") + + return create_deployment_mock + + +def get_terminate_license_deployment(): + return terminate_license_deployment + + +def get_terminate_license_deployment_mock(): + def get_terminate_license_deployment_mock(license_id): + logger.info(f"terminating license deployment for {license_id}.") + + return get_terminate_license_deployment_mock + + +# List all the license + handle filters of {client_name} +@router.get("/") +async def get_licenses( + client_name: str | None = None, + license_handler: LicenseHandler = Depends(get_license_handler), +): + if client_name: + result = await license_handler._get_license_by_client_name(client_name) + else: + result = await license_handler._get_licenses() + return result + + +# Get particular license +@router.get("/{license_id}") +async def get_license_by_license_id( + license_id: str, license_handler: LicenseHandler = Depends(get_license_handler) +): + result = await license_handler._get_license_by_license_id(license_id) + if not result: + logger.info(f"License {license_id} not found.") + raise HTTPException(status_code=404, detail=f"License {license_id} not found.") + return result + + +# Update existing license +@router.put("/{license_id}") +async def update_license( + license_id: str, + license: License, + license_handler: LicenseHandler = Depends(get_license_handler), + queue_handler: QueueHandler = Depends(get_queue_handler), + create_deployment=Depends(get_create_deployment), + terminate_license_deployment=Depends(get_terminate_license_deployment), +): + if not await license_handler._check_license_exists(license_id): + logger.error(f"No such license {license_id} to update.") + raise HTTPException( + status_code=404, detail=f"No such license {license_id} to update." + ) + + license_dict = license.dict() + await license_handler._update_license(license_id, license_dict) + await queue_handler._update_client_name_in_license_queue( + license_id, license_dict["client_name"] + ) + + terminate_license_deployment(license_id) + await create_deployment(license_id, license_handler) + return {"license_id": license_id, "name": "License updated successfully."} + + +# Add new license +@router.post("/") +async def add_license( + license: License, + background_tasks: BackgroundTasks = BackgroundTasks(), + license_handler: LicenseHandler = Depends(get_license_handler), + queue_handler: QueueHandler = Depends(get_queue_handler), + create_deployment=Depends(get_create_deployment), +): + license_id = license.license_id.lower() + + # Check if license id is in correct format. 
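+    # The pattern mirrors Kubernetes object-name rules (lower-case alphanumerics,
+    # '-' and '.'), presumably because license_id is embedded in the deployment
+    # name 'weather-dl-v2-license-dep-{license_id}'.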
+ LICENSE_REGEX = re.compile( + r"[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*" + ) + if not bool(LICENSE_REGEX.fullmatch(license_id)): + logger.error( + """Invalid format for license_id. License id must consist of lower case alphanumeric""" + """ characters, '-' or '.', and must start and end with an alphanumeric character""" + ) + raise HTTPException( + status_code=400, + detail="""Invalid format for license_id. License id must consist of lower case alphanumeric""" + """ characters, '-' or '.', and must start and end with an alphanumeric character""", + ) + + if await license_handler._check_license_exists(license_id): + logger.error(f"License with license_id {license_id} already exist.") + raise HTTPException( + status_code=409, + detail=f"License with license_id {license_id} already exist.", + ) + + license_dict = license.dict() + license_dict["k8s_deployment_id"] = "" + license_id = await license_handler._add_license(license_dict) + await queue_handler._create_license_queue(license_id, license_dict["client_name"]) + background_tasks.add_task(create_deployment, license_id, license_handler) + return {"license_id": license_id, "message": "License added successfully."} + + +# Remove license +@router.delete("/{license_id}") +async def delete_license( + license_id: str, + background_tasks: BackgroundTasks = BackgroundTasks(), + license_handler: LicenseHandler = Depends(get_license_handler), + queue_handler: QueueHandler = Depends(get_queue_handler), + terminate_license_deployment=Depends(get_terminate_license_deployment), +): + if not await license_handler._check_license_exists(license_id): + logger.error(f"No such license {license_id} to delete.") + raise HTTPException( + status_code=404, detail=f"No such license {license_id} to delete." + ) + await license_handler._delete_license(license_id) + await queue_handler._remove_license_queue(license_id) + background_tasks.add_task(terminate_license_deployment, license_id) + return {"license_id": license_id, "message": "License removed successfully."} + + +# TODO: Add route to re-deploy license deployments. diff --git a/weather_dl_v2/fastapi-server/routers/queues.py b/weather_dl_v2/fastapi-server/routers/queues.py new file mode 100644 index 00000000..eda6a7c5 --- /dev/null +++ b/weather_dl_v2/fastapi-server/routers/queues.py @@ -0,0 +1,124 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging + +from fastapi import APIRouter, HTTPException, Depends +from database.queue_handler import QueueHandler, get_queue_handler +from database.license_handler import LicenseHandler, get_license_handler +from database.download_handler import DownloadHandler, get_download_handler + +logger = logging.getLogger(__name__) + +router = APIRouter( + prefix="/queues", + tags=["queues"], + responses={404: {"description": "Not found"}}, +) + + +# Users can change the execution order of config per license basis. 
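+# Each queue document mirrors its license:
+#   {"license_id": ..., "client_name": ..., "queue": [config_name, ...]}
+# where the order of 'queue' is the download priority.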
+# List the licenses priority + {client_name} filter +@router.get("/") +async def get_all_license_queue( + client_name: str | None = None, + queue_handler: QueueHandler = Depends(get_queue_handler), +): + if client_name: + result = await queue_handler._get_queue_by_client_name(client_name) + else: + result = await queue_handler._get_queues() + return result + + +# Get particular license priority +@router.get("/{license_id}") +async def get_license_queue( + license_id: str, queue_handler: QueueHandler = Depends(get_queue_handler) +): + result = await queue_handler._get_queue_by_license_id(license_id) + if not result: + logger.error(f"License priority for {license_id} not found.") + raise HTTPException( + status_code=404, detail=f"License priority for {license_id} not found." + ) + return result + + +# Change priority queue of particular license +@router.post("/{license_id}") +async def modify_license_queue( + license_id: str, + priority_list: list | None = [], + queue_handler: QueueHandler = Depends(get_queue_handler), + license_handler: LicenseHandler = Depends(get_license_handler), + download_handler: DownloadHandler = Depends(get_download_handler), +): + if not await license_handler._check_license_exists(license_id): + logger.error(f"License {license_id} not found.") + raise HTTPException(status_code=404, detail=f"License {license_id} not found.") + + for config_name in priority_list: + config = await download_handler._get_download_by_config_name(config_name) + if config is None: + logger.error(f"Download config {config_name} not found in weather-dl v2.") + raise HTTPException( + status_code=404, + detail=f"Download config {config_name} not found in weather-dl v2.", + ) + try: + await queue_handler._update_license_queue(license_id, priority_list) + return {"message": f"'{license_id}' license priority updated successfully."} + except Exception as e: + logger.error(f"Failed to update '{license_id}' license priority due to {e}.") + raise HTTPException( + status_code=404, detail=f"Failed to update '{license_id}' license priority." + ) + + +# Change config's priority in particular license +@router.put("/priority/{license_id}") +async def modify_config_priority_in_license( + license_id: str, + config_name: str, + priority: int, + queue_handler: QueueHandler = Depends(get_queue_handler), + license_handler: LicenseHandler = Depends(get_license_handler), + download_handler: DownloadHandler = Depends(get_download_handler), +): + if not await license_handler._check_license_exists(license_id): + logger.error(f"License {license_id} not found.") + raise HTTPException(status_code=404, detail=f"License {license_id} not found.") + + config = await download_handler._get_download_by_config_name(config_name) + if config is None: + logger.error(f"Download config {config_name} not found in weather-dl v2.") + raise HTTPException( + status_code=404, + detail=f"Download config {config_name} not found in weather-dl v2.", + ) + + try: + await queue_handler._update_config_priority_in_license( + license_id, config_name, priority + ) + return { + "message": f"'{license_id}' license -- '{config_name}' priority updated successfully." + } + except Exception as e: + logger.error(f"Failed to update '{license_id}' license priority due to {e}.") + raise HTTPException( + status_code=404, detail=f"Failed to update '{license_id}' license priority." 
+ ) diff --git a/weather_dl_v2/fastapi-server/server.yaml b/weather_dl_v2/fastapi-server/server.yaml new file mode 100644 index 00000000..b8a2f40d --- /dev/null +++ b/weather_dl_v2/fastapi-server/server.yaml @@ -0,0 +1,93 @@ +# Due to our org level policy we can't expose external-ip. +# In case your project don't have any such restriction a +# then no need to create a nginx-server on VM to access this fastapi server +# instead create the LoadBalancer Service given below. +# +# # weather-dl server LoadBalancer Service +# # Enables the pods in a deployment to be accessible from outside the cluster +# apiVersion: v1 +# kind: Service +# metadata: +# name: weather-dl-v2-server-service +# spec: +# selector: +# app: weather-dl-v2-server-api +# ports: +# - protocol: "TCP" +# port: 8080 +# targetPort: 8080 +# type: LoadBalancer + +--- +# weather-dl-server-api Deployment +# Defines the deployment of the app running in a pod on any worker node +apiVersion: apps/v1 +kind: Deployment +metadata: + name: weather-dl-v2-server-api + labels: + app: weather-dl-v2-server-api +spec: + replicas: 1 + selector: + matchLabels: + app: weather-dl-v2-server-api + template: + metadata: + labels: + app: weather-dl-v2-server-api + spec: + containers: + - name: weather-dl-v2-server-api + image: XXXXXXX + ports: + - containerPort: 8080 + imagePullPolicy: Always + volumeMounts: + - name: config-volume + mountPath: ./config + volumes: + - name: config-volume + configMap: + name: dl-v2-config + # resources: + # # You must specify requests for CPU to autoscale + # # based on CPU utilization + # requests: + # cpu: "250m" +--- +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: weather-dl-v2-server-api +rules: + - apiGroups: + - "" + - "apps" + - "batch" + resources: + - endpoints + - deployments + - pods + - jobs + verbs: + - get + - list + - watch + - create + - delete +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: weather-dl-v2-server-api + namespace: default +subjects: + - kind: ServiceAccount + name: default + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: weather-dl-v2-server-api +--- \ No newline at end of file diff --git a/weather_dl_v2/fastapi-server/server_config.py b/weather_dl_v2/fastapi-server/server_config.py new file mode 100644 index 00000000..4ca8c21b --- /dev/null +++ b/weather_dl_v2/fastapi-server/server_config.py @@ -0,0 +1,72 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
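+
+# ServerConfig is a dataclass populated from config/config.json (or the file named by
+# the CONFIG_PATH environment variable); unrecognized keys are collected into 'kwargs'.
+# get_config() caches the parsed config in a module-level singleton.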
+ + +import dataclasses +import typing as t +import json +import os +import logging + +logger = logging.getLogger(__name__) + +Values = t.Union[t.List["Values"], t.Dict[str, "Values"], bool, int, float, str] # pytype: disable=not-supported-yet + + +@dataclasses.dataclass +class ServerConfig: + download_collection: str = "" + queues_collection: str = "" + license_collection: str = "" + manifest_collection: str = "" + storage_bucket: str = "" + gcs_project: str = "" + license_deployment_image: str = "" + welcome_message: str = "" + kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict) + + @classmethod + def from_dict(cls, config: t.Dict): + config_instance = cls() + + for key, value in config.items(): + if hasattr(config_instance, key): + setattr(config_instance, key, value) + else: + config_instance.kwargs[key] = value + + return config_instance + + +server_config = None + + +def get_config(): + global server_config + if server_config: + return server_config + + server_config_json = "config/config.json" + if not os.path.exists(server_config_json): + server_config_json = os.environ.get("CONFIG_PATH", None) + + if server_config_json is None: + logger.error("Couldn't load config file for fastAPI server.") + raise FileNotFoundError("Couldn't load config file for fastAPI server.") + + with open(server_config_json) as file: + config_dict = json.load(file) + server_config = ServerConfig.from_dict(config_dict) + + return server_config diff --git a/weather_dl_v2/fastapi-server/tests/__init__.py b/weather_dl_v2/fastapi-server/tests/__init__.py new file mode 100644 index 00000000..5678014c --- /dev/null +++ b/weather_dl_v2/fastapi-server/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/weather_dl_v2/fastapi-server/tests/integration/__init__.py b/weather_dl_v2/fastapi-server/tests/integration/__init__.py new file mode 100644 index 00000000..5678014c --- /dev/null +++ b/weather_dl_v2/fastapi-server/tests/integration/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
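For reference, the config/config.json that server_config.py reads (mounted into the pod from the
dl-v2-config ConfigMap in server.yaml above) might look roughly like the following sketch; every
value here is an illustrative placeholder, not a default shipped with weather-dl-v2:

    {
        "download_collection": "download",
        "queues_collection": "queues",
        "license_collection": "license",
        "manifest_collection": "manifest",
        "storage_bucket": "my-weather-dl-v2-bucket",
        "gcs_project": "my-gcp-project",
        "license_deployment_image": "gcr.io/my-gcp-project/weather-tools:weather-dl-v2-license-dep",
        "welcome_message": "Greetings from weather-dl v2!"
    }

Any key that does not match a ServerConfig field is collected into kwargs by ServerConfig.from_dict.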
diff --git a/weather_dl_v2/fastapi-server/tests/integration/test_download.py b/weather_dl_v2/fastapi-server/tests/integration/test_download.py new file mode 100644 index 00000000..fc707d10 --- /dev/null +++ b/weather_dl_v2/fastapi-server/tests/integration/test_download.py @@ -0,0 +1,175 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +import os +from fastapi.testclient import TestClient +from main import app, ROOT_DIR +from database.download_handler import get_download_handler, get_mock_download_handler +from database.license_handler import get_license_handler, get_mock_license_handler +from database.queue_handler import get_queue_handler, get_mock_queue_handler +from routers.download import get_upload, get_upload_mock, get_fetch_config_stats, get_fetch_config_stats_mock + +client = TestClient(app) + +logger = logging.getLogger(__name__) + +app.dependency_overrides[get_download_handler] = get_mock_download_handler +app.dependency_overrides[get_license_handler] = get_mock_license_handler +app.dependency_overrides[get_queue_handler] = get_mock_queue_handler +app.dependency_overrides[get_upload] = get_upload_mock +app.dependency_overrides[get_fetch_config_stats] = get_fetch_config_stats_mock + + +def _get_download(headers, query, code, expected): + response = client.get("/download", headers=headers, params=query) + + assert response.status_code == code + assert response.json() == expected + + +def test_get_downloads_basic(): + headers = {} + query = {} + code = 200 + expected = [{ + "config_name": "example.cfg", + "client_name": "client", + "downloaded_shards": 0, + "scheduled_shards": 0, + "failed_shards": 0, + "in-progress_shards": 0, + "total_shards": 0, + }] + + _get_download(headers, query, code, expected) + + +def _submit_download(headers, file_path, licenses, code, expected): + file = None + try: + file = {"file": open(file_path, "rb")} + except FileNotFoundError: + logger.info("file not found.") + + payload = {"licenses": licenses} + + response = client.post("/download", headers=headers, files=file, data=payload) + + logger.info(f"resp {response.json()}") + + assert response.status_code == code + assert response.json() == expected + + +def test_submit_download_basic(): + header = { + "accept": "application/json", + } + file_path = os.path.join(ROOT_DIR, "tests/test_data/not_exist.cfg") + licenses = ["L1"] + code = 200 + expected = { + "message": f"file 'not_exist.cfg' saved at '{os.getcwd()}/tests/test_data/not_exist.cfg' " + "successfully." 
+ } + + _submit_download(header, file_path, licenses, code, expected) + + +def test_submit_download_file_not_uploaded(): + header = { + "accept": "application/json", + } + file_path = os.path.join(ROOT_DIR, "tests/test_data/wrong_file.cfg") + licenses = ["L1"] + code = 404 + expected = {"detail": "No upload file sent."} + + _submit_download(header, file_path, licenses, code, expected) + + +def test_submit_download_file_alreadys_exist(): + header = { + "accept": "application/json", + } + file_path = os.path.join(ROOT_DIR, "tests/test_data/example.cfg") + licenses = ["L1"] + code = 400 + expected = { + "detail": "Please stop the ongoing download of the config file 'example.cfg' before attempting to start a new download." # noqa: E501 + } + + _submit_download(header, file_path, licenses, code, expected) + + +def _get_download_by_config(headers, config_name, code, expected): + response = client.get(f"/download/{config_name}", headers=headers) + + assert response.status_code == code + assert response.json() == expected + + +def test_get_download_by_config_basic(): + headers = {} + config_name = "example.cfg" + code = 200 + expected = { + "config_name": config_name, + "client_name": "client", + "downloaded_shards": 0, + "scheduled_shards": 0, + "failed_shards": 0, + "in-progress_shards": 0, + "total_shards": 0, + } + + _get_download_by_config(headers, config_name, code, expected) + + +def test_get_download_by_config_wrong_config(): + headers = {} + config_name = "not_exist" + code = 404 + expected = {"detail": "Download config not_exist not found in weather-dl v2."} + + _get_download_by_config(headers, config_name, code, expected) + + +def _delete_download_by_config(headers, config_name, code, expected): + response = client.delete(f"/download/{config_name}", headers=headers) + assert response.status_code == code + assert response.json() == expected + + +def test_delete_download_by_config_basic(): + headers = {} + config_name = "dummy_config" + code = 200 + expected = { + "config_name": "dummy_config", + "message": "Download config stopped & removed successfully.", + } + + _delete_download_by_config(headers, config_name, code, expected) + + +def test_delete_download_by_config_wrong_config(): + headers = {} + config_name = "not_exist" + code = 404 + expected = {"detail": "No such download config not_exist to stop & remove."} + + _delete_download_by_config(headers, config_name, code, expected) diff --git a/weather_dl_v2/fastapi-server/tests/integration/test_license.py b/weather_dl_v2/fastapi-server/tests/integration/test_license.py new file mode 100644 index 00000000..f4a5dea7 --- /dev/null +++ b/weather_dl_v2/fastapi-server/tests/integration/test_license.py @@ -0,0 +1,207 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
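+
+"""Integration tests for the /license endpoints of the FastAPI server.
+
+The database handlers and deployment hooks are replaced with their mock
+counterparts via app.dependency_overrides below.
+"""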
+ + +import logging +import json +from fastapi.testclient import TestClient +from main import app +from database.download_handler import get_download_handler, get_mock_download_handler +from database.license_handler import get_license_handler, get_mock_license_handler +from routers.license import ( + get_create_deployment, + get_create_deployment_mock, + get_terminate_license_deployment, + get_terminate_license_deployment_mock, +) +from database.queue_handler import get_queue_handler, get_mock_queue_handler + +client = TestClient(app) + +logger = logging.getLogger(__name__) + +app.dependency_overrides[get_download_handler] = get_mock_download_handler +app.dependency_overrides[get_license_handler] = get_mock_license_handler +app.dependency_overrides[get_queue_handler] = get_mock_queue_handler +app.dependency_overrides[get_create_deployment] = get_create_deployment_mock +app.dependency_overrides[ + get_terminate_license_deployment +] = get_terminate_license_deployment_mock + + +def _get_license(headers, query, code, expected): + response = client.get("/license", headers=headers, params=query) + + assert response.status_code == code + assert response.json() == expected + + +def test_get_license_basic(): + headers = {} + query = {} + code = 200 + expected = [{ + "license_id": "L1", + "secret_id": "xxxx", + "client_name": "dummy_client", + "k8s_deployment_id": "k1", + "number_of_requets": 100, + }] + + _get_license(headers, query, code, expected) + + +def test_get_license_client_name(): + headers = {} + client_name = "dummy_client" + query = {"client_name": client_name} + code = 200 + expected = [{ + "license_id": "L1", + "secret_id": "xxxx", + "client_name": client_name, + "k8s_deployment_id": "k1", + "number_of_requets": 100, + }] + + _get_license(headers, query, code, expected) + + +def _add_license(headers, payload, code, expected): + response = client.post( + "/license", + headers=headers, + data=json.dumps(payload), + params={"license_id": "L1"}, + ) + + print(f"test add license {response.json()}") + + assert response.status_code == code + assert response.json() == expected + + +def test_add_license_basic(): + headers = {"accept": "application/json", "Content-Type": "application/json"} + license = { + "license_id": "no-exists", + "client_name": "dummy_client", + "number_of_requests": 0, + "secret_id": "xxxx", + } + payload = license + code = 200 + expected = {"license_id": "L1", "message": "License added successfully."} + + _add_license(headers, payload, code, expected) + + +def _get_license_by_license_id(headers, license_id, code, expected): + response = client.get(f"/license/{license_id}", headers=headers) + + logger.info(f"response {response.json()}") + assert response.status_code == code + assert response.json() == expected + + +def test_get_license_by_license_id(): + headers = {"accept": "application/json", "Content-Type": "application/json"} + license_id = "L1" + code = 200 + expected = { + "license_id": license_id, + "secret_id": "xxxx", + "client_name": "dummy_client", + "k8s_deployment_id": "k1", + "number_of_requets": 100, + } + + _get_license_by_license_id(headers, license_id, code, expected) + + +def test_get_license_wrong_license(): + headers = {} + license_id = "not_exist" + code = 404 + expected = { + "detail": "License not_exist not found.", + } + + _get_license_by_license_id(headers, license_id, code, expected) + + +def _update_license(headers, license_id, license, code, expected): + response = client.put( + f"/license/{license_id}", headers=headers, 
data=json.dumps(license) + ) + + print(f"_update license {response.json()}") + + assert response.status_code == code + assert response.json() == expected + + +def test_update_license_basic(): + headers = {} + license_id = "L1" + license = { + "license_id": "L1", + "client_name": "dummy_client", + "number_of_requests": 0, + "secret_id": "xxxx", + } + code = 200 + expected = {"license_id": license_id, "name": "License updated successfully."} + + _update_license(headers, license_id, license, code, expected) + + +def test_update_license_wrong_license_id(): + headers = {} + license_id = "no-exists" + license = { + "license_id": "no-exists", + "client_name": "dummy_client", + "number_of_requests": 0, + "secret_id": "xxxx", + } + code = 404 + expected = {"detail": "No such license no-exists to update."} + + _update_license(headers, license_id, license, code, expected) + + +def _delete_license(headers, license_id, code, expected): + response = client.delete(f"/license/{license_id}", headers=headers) + + assert response.status_code == code + assert response.json() == expected + + +def test_delete_license_basic(): + headers = {} + license_id = "L1" + code = 200 + expected = {"license_id": license_id, "message": "License removed successfully."} + + _delete_license(headers, license_id, code, expected) + + +def test_delete_license_wrong_license(): + headers = {} + license_id = "not_exist" + code = 404 + expected = {"detail": "No such license not_exist to delete."} + + _delete_license(headers, license_id, code, expected) diff --git a/weather_dl_v2/fastapi-server/tests/integration/test_queues.py b/weather_dl_v2/fastapi-server/tests/integration/test_queues.py new file mode 100644 index 00000000..5fa7855a --- /dev/null +++ b/weather_dl_v2/fastapi-server/tests/integration/test_queues.py @@ -0,0 +1,148 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
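+
+"""Integration tests for the /queues endpoints of the FastAPI server.
+
+The download, license, and queue handlers are replaced with their mock
+counterparts via app.dependency_overrides below.
+"""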
+ + +import logging +from main import app +from fastapi.testclient import TestClient +from database.download_handler import get_download_handler, get_mock_download_handler +from database.license_handler import get_license_handler, get_mock_license_handler +from database.queue_handler import get_queue_handler, get_mock_queue_handler + +client = TestClient(app) + +logger = logging.getLogger(__name__) + +app.dependency_overrides[get_download_handler] = get_mock_download_handler +app.dependency_overrides[get_license_handler] = get_mock_license_handler +app.dependency_overrides[get_queue_handler] = get_mock_queue_handler + + +def _get_all_queue(headers, query, code, expected): + response = client.get("/queues", headers=headers, params=query) + + assert response.status_code == code + assert response.json() == expected + + +def test_get_all_queues(): + headers = {} + query = {} + code = 200 + expected = [{"client_name": "dummy_client", "license_id": "L1", "queue": []}] + + _get_all_queue(headers, query, code, expected) + + +def test_get_client_queues(): + headers = {} + client_name = "dummy_client" + query = {"client_name": client_name} + code = 200 + expected = [{"client_name": client_name, "license_id": "L1", "queue": []}] + + _get_all_queue(headers, query, code, expected) + + +def _get_queue_by_license(headers, license_id, code, expected): + response = client.get(f"/queues/{license_id}", headers=headers) + + assert response.status_code == code + assert response.json() == expected + + +def test_get_queue_by_license_basic(): + headers = {} + license_id = "L1" + code = 200 + expected = {"client_name": "dummy_client", "license_id": license_id, "queue": []} + + _get_queue_by_license(headers, license_id, code, expected) + + +def test_get_queue_by_license_wrong_license(): + headers = {} + license_id = "not_exist" + code = 404 + expected = {"detail": 'License priority for not_exist not found.'} + + _get_queue_by_license(headers, license_id, code, expected) + + +def _modify_license_queue(headers, license_id, priority_list, code, expected): + response = client.post(f"/queues/{license_id}", headers=headers, data=priority_list) + + assert response.status_code == code + assert response.json() == expected + + +def test_modify_license_queue_basic(): + headers = {} + license_id = "L1" + priority_list = [] + code = 200 + expected = {"message": f"'{license_id}' license priority updated successfully."} + + _modify_license_queue(headers, license_id, priority_list, code, expected) + + +def test_modify_license_queue_wrong_license_id(): + headers = {} + license_id = "not_exist" + priority_list = [] + code = 404 + expected = {"detail": 'License not_exist not found.'} + + _modify_license_queue(headers, license_id, priority_list, code, expected) + + +def _modify_config_priority_in_license(headers, license_id, query, code, expected): + response = client.put(f"/queues/priority/{license_id}", params=query) + + logger.info(f"response {response.json()}") + + assert response.status_code == code + assert response.json() == expected + + +def test_modify_config_priority_in_license_basic(): + headers = {} + license_id = "L1" + query = {"config_name": "example.cfg", "priority": 0} + code = 200 + expected = { + "message": f"'{license_id}' license -- 'example.cfg' priority updated successfully." 
+ } + + _modify_config_priority_in_license(headers, license_id, query, code, expected) + + +def test_modify_config_priority_in_license_wrong_license(): + headers = {} + license_id = "not_exist" + query = {"config_name": "example.cfg", "priority": 0} + code = 404 + expected = {"detail": 'License not_exist not found.'} + + _modify_config_priority_in_license(headers, license_id, query, code, expected) + + +def test_modify_config_priority_in_license_wrong_config(): + headers = {} + license_id = "not_exist" + query = {"config_name": "wrong.cfg", "priority": 0} + code = 404 + expected = {"detail": 'License not_exist not found.'} + + _modify_config_priority_in_license(headers, license_id, query, code, expected) diff --git a/weather_dl_v2/fastapi-server/tests/test_data/example.cfg b/weather_dl_v2/fastapi-server/tests/test_data/example.cfg new file mode 100644 index 00000000..6747012c --- /dev/null +++ b/weather_dl_v2/fastapi-server/tests/test_data/example.cfg @@ -0,0 +1,32 @@ +[parameters] +client=mars + +target_path=gs:///test-weather-dl-v2/{date}T00z.gb +partition_keys= + date + # step + +# API Keys & Subsections go here... + +[selection] +class=od +type=pf +stream=enfo +expver=0001 +levtype=pl +levelist=100 +# params: +# (z) Geopotential 129, (t) Temperature 130, +# (u) U component of wind 131, (v) V component of wind 132, +# (q) Specific humidity 133, (w) vertical velocity 135, +# (vo) Vorticity (relative) 138, (d) Divergence 155, +# (r) Relative humidity 157 +param=129.128 +# +# next: 2019-01-01/to/existing +# +date=2019-07-18/to/2019-07-20 +time=0000 +step=0/to/2 +number=1/to/2 +grid=F640 diff --git a/weather_dl_v2/fastapi-server/tests/test_data/not_exist.cfg b/weather_dl_v2/fastapi-server/tests/test_data/not_exist.cfg new file mode 100644 index 00000000..6747012c --- /dev/null +++ b/weather_dl_v2/fastapi-server/tests/test_data/not_exist.cfg @@ -0,0 +1,32 @@ +[parameters] +client=mars + +target_path=gs:///test-weather-dl-v2/{date}T00z.gb +partition_keys= + date + # step + +# API Keys & Subsections go here... + +[selection] +class=od +type=pf +stream=enfo +expver=0001 +levtype=pl +levelist=100 +# params: +# (z) Geopotential 129, (t) Temperature 130, +# (u) U component of wind 131, (v) V component of wind 132, +# (q) Specific humidity 133, (w) vertical velocity 135, +# (vo) Vorticity (relative) 138, (d) Divergence 155, +# (r) Relative humidity 157 +param=129.128 +# +# next: 2019-01-01/to/existing +# +date=2019-07-18/to/2019-07-20 +time=0000 +step=0/to/2 +number=1/to/2 +grid=F640 diff --git a/weather_dl_v2/license_deployment/Dockerfile b/weather_dl_v2/license_deployment/Dockerfile new file mode 100644 index 00000000..68388f78 --- /dev/null +++ b/weather_dl_v2/license_deployment/Dockerfile @@ -0,0 +1,34 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
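+
+# Image for the weather-dl-v2 license deployment: a miniconda base with the
+# weather-dl-v2-license-dep conda environment, running fetch.py as the entrypoint.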
+ + +FROM continuumio/miniconda3:latest + +# Update miniconda +RUN conda update conda -y + +# Add the mamba solver for faster builds +RUN conda install -n base conda-libmamba-solver +RUN conda config --set solver libmamba + +COPY . . +# Create conda env using environment.yml +RUN conda env create -f environment.yml --debug + +# Activate the conda env and update the PATH +ARG CONDA_ENV_NAME=weather-dl-v2-license-dep +RUN echo "source activate ${CONDA_ENV_NAME}" >> ~/.bashrc +ENV PATH /opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH + +ENTRYPOINT ["python", "-u", "fetch.py"] diff --git a/weather_dl_v2/license_deployment/README.md b/weather_dl_v2/license_deployment/README.md new file mode 100644 index 00000000..4c5cc6a1 --- /dev/null +++ b/weather_dl_v2/license_deployment/README.md @@ -0,0 +1,21 @@ +# Deployment Instructions & General Notes + +### How to create environment +``` +conda env create --name weather-dl-v2-license-dep --file=environment.yml + +conda activate weather-dl-v2-license-dep +``` + +### Make changes in weather_dl_v2/config.json, if required [for running locally] +``` +export CONFIG_PATH=/path/to/weather_dl_v2/config.json +``` + +### Create docker image for license deployment +``` +export PROJECT_ID= +export REPO= eg:weather-tools + +gcloud builds submit . --tag "gcr.io/$PROJECT_ID/$REPO:weather-dl-v2-license-dep" --timeout=79200 --machine-type=e2-highcpu-32 +``` \ No newline at end of file diff --git a/weather_dl_v2/license_deployment/__init__.py b/weather_dl_v2/license_deployment/__init__.py new file mode 100644 index 00000000..5678014c --- /dev/null +++ b/weather_dl_v2/license_deployment/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/weather_dl_v2/license_deployment/clients.py b/weather_dl_v2/license_deployment/clients.py new file mode 100644 index 00000000..331888ea --- /dev/null +++ b/weather_dl_v2/license_deployment/clients.py @@ -0,0 +1,417 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +"""ECMWF Downloader Clients.""" + +import abc +import collections +import contextlib +import datetime +import io +import logging +import os +import time +import typing as t +import warnings +from urllib.parse import urljoin + +from cdsapi import api as cds_api +import urllib3 +from ecmwfapi import api + +from config import optimize_selection_partition +from manifest import Manifest, Stage +from util import download_with_aria2, retry_with_exponential_backoff + +warnings.simplefilter("ignore", category=urllib3.connectionpool.InsecureRequestWarning) + + +class Client(abc.ABC): + """Weather data provider client interface. + + Defines methods and properties required to efficiently interact with weather + data providers. + + Attributes: + config: A config that contains pipeline parameters, such as API keys. + level: Default log level for the client. + """ + + def __init__(self, dataset: str, level: int = logging.INFO) -> None: + """Clients are initialized with the general CLI configuration.""" + self.dataset = dataset + self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}") + self.logger.setLevel(level) + + @abc.abstractmethod + def retrieve( + self, dataset: str, selection: t.Dict, output: str, manifest: Manifest + ) -> None: + """Download from data source.""" + pass + + @classmethod + @abc.abstractmethod + def num_requests_per_key(cls, dataset: str) -> int: + """Specifies the number of workers to be used per api key for the dataset.""" + pass + + @property + @abc.abstractmethod + def license_url(self): + """Specifies the License URL.""" + pass + + +class SplitCDSRequest(cds_api.Client): + """Extended CDS class that separates fetch and download stage.""" + + @retry_with_exponential_backoff + def _download(self, url, path: str, size: int) -> None: + self.info("Downloading %s to %s (%s)", url, path, cds_api.bytes_to_string(size)) + start = time.time() + + download_with_aria2(url, path) + + elapsed = time.time() - start + if elapsed: + self.info("Download rate %s/s", cds_api.bytes_to_string(size / elapsed)) + + def fetch(self, request: t.Dict, dataset: str) -> t.Dict: + result = self.retrieve(dataset, request) + return {"href": result.location, "size": result.content_length} + + def download(self, result: cds_api.Result, target: t.Optional[str] = None) -> None: + if target: + if os.path.exists(target): + # Empty the target file, if it already exists, otherwise the + # transfer below might be fooled into thinking we're resuming + # an interrupted download. + open(target, "w").close() + + self._download(result["href"], target, result["size"]) + + +class CdsClient(Client): + """A client to access weather data from the Cloud Data Store (CDS). + + Datasets on CDS can be found at: + https://cds.climate.copernicus.eu/cdsapp#!/search?type=dataset + + The parameters section of the input `config` requires two values: `api_url` and + `api_key`. Or, these values can be set as the environment variables: `CDSAPI_URL` + and `CDSAPI_KEY`. These can be acquired from the following URL, which requires + creating a free account: https://cds.climate.copernicus.eu/api-how-to + + The CDS global queues for data access has dynamic rate limits. These can be viewed + live here: https://cds.climate.copernicus.eu/live/limits. + + Attributes: + config: A config that contains pipeline parameters, such as API keys. + level: Default log level for the client. 
+ """ + + """Name patterns of datasets that are hosted internally on CDS servers.""" + cds_hosted_datasets = {"reanalysis-era"} + + def retrieve(self, dataset: str, selection: t.Dict, manifest: Manifest) -> None: + c = CDSClientExtended( + url=os.environ.get("CLIENT_URL"), + key=os.environ.get("CLIENT_KEY"), + debug_callback=self.logger.debug, + info_callback=self.logger.info, + warning_callback=self.logger.warning, + error_callback=self.logger.error, + ) + selection_ = optimize_selection_partition(selection) + with StdoutLogger(self.logger, level=logging.DEBUG): + manifest.set_stage(Stage.FETCH) + precise_fetch_start_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + manifest.prev_stage_precise_start_time = precise_fetch_start_time + result = c.fetch(selection_, dataset) + return result + + @property + def license_url(self): + return "https://cds.climate.copernicus.eu/api/v2/terms/static/licence-to-use-copernicus-products.pdf" + + @classmethod + def num_requests_per_key(cls, dataset: str) -> int: + """Number of requests per key from the CDS API. + + CDS has dynamic, data-specific limits, defined here: + https://cds.climate.copernicus.eu/live/limits + + Typically, the reanalysis dataset allows for 3-5 simultaneous requets. + For all standard CDS data (backed on disk drives), it's common that 2 + requests are allowed, though this is dynamically set, too. + + If the Beam pipeline encounters a user request limit error, please cancel + all outstanding requests (per each user account) at the following link: + https://cds.climate.copernicus.eu/cdsapp#!/yourrequests + """ + # TODO(#15): Parse live CDS limits API to set data-specific limits. + for internal_set in cls.cds_hosted_datasets: + if dataset.startswith(internal_set): + return 5 + return 2 + + +class StdoutLogger(io.StringIO): + """Special logger to redirect stdout to logs.""" + + def __init__(self, logger_: logging.Logger, level: int = logging.INFO): + super().__init__() + self.logger = logger_ + self.level = level + self._redirector = contextlib.redirect_stdout(self) + + def log(self, msg) -> None: + self.logger.log(self.level, msg) + + def write(self, msg): + if msg and not msg.isspace(): + self.logger.log(self.level, msg) + + def __enter__(self): + self._redirector.__enter__() + return self + + def __exit__(self, exc_type, exc_value, traceback): + # let contextlib do any exception handling here + self._redirector.__exit__(exc_type, exc_value, traceback) + + +class SplitMARSRequest(api.APIRequest): + """Extended MARS APIRequest class that separates fetch and download stage.""" + + @retry_with_exponential_backoff + def _download(self, url, path: str, size: int) -> None: + self.log("Transferring %s into %s" % (self._bytename(size), path)) + self.log("From %s" % (url,)) + + download_with_aria2(url, path) + + def fetch(self, request: t.Dict, dataset: str) -> t.Dict: + status = None + + self.connection.submit("%s/%s/requests" % (self.url, self.service), request) + self.log("Request submitted") + self.log("Request id: " + self.connection.last.get("name")) + if self.connection.status != status: + status = self.connection.status + self.log("Request is %s" % (status,)) + + while not self.connection.ready(): + if self.connection.status != status: + status = self.connection.status + self.log("Request is %s" % (status,)) + self.connection.wait() + + if self.connection.status != status: + status = self.connection.status + self.log("Request is %s" % (status,)) + + result = 
self.connection.result()
+        return result
+
+    def download(self, result: t.Dict, target: t.Optional[str] = None) -> None:
+        if target:
+            if os.path.exists(target):
+                # Empty the target file, if it already exists, otherwise the
+                # transfer below might be fooled into thinking we're resuming
+                # an interrupted download.
+                open(target, "w").close()
+
+        self._download(urljoin(self.url, result["href"]), target, result["size"])
+        self.connection.cleanup()
+
+
+class SplitRequestMixin:
+    c = None
+
+    def fetch(self, req: t.Dict, dataset: t.Optional[str] = None) -> t.Dict:
+        return self.c.fetch(req, dataset)
+
+    def download(self, res: t.Dict, target: str) -> None:
+        self.c.download(res, target)
+
+
+class CDSClientExtended(SplitRequestMixin):
+    """Extended CDS Client class that separates fetch and download stage."""
+
+    def __init__(self, *args, **kwargs):
+        self.c = SplitCDSRequest(*args, **kwargs)
+
+
+class MARSECMWFServiceExtended(api.ECMWFService, SplitRequestMixin):
+    """Extended MARS ECMWFService class that separates fetch and download stage."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.c = SplitMARSRequest(
+            self.url,
+            "services/%s" % (self.service,),
+            email=self.email,
+            key=self.key,
+            log=self.log,
+            verbose=self.verbose,
+            quiet=self.quiet,
+        )
+
+
+class PublicECMWFServerExtended(api.ECMWFDataServer, SplitRequestMixin):
+
+    def __init__(self, *args, dataset="", **kwargs):
+        super().__init__(*args, **kwargs)
+        self.c = SplitMARSRequest(
+            self.url,
+            "datasets/%s" % (dataset,),
+            email=self.email,
+            key=self.key,
+            log=self.log,
+            verbose=self.verbose,
+        )
+
+
+class MarsClient(Client):
+    """A client to access data from the Meteorological Archival and Retrieval System (MARS).
+
+    See https://www.ecmwf.int/en/forecasts/datasets for a summary of datasets available
+    on MARS. Most notably, MARS provides access to ECMWF's Operational Archive
+    https://www.ecmwf.int/en/forecasts/dataset/operational-archive.
+
+    The client config must contain three parameters to authenticate access to the MARS archive:
+    `api_key`, `api_url`, and `api_email`. These can also be configured by setting the
+    corresponding environment variables: `MARSAPI_KEY`, `MARSAPI_URL`, and `MARSAPI_EMAIL`.
+    These credentials can be looked up after registering for an ECMWF account
+    (https://apps.ecmwf.int/registration/) and visiting: https://api.ecmwf.int/v1/key/.
+
+    MARS server activity can be observed at https://apps.ecmwf.int/mars-activity/.
+
+    Attributes:
+        config: A config that contains pipeline parameters, such as API keys.
+        level: Default log level for the client.
+    """
+
+    def retrieve(self, dataset: str, selection: t.Dict, manifest: Manifest) -> None:
+        c = MARSECMWFServiceExtended(
+            "mars",
+            key=os.environ.get("CLIENT_KEY"),
+            url=os.environ.get("CLIENT_URL"),
+            email=os.environ.get("CLIENT_EMAIL"),
+            log=self.logger.debug,
+            verbose=True,
+        )
+        selection_ = optimize_selection_partition(selection)
+        with StdoutLogger(self.logger, level=logging.DEBUG):
+            manifest.set_stage(Stage.FETCH)
+            precise_fetch_start_time = (
+                datetime.datetime.utcnow()
+                .replace(tzinfo=datetime.timezone.utc)
+                .isoformat(timespec="seconds")
+            )
+            manifest.prev_stage_precise_start_time = precise_fetch_start_time
+            result = c.fetch(req=selection_)
+        return result
+
+    @property
+    def license_url(self):
+        return "https://apps.ecmwf.int/datasets/licences/general/"
+
+    @classmethod
+    def num_requests_per_key(cls, dataset: str) -> int:
+        """Number of requests per key (or user) for the Mars API.
+ + Mars allows 2 active requests per user and 20 queued requests per user, as of Sept 27, 2021. + To ensure we never hit a rate limit error during download, we only make use of the active + requests. + See: https://confluence.ecmwf.int/display/UDOC/Total+number+of+requests+a+user+can+submit+-+Web+API+FAQ + + Queued requests can _only_ be canceled manually from a web dashboard. If the + `ERROR 101 (USER_QUEUED_LIMIT_EXCEEDED)` error occurs in the Beam pipeline, then go to + http://apps.ecmwf.int/webmars/joblist/ and cancel queued jobs. + """ + return 2 + + +class ECMWFPublicClient(Client): + """A client for ECMWF's public datasets, like TIGGE.""" + + def retrieve(self, dataset: str, selection: t.Dict, manifest: Manifest) -> None: + c = PublicECMWFServerExtended( + url=os.environ.get("CLIENT_URL"), + key=os.environ.get("CLIENT_KEY"), + email=os.environ.get("CLIENT_EMAIL"), + log=self.logger.debug, + verbose=True, + dataset=dataset, + ) + selection_ = optimize_selection_partition(selection) + with StdoutLogger(self.logger, level=logging.DEBUG): + manifest.set_stage(Stage.FETCH) + precise_fetch_start_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + manifest.prev_stage_precise_start_time = precise_fetch_start_time + result = c.fetch(req=selection_) + return result + + @classmethod + def num_requests_per_key(cls, dataset: str) -> int: + # Experimentally validated request limit. + return 5 + + @property + def license_url(self): + if not self.dataset: + raise ValueError("must specify a dataset for this client!") + return f"https://apps.ecmwf.int/datasets/data/{self.dataset.lower()}/licence/" + + +class FakeClient(Client): + """A client that writes the selection arguments to the output file.""" + + def retrieve(self, dataset: str, selection: t.Dict, manifest: Manifest) -> None: + manifest.set_stage(Stage.RETRIEVE) + precise_retrieve_start_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + manifest.prev_stage_precise_start_time = precise_retrieve_start_time + self.logger.debug(f"Downloading {dataset}.") + + @property + def license_url(self): + return "lorem ipsum" + + @classmethod + def num_requests_per_key(cls, dataset: str) -> int: + return 1 + + +CLIENTS = collections.OrderedDict( + cds=CdsClient, + mars=MarsClient, + ecpublic=ECMWFPublicClient, + fake=FakeClient, +) diff --git a/weather_dl_v2/license_deployment/config.py b/weather_dl_v2/license_deployment/config.py new file mode 100644 index 00000000..fe2199b8 --- /dev/null +++ b/weather_dl_v2/license_deployment/config.py @@ -0,0 +1,120 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import calendar +import copy +import dataclasses +import typing as t + +Values = t.Union[t.List["Values"], t.Dict[str, "Values"], bool, int, float, str] # pytype: disable=not-supported-yet + + +@dataclasses.dataclass +class Config: + """Contains pipeline parameters. 
+ + Attributes: + config_name: + Name of the config file. + client: + Name of the Weather-API-client. Supported clients are mentioned in the 'CLIENTS' variable. + dataset (optional): + Name of the target dataset. Allowed options are dictated by the client. + partition_keys (optional): + Choose the keys from the selection section to partition the data request. + This will compute a cartesian cross product of the selected keys + and assign each as their own download. + target_path: + Download artifact filename template. Can make use of Python's standard string formatting. + It can contain format symbols to be replaced by partition keys; + if this is used, the total number of format symbols must match the number of partition keys. + subsection_name: + Name of the particular subsection. 'default' if there is no subsection. + force_download: + Force redownload of partitions that were previously downloaded. + user_id: + Username from the environment variables. + kwargs (optional): + For representing subsections or any other parameters. + selection: + Contains parameters used to select desired data. + """ + + config_name: str = "" + client: str = "" + dataset: t.Optional[str] = "" + target_path: str = "" + partition_keys: t.Optional[t.List[str]] = dataclasses.field(default_factory=list) + subsection_name: str = "default" + force_download: bool = False + user_id: str = "unknown" + kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict) + selection: t.Dict[str, Values] = dataclasses.field(default_factory=dict) + + @classmethod + def from_dict(cls, config: t.Dict) -> "Config": + config_instance = cls() + for section_key, section_value in config.items(): + if section_key == "parameters": + for key, value in section_value.items(): + if hasattr(config_instance, key): + setattr(config_instance, key, value) + else: + config_instance.kwargs[key] = value + if section_key == "selection": + config_instance.selection = section_value + return config_instance + + +def optimize_selection_partition(selection: t.Dict) -> t.Dict: + """Compute right-hand-side values for the selection section of a single partition. + + Used to support custom syntax and optimizations, such as 'all'. + """ + selection_ = copy.deepcopy(selection) + + if "day" in selection_.keys() and selection_["day"] == "all": + year, month = selection_["year"], selection_["month"] + + multiples_error = ( + "Cannot use keyword 'all' on selections with multiple '{type}'s." 
+ ) + + if isinstance(year, list): + assert len(year) == 1, multiples_error.format(type="year") + year = year[0] + + if isinstance(month, list): + assert len(month) == 1, multiples_error.format(type="month") + month = month[0] + + if isinstance(year, str): + assert "/" not in year, multiples_error.format(type="year") + + if isinstance(month, str): + assert "/" not in month, multiples_error.format(type="month") + + year, month = int(year), int(month) + + _, n_days_in_month = calendar.monthrange(year, month) + + selection_[ + "date" + ] = f"{year:04d}-{month:02d}-01/to/{year:04d}-{month:02d}-{n_days_in_month:02d}" + del selection_["day"] + del selection_["month"] + del selection_["year"] + + return selection_ diff --git a/weather_dl_v2/license_deployment/database.py b/weather_dl_v2/license_deployment/database.py new file mode 100644 index 00000000..23c0f064 --- /dev/null +++ b/weather_dl_v2/license_deployment/database.py @@ -0,0 +1,161 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import abc +import time +import logging +import firebase_admin +from firebase_admin import firestore +from firebase_admin import credentials +from google.cloud.firestore_v1 import DocumentSnapshot, DocumentReference +from google.cloud.firestore_v1.types import WriteResult +from google.cloud.firestore_v1.base_query import FieldFilter, And +from util import get_wait_interval +from deployment_config import get_config + +logger = logging.getLogger(__name__) + + +class Database(abc.ABC): + + @abc.abstractmethod + def _get_db(self): + pass + + +class CRUDOperations(abc.ABC): + + @abc.abstractmethod + def _initialize_license_deployment(self, license_id: str) -> dict: + pass + + @abc.abstractmethod + def _get_config_from_queue_by_license_id(self, license_id: str) -> dict: + pass + + @abc.abstractmethod + def _remove_config_from_license_queue( + self, license_id: str, config_name: str + ) -> None: + pass + + @abc.abstractmethod + def _get_partition_from_manifest(self, config_name: str) -> str: + pass + + +class FirestoreClient(Database, CRUDOperations): + + def _get_db(self) -> firestore.firestore.Client: + """Acquire a firestore client, initializing the firebase app if necessary. + Will attempt to get the db client five times. If it's still unsuccessful, a + `ManifestException` will be raised. + """ + db = None + attempts = 0 + + while db is None: + try: + db = firestore.client() + except ValueError as e: + # The above call will fail with a value error when the firebase app is not initialized. + # Initialize the app here, and try again. + # Use the application default credentials. + cred = credentials.ApplicationDefault() + + firebase_admin.initialize_app(cred) + logger.info("Initialized Firebase App.") + + if attempts > 4: + raise RuntimeError( + "Exceeded number of retries to get firestore client." 
+ ) from e + + time.sleep(get_wait_interval(attempts)) + + attempts += 1 + + return db + + def _initialize_license_deployment(self, license_id: str) -> dict: + result: DocumentSnapshot = ( + self._get_db() + .collection(get_config().license_collection) + .document(license_id) + .get() + ) + return result.to_dict() + + def _get_config_from_queue_by_license_id(self, license_id: str) -> str | None: + result: DocumentSnapshot = ( + self._get_db() + .collection(get_config().queues_collection) + .document(license_id) + .get(["queue"]) + ) + if result.exists: + queue = result.to_dict()["queue"] + if len(queue) > 0: + return queue[0] + return None + + def _get_partition_from_manifest(self, config_name: str) -> str | None: + transaction = self._get_db().transaction() + return get_partition_from_manifest(transaction, config_name) + + def _remove_config_from_license_queue( + self, license_id: str, config_name: str + ) -> None: + result: WriteResult = ( + self._get_db() + .collection(get_config().queues_collection) + .document(license_id) + .update({"queue": firestore.ArrayRemove([config_name])}) + ) + logger.info( + f"Updated {license_id} queue in 'queues' collection. Update_time: {result.update_time}." + ) + + +# TODO: Firestore transcational fails after reading a document 20 times with roll over. +# This happens when too many licenses try to access the same partition document. +# Find some alternative approach to handle this. +@firestore.transactional +def get_partition_from_manifest(transaction, config_name: str) -> str | None: + db_client = FirestoreClient() + filter_1 = FieldFilter("config_name", "==", config_name) + filter_2 = FieldFilter("status", "==", "scheduled") + and_filter = And(filters=[filter_1, filter_2]) + + snapshot = ( + db_client._get_db() + .collection(get_config().manifest_collection) + .where(filter=and_filter) + .limit(1) + .get(transaction=transaction) + ) + if len(snapshot) > 0: + snapshot = snapshot[0] + else: + return None + + ref: DocumentReference = ( + db_client._get_db() + .collection(get_config().manifest_collection) + .document(snapshot.id) + ) + transaction.update(ref, {"status": "processing"}) + + return snapshot.to_dict() diff --git a/weather_dl_v2/license_deployment/deployment_config.py b/weather_dl_v2/license_deployment/deployment_config.py new file mode 100644 index 00000000..8ae162ea --- /dev/null +++ b/weather_dl_v2/license_deployment/deployment_config.py @@ -0,0 +1,69 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
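+
+"""Configuration loader for the license deployment.
+
+Reads config/config.json (or the file pointed to by the CONFIG_PATH environment
+variable) into a DeploymentConfig dataclass, including the Firestore collection
+names and the downloader image used for spawned download jobs.
+"""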
+ + +import dataclasses +import typing as t +import json +import os +import logging + +logger = logging.getLogger(__name__) + +Values = t.Union[t.List["Values"], t.Dict[str, "Values"], bool, int, float, str] # pytype: disable=not-supported-yet + + +@dataclasses.dataclass +class DeploymentConfig: + download_collection: str = "" + queues_collection: str = "" + license_collection: str = "" + manifest_collection: str = "" + downloader_k8_image: str = "" + kwargs: t.Optional[t.Dict[str, Values]] = dataclasses.field(default_factory=dict) + + @classmethod + def from_dict(cls, config: t.Dict): + config_instance = cls() + + for key, value in config.items(): + if hasattr(config_instance, key): + setattr(config_instance, key, value) + else: + config_instance.kwargs[key] = value + + return config_instance + + +deployment_config = None + + +def get_config(): + global deployment_config + if deployment_config: + return deployment_config + + deployment_config_json = "config/config.json" + if not os.path.exists(deployment_config_json): + deployment_config_json = os.environ.get("CONFIG_PATH", None) + + if deployment_config_json is None: + logger.error("Couldn't load config file for license deployment.") + raise FileNotFoundError("Couldn't load config file for license deployment.") + + with open(deployment_config_json) as file: + config_dict = json.load(file) + deployment_config = DeploymentConfig.from_dict(config_dict) + + return deployment_config diff --git a/weather_dl_v2/license_deployment/downloader.yaml b/weather_dl_v2/license_deployment/downloader.yaml new file mode 100644 index 00000000..361c2b36 --- /dev/null +++ b/weather_dl_v2/license_deployment/downloader.yaml @@ -0,0 +1,33 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: downloader-with-ttl +spec: + ttlSecondsAfterFinished: 0 + template: + spec: + nodeSelector: + cloud.google.com/gke-nodepool: downloader-pool + containers: + - name: downloader + image: XXXXXXX + imagePullPolicy: Always + command: [] + resources: + requests: + cpu: "1000m" # CPU: 1 vCPU + memory: "2Gi" # RAM: 2 GiB + ephemeral-storage: "100Gi" # Storage: 100 GiB + volumeMounts: + - name: data + mountPath: /data + - name: config-volume + mountPath: ./config + restartPolicy: Never + volumes: + - name: data + emptyDir: + sizeLimit: 100Gi + - name: config-volume + configMap: + name: dl-v2-config \ No newline at end of file diff --git a/weather_dl_v2/license_deployment/environment.yml b/weather_dl_v2/license_deployment/environment.yml new file mode 100644 index 00000000..4848fafd --- /dev/null +++ b/weather_dl_v2/license_deployment/environment.yml @@ -0,0 +1,17 @@ +name: weather-dl-v2-license-dep +channels: + - conda-forge +dependencies: + - python=3.10 + - geojson + - cdsapi=0.5.1 + - ecmwf-api-client=1.6.3 + - pip=22.3 + - pip: + - kubernetes + - google-cloud-secret-manager + - aiohttp + - numpy + - xarray + - apache-beam[gcp] + - firebase-admin diff --git a/weather_dl_v2/license_deployment/fetch.py b/weather_dl_v2/license_deployment/fetch.py new file mode 100644 index 00000000..63adb33a --- /dev/null +++ b/weather_dl_v2/license_deployment/fetch.py @@ -0,0 +1,139 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from concurrent.futures import ThreadPoolExecutor +from google.cloud import secretmanager +import json +import logging +import time +import sys +import os + +from database import FirestoreClient +from job_creator import create_download_job +from clients import CLIENTS +from manifest import FirestoreManifest +from util import exceptionit + +db_client = FirestoreClient() +secretmanager_client = secretmanager.SecretManagerServiceClient() + + +def create_job(request, result): + res = { + "config_name": request["config_name"], + "dataset": request["dataset"], + "selection": json.loads(request["selection"]), + "user_id": request["username"], + "url": result["href"], + "target_path": request["location"], + "license_id": license_id, + } + + data_str = json.dumps(res) + logger.info(f"Creating download job for res: {data_str}") + create_download_job(data_str) + + +@exceptionit +def make_fetch_request(request): + client = CLIENTS[client_name](request["dataset"]) + manifest = FirestoreManifest(license_id=license_id) + logger.info( + f"By using {client_name} datasets, " + f"users agree to the terms and conditions specified in {client.license_url!r}" + ) + + target = request["location"] + selection = json.loads(request["selection"]) + + logger.info(f"Fetching data for {target!r}.") + with manifest.transact( + request["config_name"], + request["dataset"], + selection, + target, + request["username"], + ): + result = client.retrieve(request["dataset"], selection, manifest) + + create_job(request, result) + + +def fetch_request_from_db(): + request = None + config_name = db_client._get_config_from_queue_by_license_id(license_id) + if config_name: + try: + logger.info(f"Fetching partition for {config_name}.") + request = db_client._get_partition_from_manifest(config_name) + if not request: + db_client._remove_config_from_license_queue(license_id, config_name) + except Exception as e: + logger.error( + f"Error in fetch_request_from_db for {config_name}. error: {e}." + ) + return request + + +def main(): + logger.info("Started looking at the request.") + with ThreadPoolExecutor(concurrency_limit) as executor: + while True: + # Fetch a request from the database + request = fetch_request_from_db() + + if request is not None: + executor.submit(make_fetch_request, request) + else: + logger.info("No request available. 
Waiting...") + time.sleep(5) + + # Check if the maximum concurrency level has been reached + # If so, wait for a slot to become available + while executor._work_queue.qsize() >= concurrency_limit: + time.sleep(1) + + +def boot_up(license: str) -> None: + global license_id, client_name, concurrency_limit + + result = db_client._initialize_license_deployment(license) + license_id = license + client_name = result["client_name"] + concurrency_limit = result["number_of_requests"] + + response = secretmanager_client.access_secret_version( + request={"name": result["secret_id"]} + ) + payload = response.payload.data.decode("UTF-8") + secret_dict = json.loads(payload) + + os.environ.setdefault("CLIENT_URL", secret_dict.get("api_url", "")) + os.environ.setdefault("CLIENT_KEY", secret_dict.get("api_key", "")) + os.environ.setdefault("CLIENT_EMAIL", secret_dict.get("api_email", "")) + + +if __name__ == "__main__": + license = sys.argv[2] + global logger + logging.basicConfig( + level=logging.INFO, format=f"[{license}] %(levelname)s - %(message)s" + ) + logger = logging.getLogger(__name__) + + logger.info(f"Deployment for license: {license}.") + boot_up(license) + main() diff --git a/weather_dl_v2/license_deployment/job_creator.py b/weather_dl_v2/license_deployment/job_creator.py new file mode 100644 index 00000000..f0acd802 --- /dev/null +++ b/weather_dl_v2/license_deployment/job_creator.py @@ -0,0 +1,58 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from os import path +import yaml +import json +import uuid +from kubernetes import client, config +from deployment_config import get_config + + +def create_download_job(message): + """Creates a kubernetes workflow of type Job for downloading the data.""" + parsed_message = json.loads(message) + ( + config_name, + dataset, + selection, + user_id, + url, + target_path, + license_id, + ) = parsed_message.values() + selection = str(selection).replace(" ", "") + config.load_config() + + with open(path.join(path.dirname(__file__), "downloader.yaml")) as f: + dep = yaml.safe_load(f) + uid = uuid.uuid4() + dep["metadata"]["name"] = f"downloader-job-id-{uid}" + dep["spec"]["template"]["spec"]["containers"][0]["command"] = [ + "python", + "downloader.py", + config_name, + dataset, + selection, + user_id, + url, + target_path, + license_id, + ] + dep["spec"]["template"]["spec"]["containers"][0][ + "image" + ] = get_config().downloader_k8_image + batch_api = client.BatchV1Api() + batch_api.create_namespaced_job(body=dep, namespace="default") diff --git a/weather_dl_v2/license_deployment/manifest.py b/weather_dl_v2/license_deployment/manifest.py new file mode 100644 index 00000000..1b5355d8 --- /dev/null +++ b/weather_dl_v2/license_deployment/manifest.py @@ -0,0 +1,520 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Client interface for connecting to a manifest."""
+
+import abc
+import logging
+import dataclasses
+import datetime
+import enum
+import json
+import pandas as pd
+import time
+import traceback
+import typing as t
+
+from util import (
+    to_json_serializable_type,
+    fetch_geo_polygon,
+    get_file_size,
+    get_wait_interval,
+    generate_md5_hash,
+    GLOBAL_COVERAGE_AREA,
+)
+
+import firebase_admin
+from firebase_admin import credentials
+from firebase_admin import firestore
+from google.cloud.firestore_v1 import DocumentReference
+from google.cloud.firestore_v1.types import WriteResult
+from deployment_config import get_config
+from database import Database
+
+logger = logging.getLogger(__name__)
+
+"""An implementation-dependent Manifest URI."""
+Location = t.NewType("Location", str)
+
+
+class ManifestException(Exception):
+    """Errors that occur in Manifest Clients."""
+
+    pass
+
+
+class Stage(enum.Enum):
+    """A request can be in only one of the following stages at a time:
+
+    fetch : The request is currently in the fetch stage, i.e. it has been placed on the client's server
+        and is waiting for a result before the download can start (e.g. the MARS client).
+    download : The request is currently in the download stage, i.e. data is being downloaded from the client's
+        server to the worker's local file system.
+    upload : The request is currently in the upload stage, i.e. data is being uploaded from the worker's local
+        file system to the target location (a GCS path).
+    retrieve : For clients with no clear separation between the fetch & download stages (e.g. the CDS client),
+        the request stays in the retrieve stage, i.e. fetch + download combined.
+    """
+
+    RETRIEVE = "retrieve"
+    FETCH = "fetch"
+    DOWNLOAD = "download"
+    UPLOAD = "upload"
+
+
+class Status(enum.Enum):
+    """Depicts the status of a request:
+
+    scheduled : A request partition has been created & scheduled for processing.
+        Note: its corresponding stage can only be None.
+    processing : The request has been picked up by a license deployment.
+    in-progress : The request is currently being processed (i.e. running).
+        The next status will be "success" or "failure".
+    success : The request finished successfully without any error.
+    failure : The request failed.
+ """ + + PROCESSING = "processing" + SCHEDULED = "scheduled" + IN_PROGRESS = "in-progress" + SUCCESS = "success" + FAILURE = "failure" + + +@dataclasses.dataclass +class DownloadStatus: + """Data recorded in `Manifest`s reflecting the status of a download.""" + + """The name of the config file associated with the request.""" + config_name: str = "" + + """Represents the dataset field of the configuration.""" + dataset: t.Optional[str] = "" + + """Copy of selection section of the configuration.""" + selection: t.Dict = dataclasses.field(default_factory=dict) + + """Location of the downloaded data.""" + location: str = "" + + """Represents area covered by the shard.""" + area: str = "" + + """Current stage of request : 'fetch', 'download', 'retrieve', 'upload' or None.""" + stage: t.Optional[Stage] = None + + """Download status: 'scheduled', 'in-progress', 'success', or 'failure'.""" + status: t.Optional[Status] = None + + """Cause of error, if any.""" + error: t.Optional[str] = "" + + """Identifier for the user running the download.""" + username: str = "" + + """Shard size in GB.""" + size: t.Optional[float] = 0 + + """A UTC datetime when download was scheduled.""" + scheduled_time: t.Optional[str] = "" + + """A UTC datetime when the retrieve stage starts.""" + retrieve_start_time: t.Optional[str] = "" + + """A UTC datetime when the retrieve state ends.""" + retrieve_end_time: t.Optional[str] = "" + + """A UTC datetime when the fetch state starts.""" + fetch_start_time: t.Optional[str] = "" + + """A UTC datetime when the fetch state ends.""" + fetch_end_time: t.Optional[str] = "" + + """A UTC datetime when the download state starts.""" + download_start_time: t.Optional[str] = "" + + """A UTC datetime when the download state ends.""" + download_end_time: t.Optional[str] = "" + + """A UTC datetime when the upload state starts.""" + upload_start_time: t.Optional[str] = "" + + """A UTC datetime when the upload state ends.""" + upload_end_time: t.Optional[str] = "" + + @classmethod + def from_dict(cls, download_status: t.Dict) -> "DownloadStatus": + """Instantiate DownloadStatus dataclass from dict.""" + download_status_instance = cls() + for key, value in download_status.items(): + if key == "status": + setattr(download_status_instance, key, Status(value)) + elif key == "stage" and value is not None: + setattr(download_status_instance, key, Stage(value)) + else: + setattr(download_status_instance, key, value) + return download_status_instance + + @classmethod + def to_dict(cls, instance) -> t.Dict: + """Return the fields of a dataclass instance as a manifest ingestible + dictionary mapping of field names to field values.""" + download_status_dict = {} + for field in dataclasses.fields(instance): + key = field.name + value = getattr(instance, field.name) + if isinstance(value, Status) or isinstance(value, Stage): + download_status_dict[key] = value.value + elif isinstance(value, pd.Timestamp): + download_status_dict[key] = value.isoformat() + elif key == "selection" and value is not None: + download_status_dict[key] = json.dumps(value) + else: + download_status_dict[key] = value + return download_status_dict + + +@dataclasses.dataclass +class Manifest(abc.ABC): + """Abstract manifest of download statuses. + + Update download statuses to some storage medium. + + This class lets one indicate that a download is `scheduled` or in a transaction process. + In the event of a transaction, a download will be updated with an `in-progress`, `success` + or `failure` status (with accompanying metadata). 
+ + Example: + ``` + my_manifest = parse_manifest_location(Location('fs://some-firestore-collection')) + + # Schedule data for download + my_manifest.schedule({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username') + + # ... + + # Initiate a transaction – it will record that the download is `in-progess` + with my_manifest.transact({'some': 'metadata'}, 'path/to/downloaded/file', 'my-username') as tx: + # download logic here + pass + + # ... + + # on error, will record the download as a `failure` before propagating the error. By default, it will + # record download as a `success`. + ``` + + Attributes: + status: The current `DownloadStatus` of the Manifest. + """ + + # To reduce the impact of _read() and _update() calls + # on the start time of the stage. + license_id: str = "" + prev_stage_precise_start_time: t.Optional[str] = None + status: t.Optional[DownloadStatus] = None + + # This is overridden in subclass. + def __post_init__(self): + """Initialize the manifest.""" + pass + + def schedule( + self, + config_name: str, + dataset: str, + selection: t.Dict, + location: str, + user: str, + ) -> None: + """Indicate that a job has been scheduled for download. + + 'scheduled' jobs occur before 'in-progress', 'success' or 'finished'. + """ + scheduled_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + self.status = DownloadStatus( + config_name=config_name, + dataset=dataset if dataset else None, + selection=selection, + location=location, + area=fetch_geo_polygon(selection.get("area", GLOBAL_COVERAGE_AREA)), + username=user, + stage=None, + status=Status.SCHEDULED, + error=None, + size=None, + scheduled_time=scheduled_time, + retrieve_start_time=None, + retrieve_end_time=None, + fetch_start_time=None, + fetch_end_time=None, + download_start_time=None, + download_end_time=None, + upload_start_time=None, + upload_end_time=None, + ) + self._update(self.status) + + def skip( + self, + config_name: str, + dataset: str, + selection: t.Dict, + location: str, + user: str, + ) -> None: + """Updates the manifest to mark the shards that were skipped in the current job + as 'upload' stage and 'success' status, indicating that they have already been downloaded. + """ + old_status = self._read(location) + # The manifest needs to be updated for a skipped shard if its entry is not present, or + # if the stage is not 'upload', or if the stage is 'upload' but the status is not 'success'. + if ( + old_status.location != location + or old_status.stage != Stage.UPLOAD + or old_status.status != Status.SUCCESS + ): + current_utc_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + + size = get_file_size(location) + + status = DownloadStatus( + config_name=config_name, + dataset=dataset if dataset else None, + selection=selection, + location=location, + area=fetch_geo_polygon(selection.get("area", GLOBAL_COVERAGE_AREA)), + username=user, + stage=Stage.UPLOAD, + status=Status.SUCCESS, + error=None, + size=size, + scheduled_time=None, + retrieve_start_time=None, + retrieve_end_time=None, + fetch_start_time=None, + fetch_end_time=None, + download_start_time=None, + download_end_time=None, + upload_start_time=current_utc_time, + upload_end_time=current_utc_time, + ) + self._update(status) + logger.info( + f"Manifest updated for skipped shard: {location!r} -- {DownloadStatus.to_dict(status)!r}." 
+ ) + + def _set_for_transaction( + self, + config_name: str, + dataset: str, + selection: t.Dict, + location: str, + user: str, + ) -> None: + """Reset Manifest state in preparation for a new transaction.""" + self.status = dataclasses.replace(self._read(location)) + self.status.config_name = config_name + self.status.dataset = dataset if dataset else None + self.status.selection = selection + self.status.location = location + self.status.username = user + + def __enter__(self) -> None: + pass + + def __exit__(self, exc_type, exc_inst, exc_tb) -> None: + """Record end status of a transaction as either 'success' or 'failure'.""" + if exc_type is None: + status = Status.SUCCESS + error = None + else: + status = Status.FAILURE + # For explanation, see https://docs.python.org/3/library/traceback.html#traceback.format_exception + error = f"license_id: {self.license_id} " + error += "\n".join(traceback.format_exception(exc_type, exc_inst, exc_tb)) + + new_status = dataclasses.replace(self.status) + new_status.error = error + new_status.status = status + current_utc_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + + # This is necessary for setting the precise start time of the previous stage + # and end time of the final stage, as well as handling the case of Status.FAILURE. + if new_status.stage == Stage.FETCH: + new_status.fetch_start_time = self.prev_stage_precise_start_time + new_status.fetch_end_time = current_utc_time + elif new_status.stage == Stage.RETRIEVE: + new_status.retrieve_start_time = self.prev_stage_precise_start_time + new_status.retrieve_end_time = current_utc_time + elif new_status.stage == Stage.DOWNLOAD: + new_status.download_start_time = self.prev_stage_precise_start_time + new_status.download_end_time = current_utc_time + else: + new_status.upload_start_time = self.prev_stage_precise_start_time + new_status.upload_end_time = current_utc_time + + new_status.size = get_file_size(new_status.location) + + self.status = new_status + + self._update(self.status) + + def transact( + self, + config_name: str, + dataset: str, + selection: t.Dict, + location: str, + user: str, + ) -> "Manifest": + """Create a download transaction.""" + self._set_for_transaction(config_name, dataset, selection, location, user) + return self + + def set_stage(self, stage: Stage) -> None: + """Sets the current stage in manifest.""" + prev_stage = self.status.stage + new_status = dataclasses.replace(self.status) + new_status.stage = stage + new_status.status = Status.IN_PROGRESS + current_utc_time = ( + datetime.datetime.utcnow() + .replace(tzinfo=datetime.timezone.utc) + .isoformat(timespec="seconds") + ) + + if stage == Stage.FETCH: + new_status.fetch_start_time = current_utc_time + elif stage == Stage.RETRIEVE: + new_status.retrieve_start_time = current_utc_time + elif stage == Stage.DOWNLOAD: + new_status.fetch_start_time = self.prev_stage_precise_start_time + new_status.fetch_end_time = current_utc_time + new_status.download_start_time = current_utc_time + else: + if prev_stage == Stage.DOWNLOAD: + new_status.download_start_time = self.prev_stage_precise_start_time + new_status.download_end_time = current_utc_time + else: + new_status.retrieve_start_time = self.prev_stage_precise_start_time + new_status.retrieve_end_time = current_utc_time + new_status.upload_start_time = current_utc_time + + self.status = new_status + self._update(self.status) + + @abc.abstractmethod + def _read(self, location: str) -> DownloadStatus: + pass + + 
@abc.abstractmethod + def _update(self, download_status: DownloadStatus) -> None: + pass + + +class FirestoreManifest(Manifest, Database): + """A Firestore Manifest. + This Manifest implementation stores DownloadStatuses in a Firebase document store. + The document hierarchy for the manifest is as follows: + [manifest ] + ├── doc_id (md5 hash of the path) { 'selection': {...}, 'location': ..., 'username': ... } + └── etc... + Where `[]` indicates a collection and ` {...}` indicates a document. + """ + + def _get_db(self) -> firestore.firestore.Client: + """Acquire a firestore client, initializing the firebase app if necessary. + Will attempt to get the db client five times. If it's still unsuccessful, a + `ManifestException` will be raised. + """ + db = None + attempts = 0 + + while db is None: + try: + db = firestore.client() + except ValueError as e: + # The above call will fail with a value error when the firebase app is not initialized. + # Initialize the app here, and try again. + # Use the application default credentials. + cred = credentials.ApplicationDefault() + + firebase_admin.initialize_app(cred) + logger.info("Initialized Firebase App.") + + if attempts > 4: + raise ManifestException( + "Exceeded number of retries to get firestore client." + ) from e + + time.sleep(get_wait_interval(attempts)) + + attempts += 1 + + return db + + def _read(self, location: str) -> DownloadStatus: + """Reads the JSON data from a manifest.""" + + doc_id = generate_md5_hash(location) + + # Update document with download status + download_doc_ref = self.root_document_for_store(doc_id) + + result = download_doc_ref.get() + row = {} + if result.exists: + records = result.to_dict() + row = {n: to_json_serializable_type(v) for n, v in records.items()} + return DownloadStatus.from_dict(row) + + def _update(self, download_status: DownloadStatus) -> None: + """Update or create a download status record.""" + logger.info("Updating Firestore Manifest.") + + status = DownloadStatus.to_dict(download_status) + doc_id = generate_md5_hash(status["location"]) + + # Update document with download status + download_doc_ref = self.root_document_for_store(doc_id) + + result: WriteResult = download_doc_ref.set(status) + + logger.info( + f"Firestore manifest updated. " + f"update_time={result.update_time}, " + f"filename={download_status.location}." + ) + + def root_document_for_store(self, store_scheme: str) -> DocumentReference: + """Get the root manifest document given the user's config and current document's storage location.""" + return ( + self._get_db() + .collection(get_config().manifest_collection) + .document(store_scheme) + ) diff --git a/weather_dl_v2/license_deployment/util.py b/weather_dl_v2/license_deployment/util.py new file mode 100644 index 00000000..14b1f827 --- /dev/null +++ b/weather_dl_v2/license_deployment/util.py @@ -0,0 +1,239 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import datetime +import logging +import geojson +import hashlib +import itertools +import os +import socket +import subprocess +import sys +import typing as t + +import numpy as np +import pandas as pd +from apache_beam.io.gcp import gcsio +from apache_beam.utils import retry +from xarray.core.utils import ensure_us_time_resolution +from urllib.parse import urlparse +from google.api_core.exceptions import BadRequest + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +LATITUDE_RANGE = (-90, 90) +LONGITUDE_RANGE = (-180, 180) +GLOBAL_COVERAGE_AREA = [90, -180, -90, 180] + + +def exceptionit(func): + def inner_function(*args, **kwargs): + try: + func(*args, **kwargs) + except Exception as e: + logger.error(f"exception in {func.__name__} {e.__class__.__name__} {e}.") + + return inner_function + + +def _retry_if_valid_input_but_server_or_socket_error_and_timeout_filter( + exception, +) -> bool: + if isinstance(exception, socket.timeout): + return True + if isinstance(exception, TimeoutError): + return True + # To handle the concurrency issue in BigQuery. + if isinstance(exception, BadRequest): + return True + return retry.retry_if_valid_input_but_server_error_and_timeout_filter(exception) + + +class _FakeClock: + + def sleep(self, value): + pass + + +def retry_with_exponential_backoff(fun): + """A retry decorator that doesn't apply during test time.""" + clock = retry.Clock() + + # Use a fake clock only during test time... + if "unittest" in sys.modules.keys(): + clock = _FakeClock() + + return retry.with_exponential_backoff( + retry_filter=_retry_if_valid_input_but_server_or_socket_error_and_timeout_filter, + clock=clock, + )(fun) + + +# TODO(#245): Group with common utilities (duplicated) +def ichunked(iterable: t.Iterable, n: int) -> t.Iterator[t.Iterable]: + """Yield evenly-sized chunks from an iterable.""" + input_ = iter(iterable) + try: + while True: + it = itertools.islice(input_, n) + # peek to check if 'it' has next item. + first = next(it) + yield itertools.chain([first], it) + except StopIteration: + pass + + +# TODO(#245): Group with common utilities (duplicated) +def copy(src: str, dst: str) -> None: + """Copy data via `gsutil cp`.""" + try: + subprocess.run(["gsutil", "cp", src, dst], check=True, capture_output=True) + except subprocess.CalledProcessError as e: + logger.info( + f'Failed to copy file {src!r} to {dst!r} due to {e.stderr.decode("utf-8")}.' + ) + raise + + +# TODO(#245): Group with common utilities (duplicated) +def to_json_serializable_type(value: t.Any) -> t.Any: + """Returns the value with a type serializable to JSON""" + # Note: The order of processing is significant. + logger.info("Serializing to JSON.") + + if pd.isna(value) or value is None: + return None + elif np.issubdtype(type(value), np.floating): + return float(value) + elif isinstance(value, np.ndarray): + # Will return a scaler if array is of size 1, else will return a list. + return value.tolist() + elif ( + isinstance(value, datetime.datetime) + or isinstance(value, str) + or isinstance(value, np.datetime64) + ): + # Assume strings are ISO format timestamps... + try: + value = datetime.datetime.fromisoformat(value) + except ValueError: + # ... if they are not, assume serialization is already correct. + return value + except TypeError: + # ... maybe value is a numpy datetime ... + try: + value = ensure_us_time_resolution(value).astype(datetime.datetime) + except AttributeError: + # ... value is a datetime object, continue. 
+ pass + + # We use a string timestamp representation. + if value.tzname(): + return value.isoformat() + + # We assume here that naive timestamps are in UTC timezone. + return value.replace(tzinfo=datetime.timezone.utc).isoformat() + elif isinstance(value, np.timedelta64): + # Return time delta in seconds. + return float(value / np.timedelta64(1, "s")) + # This check must happen after processing np.timedelta64 and np.datetime64. + elif np.issubdtype(type(value), np.integer): + return int(value) + + return value + + +def fetch_geo_polygon(area: t.Union[list, str]) -> str: + """Calculates a geography polygon from an input area.""" + # Ref: https://confluence.ecmwf.int/pages/viewpage.action?pageId=151520973 + if isinstance(area, str): + # European area + if area == "E": + area = [73.5, -27, 33, 45] + # Global area + elif area == "G": + area = GLOBAL_COVERAGE_AREA + else: + raise RuntimeError(f"Not a valid value for area in config: {area}.") + + n, w, s, e = [float(x) for x in area] + if s < LATITUDE_RANGE[0]: + raise ValueError(f"Invalid latitude value for south: '{s}'") + if n > LATITUDE_RANGE[1]: + raise ValueError(f"Invalid latitude value for north: '{n}'") + if w < LONGITUDE_RANGE[0]: + raise ValueError(f"Invalid longitude value for west: '{w}'") + if e > LONGITUDE_RANGE[1]: + raise ValueError(f"Invalid longitude value for east: '{e}'") + + # Define the coordinates of the bounding box. + coords = [[w, n], [w, s], [e, s], [e, n], [w, n]] + + # Create the GeoJSON polygon object. + polygon = geojson.dumps(geojson.Polygon([coords])) + return polygon + + +def get_file_size(path: str) -> float: + parsed_gcs_path = urlparse(path) + if parsed_gcs_path.scheme != "gs" or parsed_gcs_path.netloc == "": + return os.stat(path).st_size / (1024**3) if os.path.exists(path) else 0 + else: + return ( + gcsio.GcsIO().size(path) / (1024**3) if gcsio.GcsIO().exists(path) else 0 + ) + + +def get_wait_interval(num_retries: int = 0) -> float: + """Returns next wait interval in seconds, using an exponential backoff algorithm.""" + if 0 == num_retries: + return 0 + return 2**num_retries + + +def generate_md5_hash(input: str) -> str: + """Generates md5 hash for the input string.""" + return hashlib.md5(input.encode("utf-8")).hexdigest() + + +def download_with_aria2(url: str, path: str) -> None: + """Downloads a file from the given URL using the `aria2c` command-line utility, + with options set to improve download speed and reliability.""" + dir_path, file_name = os.path.split(path) + try: + subprocess.run( + [ + "aria2c", + "-x", + "16", + "-s", + "16", + url, + "-d", + dir_path, + "-o", + file_name, + "--allow-overwrite", + ], + check=True, + capture_output=True, + ) + except subprocess.CalledProcessError as e: + logger.info( + f'Failed download from server {url!r} to {path!r} due to {e.stderr.decode("utf-8")}.' 
+ ) + raise From 59408aebb2716bf4bfc426f8b750d3ae907942f8 Mon Sep 17 00:00:00 2001 From: Darshan Prajapati <93967637+DarshanSP19@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:30:30 +0000 Subject: [PATCH 15/16] Fix CI/CD: Use libmamba while creating conda env (#416) * Use libmamba for conda * Use mamba version also --------- Co-authored-by: Darshan Prajapati --- .github/workflows/ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b8d0839f..b691aadb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -56,6 +56,9 @@ jobs: channels: conda-forge environment-file: ci${{ matrix.python-version}}.yml activate-environment: weather-tools + miniforge-variant: Mambaforge + miniforge-version: latest + use-mamba: true - name: Check MetView's installation shell: bash -l {0} run: python -m metview selfcheck @@ -116,6 +119,9 @@ jobs: channels: conda-forge environment-file: ci${{ matrix.python-version}}.yml activate-environment: weather-tools + miniforge-variant: Mambaforge + miniforge-version: latest + use-mamba: true - name: Install weather-tools[test] run: | conda run -n weather-tools pip install -e .[test] --use-deprecated=legacy-resolver From 00de1d56c871844ae7ea7b015dc0c67bfe408464 Mon Sep 17 00:00:00 2001 From: aniketinfocusp <122869307+aniketinfocusp@users.noreply.github.com> Date: Mon, 6 Nov 2023 19:57:45 +0530 Subject: [PATCH 16/16] Added error handling in license deployment. (#417) * Added error handling in license deployment. * lint fixes * nits * lint fixes --- weather_dl_v2/license_deployment/database.py | 15 ++++ weather_dl_v2/license_deployment/fetch.py | 77 ++++++++++++++++---- weather_dl_v2/license_deployment/manifest.py | 8 +- weather_dl_v2/license_deployment/util.py | 63 ++++++++++++++++ 4 files changed, 144 insertions(+), 19 deletions(-) diff --git a/weather_dl_v2/license_deployment/database.py b/weather_dl_v2/license_deployment/database.py index 23c0f064..24206561 100644 --- a/weather_dl_v2/license_deployment/database.py +++ b/weather_dl_v2/license_deployment/database.py @@ -51,6 +51,10 @@ def _remove_config_from_license_queue( ) -> None: pass + @abc.abstractmethod + def _empty_license_queue(self, license_id: str) -> None: + pass + @abc.abstractmethod def _get_partition_from_manifest(self, config_name: str) -> str: pass @@ -128,6 +132,17 @@ def _remove_config_from_license_queue( f"Updated {license_id} queue in 'queues' collection. Update_time: {result.update_time}." ) + def _empty_license_queue(self, license_id: str) -> None: + result: WriteResult = ( + self._get_db() + .collection(get_config().queues_collection) + .document(license_id) + .update({"queue": []}) + ) + logger.info( + f"Updated {license_id} queue in 'queues' collection. Update_time: {result.update_time}." + ) + # TODO: Firestore transcational fails after reading a document 20 times with roll over. # This happens when too many licenses try to access the same partition document. 
diff --git a/weather_dl_v2/license_deployment/fetch.py b/weather_dl_v2/license_deployment/fetch.py
index 63adb33a..3e69a56f 100644
--- a/weather_dl_v2/license_deployment/fetch.py
+++ b/weather_dl_v2/license_deployment/fetch.py
@@ -25,11 +25,11 @@
 from job_creator import create_download_job
 from clients import CLIENTS
 from manifest import FirestoreManifest
-from util import exceptionit
+from util import exceptionit, ThreadSafeDict
 
 db_client = FirestoreClient()
 secretmanager_client = secretmanager.SecretManagerServiceClient()
-
+CONFIG_MAX_ERROR_COUNT = 10
 
 def create_job(request, result):
     res = {
@@ -48,27 +48,64 @@
 
 
 @exceptionit
-def make_fetch_request(request):
+def make_fetch_request(request, error_map: ThreadSafeDict):
     client = CLIENTS[client_name](request["dataset"])
     manifest = FirestoreManifest(license_id=license_id)
     logger.info(
         f"By using {client_name} datasets, "
-        f"users agree to the terms and conditions specified in {client.license_url!r}"
+        f"users agree to the terms and conditions specified in {client.license_url!r}."
     )
 
     target = request["location"]
     selection = json.loads(request["selection"])
 
     logger.info(f"Fetching data for {target!r}.")
-    with manifest.transact(
-        request["config_name"],
-        request["dataset"],
-        selection,
-        target,
-        request["username"],
-    ):
-        result = client.retrieve(request["dataset"], selection, manifest)
+
+    config_name = request["config_name"]
+
+    if not error_map.has_key(config_name):
+        error_map[config_name] = 0
+
+    if error_map[config_name] >= CONFIG_MAX_ERROR_COUNT:
+        logger.info(f"Error count for config {config_name} exceeded CONFIG_MAX_ERROR_COUNT ({CONFIG_MAX_ERROR_COUNT}).")
+        error_map.remove(config_name)
+        logger.info(f"Removing config {config_name} from license queue.")
+        # Remove config from this license queue.
+        db_client._remove_config_from_license_queue(license_id=license_id, config_name=config_name)
+        return
+
+    # Wait for an exponentially growing delay based on the error count.
+    if error_map[config_name] > 0:
+        logger.info(f"Error count for config {config_name}: {error_map[config_name]}.")
+        sleep_seconds = error_map.exponential_time(config_name)
+        logger.info(f"Sleeping for {sleep_seconds} seconds.")
+        time.sleep(sleep_seconds)
+
+    try:
+        with manifest.transact(
+            request["config_name"],
+            request["dataset"],
+            selection,
+            target,
+            request["username"],
+        ):
+            result = client.retrieve(request["dataset"], selection, manifest)
+    except Exception as e:
+        # Handle this generically, since the CDS client raises generic exceptions.
+
+        # License expired.
+        if "Access token expired" in str(e):
+            logger.error(f"{license_id} expired. Emptying queue! error: {e}.")
+            db_client._empty_license_queue(license_id=license_id)
+            return
+
+        # Increment the error count for this config.
+        logger.error(f"Partition fetching failed. Error {e}.")
+        error_map.increment(config_name)
+        return
+
+    # If any partition is successful, reset the error count.
+    error_map[config_name] = 0
 
     create_job(request, result)
 
@@ -90,20 +127,28 @@ def fetch_request_from_db():
 
 def main():
     logger.info("Started looking at the request.")
+    error_map = ThreadSafeDict()
     with ThreadPoolExecutor(concurrency_limit) as executor:
+        # Disclaimer: A license will always pick concurrency_limit + 1
+        # partitions. One extra partition will be kept in the threadpool task queue.
+
         while True:
             # Fetch a request from the database
             request = fetch_request_from_db()
 
             if request is not None:
-                executor.submit(make_fetch_request, request)
+                executor.submit(make_fetch_request, request, error_map)
             else:
                 logger.info("No request available. Waiting...")
                 time.sleep(5)
 
-            # Check if the maximum concurrency level has been reached
-            # If so, wait for a slot to become available
-            while executor._work_queue.qsize() >= concurrency_limit:
+            # Each license should not pick more partitions than its
+            # concurrency_limit. We limit the threadpool queue size to just 1
+            # to prevent the license from picking up more partitions than
+            # its concurrency_limit. When a worker is freed up, the queued
+            # task is picked and the license fetches another request.
+            while executor._work_queue.qsize() >= 1:
+                logger.info("Worker busy. Waiting...")
                 time.sleep(1)
 
 
diff --git a/weather_dl_v2/license_deployment/manifest.py b/weather_dl_v2/license_deployment/manifest.py
index 1b5355d8..3119de9e 100644
--- a/weather_dl_v2/license_deployment/manifest.py
+++ b/weather_dl_v2/license_deployment/manifest.py
@@ -500,14 +500,16 @@ def _update(self, download_status: DownloadStatus) -> None:
         status = DownloadStatus.to_dict(download_status)
         doc_id = generate_md5_hash(status["location"])
 
-        # Update document with download status
+        # Update document with download status.
         download_doc_ref = self.root_document_for_store(doc_id)
 
         result: WriteResult = download_doc_ref.set(status)
 
         logger.info(
-            f"Firestore manifest updated. "
-            f"update_time={result.update_time}, "
+            "Firestore manifest updated. "
+            + f"update_time={result.update_time}, "
+            + f"status={status['status']} "
+            + f"stage={status['stage']} "
             f"filename={download_status.location}."
         )
 
diff --git a/weather_dl_v2/license_deployment/util.py b/weather_dl_v2/license_deployment/util.py
index 14b1f827..d24a1405 100644
--- a/weather_dl_v2/license_deployment/util.py
+++ b/weather_dl_v2/license_deployment/util.py
@@ -31,6 +31,7 @@
 from xarray.core.utils import ensure_us_time_resolution
 from urllib.parse import urlparse
 from google.api_core.exceptions import BadRequest
+from threading import Lock
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -237,3 +238,65 @@
             f'Failed download from server {url!r} to {path!r} due to {e.stderr.decode("utf-8")}.'
         )
         raise
+
+class ThreadSafeDict:
+    """A thread safe dict with CRUD operations."""
+
+
+    def __init__(self) -> None:
+        self._dict = {}
+        self._lock = Lock()
+        self.initial_delay = 1
+        self.factor = 0.5
+
+
+    def __getitem__(self, key):
+        val = None
+        with self._lock:
+            val = self._dict[key]
+        return val
+
+
+    def __setitem__(self, key, value):
+        with self._lock:
+            self._dict[key] = value
+
+
+    def remove(self, key):
+        with self._lock:
+            self._dict.__delitem__(key)
+
+
+    def has_key(self, key):
+        present = False
+        with self._lock:
+            present = key in self._dict
+        return present
+
+
+    def increment(self, key, delta=1):
+        with self._lock:
+            if key in self._dict:
+                self._dict[key] += delta
+
+
+    def decrement(self, key, delta=1):
+        with self._lock:
+            if key in self._dict:
+                self._dict[key] -= delta
+
+
+    def find_exponential_delay(self, n: int) -> float:
+        delay = self.initial_delay
+        for _ in range(n):
+            delay += delay * self.factor
+        return delay
+
+
+    def exponential_time(self, key):
+        """Returns exponential time based on dict value.
Time in seconds.""" + delay = 0 + with self._lock: + if key in self._dict: + delay = self.find_exponential_delay(self._dict[key]) + return delay * 60
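
For reference, the backoff above grows by 50% per recorded error (initial_delay=1, factor=0.5), and exponential_time() converts the result from minutes to seconds before the deployment sleeps in make_fetch_request. The following is a small standalone sketch, not part of the patch, that reproduces the same arithmetic so the schedule is easy to eyeball:

# Illustrative only: mirrors ThreadSafeDict.find_exponential_delay / exponential_time above.
def find_exponential_delay(n: int, initial_delay: float = 1, factor: float = 0.5) -> float:
    delay = initial_delay
    for _ in range(n):
        delay += delay * factor
    return delay  # delay in minutes; exponential_time() multiplies by 60 to yield seconds

if __name__ == "__main__":
    for errors in range(1, 5):
        minutes = find_exponential_delay(errors)
        print(f"{errors} error(s) -> wait {minutes:.2f} min ({minutes * 60:.1f} s)")
    # Roughly: 1 error -> 1.50 min, 2 -> 2.25 min, 3 -> 3.38 min, 4 -> 5.06 min.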