diff --git a/.github/workflows/mambatest.yml b/.github/workflows/mambatest.yml new file mode 100644 index 0000000..6af6a04 --- /dev/null +++ b/.github/workflows/mambatest.yml @@ -0,0 +1,51 @@ +# This workflow will install Python dependencies using Conda, run tests and lint with a single version of Python +# For more information see: https://autobencoder.com/2020-08-24-conda-actions/ + +name: Mamba PyTest + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + workflow_dispatch: + +permissions: + contents: read + +jobs: + build: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10"] + defaults: + run: + shell: bash -el {0} + steps: + - uses: actions/checkout@v3 + - name: provision-with-micromamba + uses: mamba-org/provision-with-micromamba@v14 + with: + environment-file: environment.yml + environment-name: smmregrid + cache-downloads: true + extra-specs: | + python=${{ matrix.python-version }} + - name: Install smmregrid + run: | + # install package + pip install -e . + - name: Lint with flake8 + run: | + # install flake8 + conda install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + conda install pytest + python -m pytest diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..535e49f --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +smmregrid.egg-info +__pycache__ +*.idx diff --git a/README.md b/README.md index 2e00363..d9a91b0 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,30 @@ # smmregrid A compact regridder using sparse matrix multiplication -This repository represents a modification of the regridding routines in [climtas](https://github.com/ScottWales/climtas) by Scott Wales, which already implements efficiently this idea and has no other significant dependencies (it does not use iris or esmf for regridding). +This repository represents a modification of the regridding routines in [climtas](https://github.com/ScottWales/climtas) by Scott Wales, which already implements efficiently this idea and has no other significant dependencies (it does not use iris). +The regridder uses efficiently sparse matrix multiplication with dask + some manipulation of the coordinates. -I only had to change a few lines of code to make it compatible with unstructured grids. The regridder uses efficiently sparse matrix multiplication with dask + some manipulation of the coordinates (which would have to be revised/checked again) +Please note that this tool is not thought as "another interpolation tool", but rather a method to apply pre-computed weights (with CDO, which is currently tested, and with ESMF, which is not yet supported) within the python environment. +The speedup is estimated to be about ~1.5 to ~5 times, slightly lower if the files are written to the disk. 2D and 3D data are supported on all the grids supported by CDO, both xarray.Dataset and xarray.DataArray can be used. Masks are treated in a simple way but are correctly transferred. Attributes are kept. + +It is safer to run it through conda/mamba. 
Install with: + +``` +conda env create -f environment.yml +``` + +then activate the environment: + +``` +conda activate smmregrid +``` +and install smmregrid in editable mode: -Install with ``` pip install -e . ``` + +Cautionary notes: +- It does not work correctly if the Xarray.Dataset includes fields with different land-sea masks (e.g. temperature and SST) +- It does not support ESMF weights. + diff --git a/dask_playground.ipynb b/dask_playground.ipynb new file mode 100644 index 0000000..72b8e20 --- /dev/null +++ b/dask_playground.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tests for SMM (with dask) versus CDO\n", + "\n", + "These are the same speed tests but using dask. Surprisingly, the code is much slower. There should be something wrong" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/work/users/paolo/miniconda3/envs/DevECmean4/lib/python3.10/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\n", + "Perhaps you already have a cluster running?\n", + "Hosting the HTTP server on port 44465 instead\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from time import time\n", + "import timeit\n", + "import os\n", + "import numpy as np\n", + "import xarray as xr\n", + "from smmregrid import cdo_generate_weights, Regridder\n", + "from smmregrid.checker import check_cdo_regrid # this is a new function introduced to verify the output\n", + "from cdo import Cdo\n", + "import pandas as pd\n", + "cdo = Cdo()\n", + "\n", + "# where and which the data are\n", + "indir='tests/data'\n", + "filelist = ['onlytos-ipsl.nc','tas-ecearth.nc', '2t-era5.nc','tos-fesom.nc']\n", + "tfile = os.path.join(indir, 'r360x180.nc')\n", + "\n", + "# method for remapping\n", + "methods = ['nn','con']\n", + "accesses = ['DataArray', 'Data']\n", + "\n", + "from 
dask.distributed import LocalCluster, Client\n", + "cluster = LocalCluster(ip=\"0.0.0.0\", threads_per_worker=1, n_workers=2)\n", + "client = Client(cluster)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Remapping (with weights available)\n", + "\n", + "This is the real goal of smmregrid. Here we test the computation of the remap when the weights are pre-computed. Considering that SMM does not have to write anything to disk, it is several times faster, between 5 to 10. Running with Dataset implies a bit of overhead (20%). Masks so far does not seem to be an issue." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CDOSMM (Dataset)SMM (DataArray)SMM (DataSet+NoMask)
onlytos-ipsl.nc1.00.7265990.4275390.427731
tas-ecearth.nc1.00.9021230.8412630.869179
2t-era5.nc1.00.6733390.6942630.642410
tos-fesom.nc1.00.4077640.4059180.411269
\n", + "
" + ], + "text/plain": [ + " CDO SMM (Dataset) SMM (DataArray) SMM (DataSet+NoMask)\n", + "onlytos-ipsl.nc 1.0 0.726599 0.427539 0.427731\n", + "tas-ecearth.nc 1.0 0.902123 0.841263 0.869179\n", + "2t-era5.nc 1.0 0.673339 0.694263 0.642410\n", + "tos-fesom.nc 1.0 0.407764 0.405918 0.411269" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# nrepetition for the check\n", + "nr = 10\n", + "\n", + "data =[]\n", + "for filein in filelist: \n", + "\n", + " # CDO\n", + " wfile = cdo.gencon(tfile, input = os.path.join(indir,filein))\n", + " one = timeit.timeit(lambda: cdo.remap(tfile + ',' + wfile, input = os.path.join(indir,filein), returnXDataset = True), number = nr)\n", + " #print(filein + ': Exectime CDO Remap ' + str(one/nr))\n", + "\n", + " # SMM\n", + " xfield = xr.open_mfdataset(os.path.join(indir,filein))\n", + " wfield = cdo_generate_weights(os.path.join(indir,filein), tfile, method = 'con')\n", + " interpolator = Regridder(weights=wfield)\n", + " # var as the one which have time and not have bnds (could work)\n", + " myvar = [var for var in xfield.data_vars \n", + " if 'time' in xfield[var].dims and 'bnds' not in xfield[var].dims]\n", + " two = timeit.timeit(lambda: interpolator.regrid(xfield), number = nr)\n", + " three = timeit.timeit(lambda: interpolator.regrid(xfield[myvar]), number = nr)\n", + " four = timeit.timeit(lambda: interpolator.regrid(xfield[myvar], masked = False), number = nr)\n", + " data.append([one, two, three, four])\n", + "\n", + " #print(filein + ': Exectime SMM Remap (DataSet) ' + str(two/nr))\n", + " #print(filein + ': Exectime SMM Remap (DataArray) ' + str(three/nr))\n", + " #print(filein + ': Exectime SMM Remap (DataSet+NoMask) ' + str(four/nr))\n", + "\n", + "cnames = ['CDO', 'SMM (Dataset)', 'SMM (DataArray)', 'SMM (DataSet+NoMask)']\n", + "df = pd.DataFrame(data, index = filelist, columns = cnames)\n", + "df.div(df[cnames[0]],axis =0)\n", + "\n", + "client.shutdown()\n" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "DevECmean4", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "d1a27f430e855354fabe9b58ad426cbc88af57f8b66247655f5de977d5b44f64" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/environment.yml b/environment.yml index 589b8ed..5247277 100644 --- a/environment.yml +++ b/environment.yml @@ -5,11 +5,17 @@ name : smmregrid channels: - conda-forge dependencies: - - python>=3.8,<3.11 + - python>=3.7,<3.11 - numpy - netcdf4 - dask - xarray + - cfgrib - xesmf - - sparse - cfunits + - cdo + - python-cdo + - pytest + - ipykernel + - pip: + - sparse==0.13.0 diff --git a/playground.ipynb b/playground.ipynb new file mode 100644 index 0000000..1579b38 --- /dev/null +++ b/playground.ipynb @@ -0,0 +1,616 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tests for SMM (without dask) versus CDO\n", + "\n", + "There are a few tests to check if the SMM approach is faster than the CDO one and if it is reliable in terms of output. Encouraging results below. 
Tested with both 2D and 3D vars, using DataArray and Datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import timeit\n", + "import os\n", + "import numpy as np\n", + "import xarray as xr\n", + "from smmregrid import cdo_generate_weights, Regridder\n", + "from smmregrid.checker import check_cdo_regrid # this is a new function introduced to verify the output\n", + "from cdo import Cdo\n", + "import pandas as pd\n", + "import copy\n", + "cdo = Cdo()\n", + "import dask\n", + "dask.config.set(scheduler=\"synchronous\")\n", + "\n", + "# where and which the data are\n", + "indir='tests/data'\n", + "filelist = ['lsm-ifs.grb','onlytos-ipsl.nc','tas-ecearth.nc', \n", + " '2t-era5.nc','tos-fesom.nc', 'ua-ecearth.nc', 'mix-cesm.nc']#,'era5-mon.nc'] # the last is not available on github\n", + "#filelist = ['tos-fesom.nc','onlytos-ipsl.nc','tas-ecearth.nc'] \n", + "#filelist = ['tas-ecearth.nc']\n", + "tfile = os.path.join(indir, 'r360x180.nc')\n", + "\n", + "# method for remapping\n", + "methods = ['nn','con','bil']\n", + "#methods = ['con']\n", + "accesses = ['Dataset', 'DataArray']\n", + "\n", + "\n", + "# create an iterable dictionary, and clean cases where we know CDO does not work\n", + "defdict = {'methods': methods, 'accesses': accesses, 'extra': '', 'chunks': None}\n", + "base = {k: copy.deepcopy(defdict) for k in filelist}\n", + "if 'tos-fesom.nc' in filelist:\n", + " base['tos-fesom.nc']['methods'].remove('bil')\n", + "if 'lsm-ifs.grb' in filelist:\n", + " base['lsm-ifs.grb']['extra'] = '-setgridtype,regular'\n", + " base['lsm-ifs.grb']['methods'].remove('bil')\n", + " base['lsm-ifs.grb']['methods'].remove('con')\n", + "if 'mix-cesm.nc' in filelist:\n", + " base['mix-cesm.nc']['accesses'].remove('DataArray')\n", + "if 'era5-mon.nc' in filelist:\n", + " base['era5-mon.nc']['chunks'] = {'time': 12}\n", + "if 'ua-ecearth.nc' in filelist:\n", + " base['ua-ecearth.nc']['chunks'] = {'plev': 3}" + ] + }, 
+ { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Robustness test\n", + "\n", + "This is to verify that the regridding is equal: this is done by comparing the output from CDO to the output obtained by SMM. \n", + "The files to be checked are above. This is the same as the thing done in the tests." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "lsm-ifs.grb: remapnn via Dataset -> True\n", + "lsm-ifs.grb: remapnn via DataArray -> True\n", + "onlytos-ipsl.nc: remapnn via Dataset -> True\n", + "onlytos-ipsl.nc: remapnn via DataArray -> True\n", + "onlytos-ipsl.nc: remapcon via Dataset -> True\n", + "onlytos-ipsl.nc: remapcon via DataArray -> True\n", + "onlytos-ipsl.nc: remapbil via Dataset -> True\n", + "onlytos-ipsl.nc: remapbil via DataArray -> True\n", + "tas-ecearth.nc: remapnn via Dataset -> True\n", + "tas-ecearth.nc: remapnn via DataArray -> True\n", + "tas-ecearth.nc: remapcon via Dataset -> True\n", + "tas-ecearth.nc: remapcon via DataArray -> True\n", + "tas-ecearth.nc: remapbil via Dataset -> True\n", + "tas-ecearth.nc: remapbil via DataArray -> True\n", + "2t-era5.nc: remapnn via Dataset -> True\n", + "2t-era5.nc: remapnn via DataArray -> True\n", + "2t-era5.nc: remapcon via Dataset -> True\n", + "2t-era5.nc: remapcon via DataArray -> True\n", + "2t-era5.nc: remapbil via Dataset -> True\n", + "2t-era5.nc: remapbil via DataArray -> True\n", + "tos-fesom.nc: remapnn via Dataset -> True\n", + "tos-fesom.nc: remapnn via DataArray -> True\n", + "tos-fesom.nc: remapcon via Dataset -> True\n", + "tos-fesom.nc: remapcon via DataArray -> True\n", + "ua-ecearth.nc: remapnn via Dataset -> True\n", + "ua-ecearth.nc: remapnn via DataArray -> True\n", + "ua-ecearth.nc: remapcon via Dataset -> True\n", + "ua-ecearth.nc: remapcon via DataArray -> True\n", + "ua-ecearth.nc: remapbil via Dataset -> True\n", + "ua-ecearth.nc: 
remapbil via DataArray -> True\n", + "mix-cesm.nc: remapnn via Dataset -> True\n", + "mix-cesm.nc: remapcon via Dataset -> True\n", + "mix-cesm.nc: remapbil via Dataset -> True\n" + ] + } + ], + "source": [ + "\n", + "for filein in filelist: \n", + " for method in base[filein]['methods']:\n", + " for access in base[filein]['accesses']: \n", + " cc = check_cdo_regrid(os.path.join(indir,filein), tfile, method = method, access = access)#, extra = base[filein].get('extra'))\n", + " print(filein + ': remap' + method + ' via ' + access + ' -> ' + str(cc))\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From now we use only cases where we can use remapcon, i.e. we remove gaussian reduced" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Full remapping \n", + "\n", + "Test the full remap (generation of the weight + application) of CDO vs SMM. Still using conservative remapping. Results seem very much comparable!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "caught signal .Operator object at 0x7fe995fca620> 2 \n" + ] + } + ], + "source": [ + "# nrepetition for the check\n", + "nr = 5\n", + "\n", + "if 'lsm-ifs.grb' in filelist:\n", + " base.pop('lsm-ifs.grb')\n", + "\n", + "# fast function to call the entire interpolation\n", + "def smm_remap(ifile, tfile):\n", + "\n", + " xfield = xr.open_mfdataset(ifile)\n", + " wfield = cdo_generate_weights(ifile, tfile, method = 'con')\n", + " interpolator = Regridder(weights=wfield)\n", + " var = list(xfield.data_vars)[-1]\n", + " rfield = interpolator.regrid(xfield)\n", + " return(rfield)\n", + "\n", + "data =[]\n", + "for filein in base.keys(): \n", + "\n", + " one = timeit.timeit(lambda: cdo.remapcon(tfile, input = os.path.join(indir,filein), returnXDataset = True), number = nr)\n", + " #print(filein + ': Exectime CDO Weight+Remap ' + str(one/nr))\n", + " two = timeit.timeit(lambda: smm_remap(os.path.join(indir,filein), tfile), number = nr)\n", + " #print(filein + ': Exectime SMM Weight+Remap ' + str(two/nr))\n", + " data.append([one, two])\n", + "\n", + "cnames = ['CDO (Weight+Remap)', 'SMM (Weight+Remap)']\n", + "df = pd.DataFrame(data, index = base.keys(), columns = cnames)\n", + "df.div(df[cnames[0]],axis =0)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Remapping (with weights available)\n", + "\n", + "This is the real goal of smmregrid. Here we test the computation of the remap when the weights are pre-computed, still using with conservative remapping. Considering that SMM does not have to write anything to disk, it is a few times faster. Running with Dataset implies a bit of overhead (20%). Masks have been integrated and create a small overhead when needed. Of course, loading the files implies a considerable slowdown." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "onlytos-ipsl.nc\n", + "tas-ecearth.nc\n", + "2t-era5.nc\n", + "tos-fesom.nc\n", + "ua-ecearth.nc\n", + "mix-cesm.nc\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NVarsNRecordsCDOCDO (NoLoad)SMM (Dataset)SMM (DataArray)SMM (DataArray+NoLoad)SMM (Dataset+Write)SMM (DataArray+Write)
onlytos-ipsl.nc1(12, 332, 362)1.00.9802590.3561130.3170130.0778330.4144870.397271
tas-ecearth.nc1(12, 256, 512)1.00.9518500.3420260.3037910.0779690.3981380.389535
2t-era5.nc1(12, 73, 144)1.01.0650760.2347570.1804910.0450160.2922260.278912
tos-fesom.nc1(12, 126859)1.00.9613290.3132250.3111300.0430780.3664380.351693
ua-ecearth.nc1(2, 19, 256, 512)1.00.8882900.5050480.4757890.1470390.7312950.718535
mix-cesm.nc4(12, 192, 288)1.00.8788470.5989810.1722550.0425160.9221430.241714
\n", + "
" + ], + "text/plain": [ + " NVars NRecords CDO CDO (NoLoad) SMM (Dataset) \\\n", + "onlytos-ipsl.nc 1 (12, 332, 362) 1.0 0.980259 0.356113 \n", + "tas-ecearth.nc 1 (12, 256, 512) 1.0 0.951850 0.342026 \n", + "2t-era5.nc 1 (12, 73, 144) 1.0 1.065076 0.234757 \n", + "tos-fesom.nc 1 (12, 126859) 1.0 0.961329 0.313225 \n", + "ua-ecearth.nc 1 (2, 19, 256, 512) 1.0 0.888290 0.505048 \n", + "mix-cesm.nc 4 (12, 192, 288) 1.0 0.878847 0.598981 \n", + "\n", + " SMM (DataArray) SMM (DataArray+NoLoad) SMM (Dataset+Write) \\\n", + "onlytos-ipsl.nc 0.317013 0.077833 0.414487 \n", + "tas-ecearth.nc 0.303791 0.077969 0.398138 \n", + "2t-era5.nc 0.180491 0.045016 0.292226 \n", + "tos-fesom.nc 0.311130 0.043078 0.366438 \n", + "ua-ecearth.nc 0.475789 0.147039 0.731295 \n", + "mix-cesm.nc 0.172255 0.042516 0.922143 \n", + "\n", + " SMM (DataArray+Write) \n", + "onlytos-ipsl.nc 0.397271 \n", + "tas-ecearth.nc 0.389535 \n", + "2t-era5.nc 0.278912 \n", + "tos-fesom.nc 0.351693 \n", + "ua-ecearth.nc 0.718535 \n", + "mix-cesm.nc 0.241714 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "if 'lsm-ifs.grb' in filelist:\n", + " base.pop('lsm-ifs.grb')\n", + "\n", + "data =[]\n", + "for filein in base.keys(): \n", + " print(filein)\n", + " if filein in ['era5-mon.nc', 'era5-day.nc']: \n", + " nr = 2\n", + " else :\n", + " nr = 20\n", + "\n", + " # CDO\n", + " wfile = cdo.gencon(tfile, input = os.path.join(indir,filein))\n", + " ccdo = timeit.timeit(lambda: cdo.remap(tfile + ',' + wfile, input = os.path.join(indir,filein), returnXDataset = True).load(), number = nr)\n", + " cdonoload = timeit.timeit(lambda: cdo.remap(tfile + ',' + wfile, input = os.path.join(indir,filein), returnXDataset = True), number = nr)\n", + " #print(filein + ': Exectime CDO Remap ' + str(one/nr))\n", + "\n", + " # SMM: load field and weights, initialize regridder\n", + " xfield = xr.open_mfdataset(os.path.join(indir,filein)).load()\n", + " wfield = 
cdo_generate_weights(os.path.join(indir,filein), tfile, method = 'con').load()\n", + " interpolator = Regridder(weights=wfield)\n", + " \n", + " # var as the one which have time and not have bnds, pick the first one\n", + " myvar = [var for var in xfield.data_vars \n", + " if 'time' in xfield[var].dims and 'bnds' not in xfield[var].dims]\n", + " \n", + " # dataset infos\n", + " nrecords = xfield[myvar[0]].shape\n", + " nvars = len(myvar)\n", + "\n", + "\n", + " sset = timeit.timeit(lambda: interpolator.regrid(xfield).load(), number = nr)\n", + " arr = timeit.timeit(lambda: interpolator.regrid(xfield[myvar[0]]).load(), number = nr)\n", + " arrnoload = timeit.timeit(lambda: interpolator.regrid(xfield[myvar[0]]), number = nr)\n", + " #arrnomask = timeit.timeit(lambda: interpolator.regrid(xfield[myvar[0]], masked = False).load(), number = nr)\n", + " setwrite = timeit.timeit(lambda: interpolator.regrid(xfield).to_netcdf('test.nc'), number = nr)\n", + " if os.path.isfile('test.nc'):\n", + " os.remove('test.nc')\n", + " arrwrite = timeit.timeit(lambda: interpolator.regrid(xfield[myvar[0]]).to_netcdf('test2.nc'), number = nr)\n", + " if os.path.isfile('test2.nc'):\n", + " os.remove('test2.nc')\n", + " data.append([nvars, nrecords, ccdo, cdonoload, sset, arr, arrnoload, setwrite, arrwrite])\n", + "\n", + "\n", + "cnames = ['NVars', 'NRecords', 'CDO', 'CDO (NoLoad)',\n", + " 'SMM (Dataset)', 'SMM (DataArray)', 'SMM (DataArray+NoLoad)', \n", + " 'SMM (Dataset+Write)', 'SMM (DataArray+Write)']\n", + "df = pd.DataFrame(data, index = base.keys(), columns = cnames)\n", + "final = pd.concat([df.iloc[:,0:2],df.iloc[:,2:].div(df[cnames[2]], axis=0)], join='outer', axis=1)\n", + "final\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print to markdown so that we can copy paste it on the portal" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "| | NVars | NRecords | CDO | CDO (NoLoad) | SMM (Dataset) | SMM (DataArray) | SMM (DataArray+NoLoad) | SMM (Dataset+Write) | SMM (DataArray+Write) |\n", + "|:----------------|--------:|:------------------|------:|---------------:|----------------:|------------------:|-------------------------:|----------------------:|------------------------:|\n", + "| onlytos-ipsl.nc | 1 | (12, 332, 362) | 1 | 0.924445 | 0.377513 | 0.347206 | 0.078164 | 0.429603 | 0.431012 |\n", + "| tas-ecearth.nc | 1 | (12, 256, 512) | 1 | 0.927699 | 0.410739 | 0.38098 | 0.0854462 | 0.468425 | 0.461594 |\n", + "| 2t-era5.nc | 1 | (12, 73, 144) | 1 | 0.924335 | 0.204548 | 0.160671 | 0.0439383 | 0.253002 | 0.248918 |\n", + "| tos-fesom.nc | 1 | (12, 126859) | 1 | 1.01328 | 0.332096 | 0.32686 | 0.048309 | 0.379621 | 0.372826 |\n", + "| ua-ecearth.nc | 1 | (2, 19, 256, 512) | 1 | 0.898331 | 0.508237 | 0.475537 | 0.156023 | 0.74989 | 0.735353 |\n", + "| mix-cesm.nc | 4 | (12, 192, 288) | 1 | 0.886593 | 0.652437 | 0.181319 | 0.0456369 | 0.92309 | 0.248312 |\n", + "| era5-mon.nc | 1 | (864, 721, 1440) | 1 | 0.990528 | 0.757665 | 0.794592 | 0.33693 | 0.822056 | 0.803797 |\n" + ] + } + ], + "source": [ + "print(final.to_markdown())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Weight generation\n", + "\n", + "Test the different weights generation possibilities with CDO, tested with conservative remapping: the climtas code is way more efficient if files are already on the disk, since the call to CDO has to be done from file. CDO bindings have a minimum overhead to be considered" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CDO bindingsCDO subprocess (from file)CDO subprocess (from xarray)
onlytos-ipsl.nc1.00.9388951.025188
tas-ecearth.nc1.00.8821291.164144
2t-era5.nc1.00.7470730.852617
tos-fesom.nc1.00.9879441.377073
ua-ecearth.nc1.00.8912981.118633
mix-cesm.nc1.00.7962451.012632
\n", + "
" + ], + "text/plain": [ + " CDO bindings CDO subprocess (from file) \\\n", + "onlytos-ipsl.nc 1.0 0.938895 \n", + "tas-ecearth.nc 1.0 0.882129 \n", + "2t-era5.nc 1.0 0.747073 \n", + "tos-fesom.nc 1.0 0.987944 \n", + "ua-ecearth.nc 1.0 0.891298 \n", + "mix-cesm.nc 1.0 0.796245 \n", + "\n", + " CDO subprocess (from xarray) \n", + "onlytos-ipsl.nc 1.025188 \n", + "tas-ecearth.nc 1.164144 \n", + "2t-era5.nc 0.852617 \n", + "tos-fesom.nc 1.377073 \n", + "ua-ecearth.nc 1.118633 \n", + "mix-cesm.nc 1.012632 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mnotebook controller is DISPOSED. \n", + "\u001b[1;31mPer ulteriori dettagli, visualizzare Jupyter log." + ] + } + ], + "source": [ + "# nrepetition for the check\n", + "nr = 5\n", + "\n", + "if 'lsm-ifs.grb' in filelist:\n", + " base.pop('lsm-ifs.grb')\n", + "\n", + "# generate weights from file\n", + "data = []\n", + "for filein in base.keys(): \n", + " \n", + " # open file\n", + " xfield = xr.open_mfdataset(os.path.join(indir,filein))\n", + " tfield = xr.open_mfdataset(tfile)\n", + "\n", + " # generate weights from file\n", + " one = timeit.timeit(lambda: cdo_generate_weights(os.path.join(indir,filein), tfile, method = 'con'), number = nr)\n", + " #print(filein + ': Exectime climtas from file ' + str(one/nr))\n", + " # generate weights from xarray\n", + " two = timeit.timeit(lambda: cdo_generate_weights(xfield, tfield, method = 'con'), number = nr)\n", + " #print(filein + ': Exectime climtas from xarray ' + str(two/nr))\n", + " # generatre weights with CDO bindings (from file)\n", + " three = timeit.timeit(lambda: cdo.gencon(tfile, input = os.path.join(indir,filein), returnXDataset = True), number = nr)\n", + " #print(filein + ': Exectime cdo from file ' + str(three/nr))\n", + " data.append([three, one, two])\n", + "\n", + "cnames = ['CDO bindings', 'CDO subprocess (from file)', 'CDO 
subprocess (from xarray)']\n", + "df = pd.DataFrame(data, index = base.keys(), columns = cnames)\n", + "df.div(df[cnames[0]],axis =0)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smmregrid", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "d1accb2f512bc25ed3ccfcb2d2713bb58c000279d857d70046a9787281d1913f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/setup.py b/setup.py index f684a5a..695c627 100644 --- a/setup.py +++ b/setup.py @@ -4,17 +4,20 @@ setup(name='smmregrid', version='0.0.1', description='Regridding based on sparse matrix multiplication', - author=' Jost von Hardenberg, Scott Wales', + author=' Jost von Hardenberg, Paolo Davini, Scott Wales', author_email='jost.hardenberg@polito.it', url='https://github.com/jhardenberg/smmregrid', - python_requires='>3.7, <3.10', + python_requires='>=3.7, <3.11', packages=['smmregrid'], install_requires=[ 'numpy', 'xarray', + 'dask', 'netcdf4', + 'cfgrib', 'dask', 'sparse', - 'cfunits' + 'cfunits', + 'cdo' ] ) diff --git a/smmregrid/checker.py b/smmregrid/checker.py new file mode 100644 index 0000000..d97d4bf --- /dev/null +++ b/smmregrid/checker.py @@ -0,0 +1,60 @@ +import xarray as xr +import numpy as np +import xarray as xr +from smmregrid import cdo_generate_weights, Regridder + +from cdo import Cdo +cdo = Cdo() + + +def find_var(xfield): + """Find them most likely set of vars that needs to be interpolated""" + + # var as the one which have time and not have bnds (could work) + myvar = [var for var in xfield.data_vars + if 'time' in xfield[var].dims and 'bnds' not in xfield[var].dims] + + # if find none otherwise, pick what you have + if not myvar: + myvar 
= list(xfield.data_vars) + + return myvar + + +def check_cdo_regrid(finput, ftarget, method='con', access='Dataset'): + """Given a file to be interpolated finput over the ftarget grid, + check if the output of the last variable is the same as produced + by CDO remap command. This function is used for tests.""" + + # define files and open input file + xfield = xr.open_mfdataset(finput) + + # var as the last available + # myvar = list(xfield.data_vars)[-1] + + # interpolation with pure CDO + cdo_interpolator = getattr(cdo, 'remap' + method) + cdofield = cdo_interpolator(ftarget, input=finput, returnXDataset=True) + # print(cdofield) + + # var as the one which have time and not have bnds (could work) + smmvar = find_var(xfield) + cdovar = find_var(cdofield) + + if len(smmvar) == 1 and access == 'DataArray': + xfield = xfield[smmvar[0]] + if len(cdovar) == 1 and access == 'DataArray': + cdofield = cdofield[cdovar[0]] + + # interpolation with smmregrid (CDO-based) + wfield = cdo_generate_weights(finput, ftarget, method=method) + interpolator = Regridder(weights=wfield) + rfield = interpolator.regrid(xfield) + + if access == 'Dataset': + rfield = rfield[smmvar].to_array() + cdofield = cdofield[cdovar].to_array() + + # check if arrays are equal with numerical tolerance + checker = np.allclose(cdofield, rfield, equal_nan=True) + return checker diff --git a/smmregrid/regrid.py b/smmregrid/regrid.py index 2062e7f..ef24c4a 100644 --- a/smmregrid/regrid.py +++ b/smmregrid/regrid.py @@ -228,7 +228,7 @@ def esmf_generate_weights( extrap_method, "--norm_type", norm_type, - #'--user_areas', + # '--user_areas', "--no-log", "--check", ] @@ -293,17 +293,30 @@ def compute_weights_matrix(weights): return sparse_weights -def apply_weights(source_data, weights, weights_matrix=None): +def apply_weights(source_data, weights, weights_matrix=None, masked=True): """ Apply the CDO weights ``weights`` to ``source_data``, performing a regridding operation Args: - source_data (xarray.Dataset): 
Source dataset - weights (xarray.Dataset): CDO weights information + source_data (xarray.DataArray): Source dataset + weights (xarray.DataArray): CDO weights information Returns: - xarray.Dataset: Regridded version of the source dataset + xarray.DataArray: Regridded version of the source dataset """ + + # Understand immediately if we need to return something or not + # This is done if we have bounds variables + if ("bnds" in source_data.name or "bounds" in source_data.name): + + # we keep time bounds, and we ignore all the rest + if 'time' in source_data.name: + # print('original ' + source_data.name) + return source_data + else: + # print('empty ' + source_data.name) + return xarray.DataArray(data=None) + # Alias the weights dataset from CDO w = weights @@ -339,7 +352,7 @@ def apply_weights(source_data, weights, weights_matrix=None): dst_grid_shape = w.dst_grid_dims.values dst_grid_center_lat = w.dst_grid_center_lat.data.reshape( - #dst_grid_shape[::-1], order="C" + # dst_grid_shape[::-1], order="C" dst_grid_shape[::-1] ) dst_grid_center_lon = w.dst_grid_center_lon.data.reshape( @@ -348,25 +361,35 @@ def apply_weights(source_data, weights, weights_matrix=None): dst_mask = w.dst_grid_imask + src_mask = w.src_grid_imask axis_scale = 180.0 / math.pi # Weight lat/lon in radians # Check lat/lon are the last axes - #source_lat, source_lon = identify_lat_lon(source_data) - #if not ( - #source_lat.name in source_data.dims[-2:] - #and source_lon.name in source_data.dims[-2:] - #): + # source_lat, source_lon = identify_lat_lon(source_data) + # if not ( + # source_lat.name in source_data.dims[-2:] + # and source_lon.name in source_data.dims[-2:] + # ): # raise Exception( # "Last two dimensions should be spatial coordinates," # f" got {source_data.dims[-2:]}" # ) + # keep time related variables as it is (i.e. 
keep time_bnds) + # print('start ' + source_data.name) + + # print('full ' + source_data.name) + # exclude the lon-lat bounds + # Find dimensions to keep - nd = sum([(d not in ['i', 'j', 'x', 'y', 'lon', 'lat', 'longitude', 'latitude', 'cell', 'cells', 'ncells', 'values', 'value']) for d in source_data.dims]) + nd = sum([(d not in ['i', 'j', 'x', 'y', 'lon', 'lat', 'longitude', 'latitude', + 'cell', 'cells', 'ncells', 'values', 'value']) for d in source_data.dims]) kept_shape = list(source_data.shape[0:nd]) kept_dims = list(source_data.dims[0:nd]) + # print(kept_dims) + # print(kept_dims) if weights_matrix is None: weights_matrix = compute_weights_matrix(weights) @@ -382,14 +405,23 @@ def apply_weights(source_data, weights, weights_matrix=None): dask.array.ma.set_fill_value(source_array, 1e20) source_array = dask.array.ma.fix_invalid(source_array) source_array = dask.array.ma.filled(source_array) - target_dask = dask.array.tensordot(source_array, weights_matrix, axes=1) - bmask = numpy.broadcast_to( - dst_mask.data.reshape([1 for d in kept_shape] + [-1]), target_dask.shape - ) + # define and compute the new mask + if masked: + + # target mask is loaded from above + target_mask = dst_mask.data + + # broadcast the mask on all the remaining dimensions + target_mask = numpy.broadcast_to( + target_mask.reshape([1 for d in kept_shape] + [-1]), target_dask.shape + ) - target_dask = dask.array.where(bmask != 0.0, target_dask, numpy.nan) + # apply the mask + target_dask = dask.array.where(target_mask != 0.0, target_dask, numpy.nan) + + # reshape the target DataArray target_dask = dask.array.reshape( target_dask, kept_shape + [dst_grid_shape[1], dst_grid_shape[0]] ) @@ -426,8 +458,11 @@ def apply_weights(source_data, weights, weights_matrix=None): target_da.coords["lon"].attrs["units"] = "degrees_east" target_da.coords["lon"].attrs["standard_name"] = "longitude" + # Copy attributes from the original + target_da.attrs = source_data.attrs + # Now rename to the original 
coordinate names - #target_da = target_da.rename({"lat": source_lat.name, "lon": source_lon.name}) + # target_da = target_da.rename({"lat": source_lat.name, "lon": source_lon.name}) return target_da @@ -459,18 +494,24 @@ def __init__(self, source_grid=None, target_grid=None, weights=None): # Is there already a weights file? if weights is not None: - #if isinstance(weights, xarray.Dataset): - # self.weights = xarray.open_mfdataset(weights) - #else: - self.weights = weights + + if not isinstance(weights, xarray.Dataset): + self.weights = xarray.open_mfdataset(weights) + else: + self.weights = weights else: # Generate the weights with CDO - _source_grid = identify_grid(source_grid) - _target_grid = identify_grid(target_grid) - self.weights = cdo_generate_weights(_source_grid, _target_grid) + # _source_grid = identify_grid(source_grid) + # _target_grid = identify_grid(target_grid) + # self.weights = cdo_generate_weights(_source_grid, _target_grid) + sys.exit('Missing capability of creating weights...') self.weights_matrix = compute_weights_matrix(self.weights) + # this section is used to create a target mask initializing the CDO weights + self.weights = mask_weigths(self.weights, self.weights_matrix) + self.masked = check_mask(self.weights) + def regrid(self, source_data): """Regrid ``source_data`` to match the target grid @@ -484,11 +525,49 @@ def regrid(self, source_data): """ if isinstance(source_data, xarray.Dataset): - return source_data.apply(self.regrid) - else: + + # apply the regridder on each DataArray + out = source_data.map(self.regrid, keep_attrs=True) + + # clean from degenerated variables + degen_vars = [var for var in out.data_vars if out[var].dims == ()] + return out.drop(degen_vars) + # else: + # sys.exit("source data has different mask, this can lead to unexpected" \ + # "results due to the format in which weights are generated. 
Aborting...") + + elif isinstance(source_data, xarray.DataArray): + + # print('DataArray access!') return apply_weights( - source_data, self.weights, weights_matrix=self.weights_matrix + source_data, self.weights, weights_matrix=self.weights_matrix, masked=self.masked ) + else: + sys.exit('Cannot process this source_data, sure it is xarray?') + + +def mask_weigths(weights, weights_matrix): + """This functions precompute the mask for the target interpolation + Takes as input the weights from CDO and the precomputed weights matrix + Return the target mask""" + + src_mask = weights.src_grid_imask.data + target_mask = dask.array.tensordot(src_mask, weights_matrix, axes=1) + target_mask = dask.array.where(target_mask < 0.5, 0, 1) + weights.dst_grid_imask.data = target_mask + return weights + + +def check_mask(weights): + """This check if the target mask is empty or full and + return a bool to be passed to the regridder""" + + w = weights.dst_grid_imask + v = w.sum()/len(w) + if v == 1: + return False + else: + return True def regrid(source_data, target_grid=None, weights=None): diff --git a/tests/basic_test.py b/tests/basic_test.py new file mode 100644 index 0000000..edf54b9 --- /dev/null +++ b/tests/basic_test.py @@ -0,0 +1,55 @@ +from smmregrid.checker import check_cdo_regrid +import pytest +import os + + +indir = 'tests/data' +tfile = os.path.join(indir, 'r360x180.nc') + +# test for gaussian reduced grid (only nn) + + +@pytest.mark.parametrize("method", ['nn']) +def test_gaussian_reduced(method): + f = check_cdo_regrid(os.path.join(indir, 'lsm-ifs.grb'), tfile, method=method) + assert f is True + +# test for gaussian grids as EC-Earth cmor + + +@pytest.mark.parametrize("method", ['bil', 'con', 'nn']) +def test_gaussian(method): + f = check_cdo_regrid(os.path.join(indir, 'tas-ecearth.nc'), tfile, method=method) + assert f is True + +# test for lonlt grids + + +@pytest.mark.parametrize("method", ['bil', 'con', 'nn']) +def test_lonlat(method): + f = 
check_cdo_regrid(os.path.join(indir, '2t-era5.nc'), tfile, method=method) + assert f is True + +# test for unstructured grids as FESOM CMOR (no bilinear) + + +@pytest.mark.parametrize("method", ['con', 'nn']) +def test_unstructured(method): + f = check_cdo_regrid(os.path.join(indir, 'tos-fesom.nc'), tfile, method=method) + assert f is True + +# test for curvilinear grid + + +@pytest.mark.parametrize("method", ['con', 'nn', 'bil']) +def test_curivilinear(method): + f = check_cdo_regrid(os.path.join(indir, 'onlytos-ipsl.nc'), tfile, method=method) + assert f is True + +# test for pressure levels on gaussian grid + + +@pytest.mark.parametrize("method", ['con', 'nn', 'bil']) +def test_plev_gaussian(method): + f = check_cdo_regrid(os.path.join(indir, 'ua-ecearth.nc'), tfile, method=method) + assert f is True diff --git a/tests/data/2t-era5.nc b/tests/data/2t-era5.nc new file mode 100644 index 0000000..6694be1 Binary files /dev/null and b/tests/data/2t-era5.nc differ diff --git a/tests/data/lsm-ifs.grb b/tests/data/lsm-ifs.grb new file mode 100644 index 0000000..09a6b5b Binary files /dev/null and b/tests/data/lsm-ifs.grb differ diff --git a/tests/data/mix-cesm.nc b/tests/data/mix-cesm.nc new file mode 100644 index 0000000..7075466 Binary files /dev/null and b/tests/data/mix-cesm.nc differ diff --git a/tests/data/onlytos-ipsl.nc b/tests/data/onlytos-ipsl.nc new file mode 100644 index 0000000..2bb76dd Binary files /dev/null and b/tests/data/onlytos-ipsl.nc differ diff --git a/tests/data/r360x180.nc b/tests/data/r360x180.nc new file mode 100644 index 0000000..245be7b Binary files /dev/null and b/tests/data/r360x180.nc differ diff --git a/tests/data/tas-ecearth.nc b/tests/data/tas-ecearth.nc new file mode 100644 index 0000000..81fd897 Binary files /dev/null and b/tests/data/tas-ecearth.nc differ diff --git a/tests/data/tos-fesom.nc b/tests/data/tos-fesom.nc new file mode 100644 index 0000000..2a4d379 Binary files /dev/null and b/tests/data/tos-fesom.nc differ diff --git 
a/tests/data/ua-ecearth.nc b/tests/data/ua-ecearth.nc new file mode 100644 index 0000000..c480b64 Binary files /dev/null and b/tests/data/ua-ecearth.nc differ