From cc7dd766051a3e8b459927d02453c271f63a7586 Mon Sep 17 00:00:00 2001 From: ValentinaHutter <85164505+ValentinaHutter@users.noreply.github.com> Date: Thu, 13 Jun 2024 11:41:29 +0200 Subject: [PATCH] OpenDataCube -> Xarray / Dask Co-authored-by: Michele Claus <31700619+clausmichele@users.noreply.github.com> Co-authored-by: Matthias Mohr --- .vuepress/config.js | 2 +- .vuepress/enhanceApp.js | 7 +- .../1.0/developers/backends/opendatacube.md | 224 ------------------ .../1.0/developers/backends/xarray.md | 95 ++++++++ 4 files changed, 100 insertions(+), 228 deletions(-) delete mode 100644 documentation/1.0/developers/backends/opendatacube.md create mode 100644 documentation/1.0/developers/backends/xarray.md diff --git a/.vuepress/config.js b/.vuepress/config.js index b5f634753..8e5281cd9 100644 --- a/.vuepress/config.js +++ b/.vuepress/config.js @@ -36,7 +36,7 @@ const versions = [ {text: 'Service Providers', items: [ {text: 'Getting Started', link: 'developers/backends/getting-started.html'}, {text: 'Performance Guide', link: 'developers/backends/performance.html'}, - {text: 'Open Data Cube', link: 'developers/backends/opendatacube.html'}, + {text: 'Xarray / Dask Guide', link: 'developers/backends/xarray.html'}, {text: 'Profiles', link: 'developers/profiles/index.html'} ]}, {text: 'Client Developers', items: [ diff --git a/.vuepress/enhanceApp.js b/.vuepress/enhanceApp.js index 8b1a17084..6ebd807d7 100644 --- a/.vuepress/enhanceApp.js +++ b/.vuepress/enhanceApp.js @@ -1,4 +1,4 @@ -const config = require('./config.js'); +const config = require('./config.js'); const defaultVersion = config.themeConfig.versions[config.themeConfig.defaultVersion]; @@ -16,6 +16,7 @@ export default ({ router, Vue }) => { { path: '/about', redirect: 'about.html' }, { path: '/software', redirect: 'software.html' }, { path: '/contact', redirect: 'contact.html' }, - { path: '/glossary', redirect: defaultVersion.path + 'glossary.html' } + { path: '/glossary', redirect: defaultVersion.path +
'glossary.html' }, + { path: '/documentation/1.0/developers/backends/opendatacube.html', redirect: '/documentation/1.0/developers/backends/xarray.html' } ]); -} \ No newline at end of file +} diff --git a/documentation/1.0/developers/backends/opendatacube.md b/documentation/1.0/developers/backends/opendatacube.md deleted file mode 100644 index ebd01e497..000000000 --- a/documentation/1.0/developers/backends/opendatacube.md +++ /dev/null @@ -1,224 +0,0 @@ -# Getting started with openEO and Open Data Cube - -As a back-end provider who wants to provide its datasets, processes and infrastructure to a broader audience through a -standardized interface you may want to implement a driver for openEO. - -First of all, you should read carefully the [getting started guide for service providers](./getting-started.md). - -::: tip Note -The Open Data Cube implementation for openEO is not a full-fledged out-of-the-box openEO back-end, -but can be part of the infrastructure for the data management part. -In detail it can be used as data source for [EO Data Discovery](../api/reference.md#tag/EO-Data-Discovery) and e.g. -in combination with a dask cluster as processing back-end for [Data Processing](../api/reference.md#tag/Data-Processing). -In any case, a [HTTP REST interface must be available in front of ODC to properly answer openEO requests](#http-rest-interface). -::: - -There are three main components involved with ODC and openEO: -1. [Process Graph Parser for Python](#process-graph-parser-for-python) -2. [Python Processes for openEO](#python-processes-for-openeo) -3. [openEO to ODC Process Mapper](#openeo-to-odc-process-mapper) - -## Process Graph Parser for Python - -* Repository: [openeo-pg-parser-python](https://github.com/Open-EO/openeo-pg-parser-python) - -The process graph parser translates a process graph in json / dict format into a traversable python object. It -automatically validates that only collections and processes are used which are available at the back-end. 
- -For example, this [EVI process graph](https://github.com/Open-EO/openeo-odc/blob/master/tests/process_graphs/evi.json) can be -translated using the openEO Process Graph Parser in the following way: - -```python -from openeo_pg_parser.translate import translate_process_graph -import urllib.request -import json - -eviJsonProcessGraphURL = 'https://raw.githubusercontent.com/Open-EO/openeo-odc/master/tests/process_graphs/evi.json' -jsonUrl = urllib.request.urlopen(eviJsonProcessGraphURL) -data = jsonUrl.read() -jsonProcessGraph = json.loads(data) -processGraph = translate_process_graph(jsonProcessGraph).sort(by='result') -for node in processGraph: - print(node.process_id) -``` - -## Python Processes for openEO - -* Repository: [openeo-processes-python](https://github.com/Open-EO/openeo-processes-python) - -This package is a Python implementation of a subset of [all openeo processes](../../processes.md). - -This package includes implementations based on numbers, numpy and xarray. Based on the type of your input data the -algorithm automatically chooses the right implementation. E.g. if your input is an `xarray.Dataarray` it will use the -xarray implementation of the called process. Independent of the input data type a process can therefore simply be -called by: - -```python -import openeo_processes as processes -processes.subtract(**{'x': 3,'y': 5}) -``` - -which in this case would use the number implementation and substract 5 from 3. The exact same function could also be -called with two `xarray.Dataarray`s as input. - -From a technical perspective, each process is implemented as a function and a corresponding class with up to four static -functions `exec_num`, `exec_np`, `exec_xar`, and `exec_da`. Within the context of Open Data Cube the most important -one is `exec_xar` - the xarray implementation. - -::: tip Note -The different data type implementations of a single process are completely independent. 
So one can easily add the -xarray implementation without editing the implementations for the other data types. -::: - -Implemented ODC processes -* load_collection -* load_result - -Implemented xarray processes: -* array_element -* is_nodata -* is_nan -* is_valid -* gt -* reduce_dimension -* apply -* save_result -* absolute -* sgn -* sqrt -* mean -* min -* max -* median -* sd -* variance -* quantiles -* cummin -* cummax -* cumproduct -* cumsum -* sum -* product -* add -* substract -* multiply -* divide -* apply_kernel - -## openEO to ODC Process Mapper - -* Repository: [openeo-odc](https://github.com/Open-EO/openeo-odc) - -openeo-odc maps an openEO process graph to an executable Python file and thereby ties together the other two packages. -The input is a plain openeo process graph in json / dict format (see below) and the output is a python file composed of -one function call per process (see below). In detail each process in the process graph is mapped to a function call to -[openeo-processes-python](https://github.com/openeo-processes-python) package. - -For this translation openeo-odc uses internally the [openeo-pg-parser-python](https://github.com/openeo-pg-parser-python) -packages. Leveraging the resulting python representation of the process graph [openeo-odc](https://github.com/openeo-odc) -each process in the process graph is mapped separately. Next to the correct mapping of the process itself the main task -of [openeo-odc](https://github.com/openeo-odc) is to correctly understand and pass given parameters and arguments -(this can be simple values but also references previous nodes). - -::: tip Note -opendatacube, xarray and openeo-processes-python are **not** dependencies because this package simply creates a -python file that can be executed in the correct environment where these dependencies are resolved. 
-::: - -[Sample process graph for an EVI calculation (Input).](https://github.com/Open-EO/openeo-odc/blob/master/tests/process_graphs/evi.json) - -Sample python output file, generated using the [testing script](https://github.com/Open-EO/openeo-odc/blob/master/tests/test_odc.py), -calling a number of processes implemented within [openeo-processes-python](https://github.com/openeo-processes-python) - notice that the -output of a previous function call can easily be used as input for another one. - -```python - -from dask.distributed import Client -import datacube -import openeo_processes as processes - -# Initialize ODC instance -cube = datacube.Datacube() -# Connect to Dask Scheduler -client = Client('tcp://xxx.yyy.zzz.kkk:8786') - -dc_0 = processes.load_collection(odc_cube=cube, **{'product': 'boa_sentinel_2', 'x': (11.2, 12.9), 'y': (47.1, 50.5), 'time': ['2018-06-15', '2018-06-17'], 'dask_chunks': {'time': 'auto', 'x': 1000, 'y': 1000}, 'measurements': ['B08', 'B04', 'B02']}) -nir_2 = processes.array_element(**{'data': dc_0, 'index': 0, 'dimension': 'bands'}) -red_3 = processes.array_element(**{'data': dc_0, 'index': 1, 'dimension': 'bands'}) -blue_4 = processes.array_element(**{'data': dc_0, 'index': 2, 'dimension': 'bands'}) -sub_5 = processes.subtract(**{'x': nir_2,'y': red_3}) -p1_6 = processes.multiply(**{'x': red_3,'y': 6}) -p2_7 = processes.multiply(**{'x': blue_4,'y': -7.5}) -sum_8 = processes.sum(**{'data': [10000, nir_2, p1_6, p2_7]}) -div_9 = processes.divide(**{'x': sub_5,'y': sum_8}) -p3_10 = processes.multiply(**{'x': div_9,'y': 2.5}) -evi_1 = processes.reduce_dimension(**{'data': p3_10, 'dimension': 'spectral', 'reducer': {}}) -min_12 = processes.min(**{'data': evi_1, 'dimension': 'time'}) -mintime_11 = processes.reduce_dimension(**{'data': min_12, 'dimension': 'temporal', 'reducer': {}}) -save_13 = processes.save_result(**{'data': mintime_11, 'format': 'netCDF'}) -``` - -## Adding a new process - -To add a new process which can be used with 
ODC/Xarray/Dask, there are changes required in two of the components: - -1. [openeo-processes-python](#openeo-processes-python) -2. [openeo-odc](#openeo-odc) - -No changes are required in openeo-pg-parser-python. - -::: warning Attention -Make sure you always have the latest version of the required repositories [openEO Process Graph Parser](https://github.com/Open-EO/openeo-pg-parser-python), [openEO Processes Python](https://github.com/Open-EO/openeo-processes-python) and [openEO ODC Process Mapper](https://github.com/Open-EO/openeo-odc) installed when you test your code. -::: - -### openeo-processes-python - -1. Select a process from [processes.openeo.org](https://processes.openeo.org/) which does not yet have a xarray - implementation in [openeo-processes-python](https://github.com/openeo-processes-python). -2. Clone [openeo-processes-python](https://github.com/openeo-processes-python), checkout a new branch, and start - implementing the missing process. If a function and class already exists for this process just implement the - `exec_xar` method, if not you also need to implement the function and class itself. Make sure you properly handle - all parameters defined for this process. Add a test for your process with xarray data as input (in the `conftest.py` - the `test_data`-fixture is available). -3. Push your code and open a PR. - -### openeo-odc - -1. Clone [openeo-odc](https://github.com/openeo-odc) and checkout a new branch. -2. Ensure that there is a mapper available for your newly implemented process. Currently the mapping is done based - on the input parameters, so some processes may automatically be supported, for some others an additional - mapper function must be implemented. -3. Check your mapping works by adding a test for the new process which correctly translates its dict representation - to the processes function call. -4. Push your code and open a PR. 
- -## Testing the new environment - -The following steps describe how to test that everything works: - -1. You can create a process graph using the [openEO Web Editor](https://editor.openeo.org/?server=https://openeo.eodc.eu/v1.0&discover=1) and store the corresponding JSON locally. -2. Modify the collection in `load_collection` with one available in your local Open Data Cube instance. Check that bands, temporal and spatial extent are available in your ODC product. Translate the process graph into an executable Python script: - ```python - from openeo_odc.map_to_odc import map_to_odc - from openeo_pg_parser.translate import translate_process_graph - - process_graph_json = 'your_test_process_graph.json' - odc_url = 'tcp://xxx.yyy.zzz.kkk:8786' - graph = translate_process_graph(process_graph_json, - process_defs).sort(by='result') - nodes = map_to_odc(graph, None, odc_url) - # Write to disk - with open(process_graph + ".py", "w") as f: - for node in nodes: - f.write(nodes[node]) - ``` -3. Execute the obtained script and check if it succeeds. - -## HTTP REST Interface - -The next step would be to set up a HTTP REST interface (i.e. an implementation of the openEO HTTP API) for the new openEO ODC environment. -It must be available in front of ODC to properly answer openEO client requests. -Currently, the [EODC](https://openeo.eodc.eu/v1.0) and [EURAC](https://openeo.eurac.edu/) back-ends use ODC and thus -are the first implementations of back-ends to look at. - -If you have any questions, please [contact us](../../../../contact.md). 
\ No newline at end of file diff --git a/documentation/1.0/developers/backends/xarray.md b/documentation/1.0/developers/backends/xarray.md new file mode 100644 index 000000000..425c0f67e --- /dev/null +++ b/documentation/1.0/developers/backends/xarray.md @@ -0,0 +1,95 @@ +# Getting started with openEO and Xarray and Dask + +As a back-end provider who wants to provide its datasets, processes and infrastructure to a broader audience through a +standardized interface you may want to implement a driver for openEO. + +First of all, you should carefully read the [getting started guide for service providers](./getting-started.md). + +::: tip Note +The Xarray-Dask implementation for openEO is not a full-fledged out-of-the-box openEO back-end, +but can be part of the infrastructure for the data management and processing part. +In detail it can be used as data source for [EO Data Discovery](../api/reference.md#tag/EO-Data-Discovery) and e.g. +in combination with a Dask cluster as processing back-end for [Data Processing](../api/reference.md#tag/Data-Processing). +In any case, an [HTTP REST interface must be available in front of process implementations to properly answer openEO requests](#http-rest-interface). +::: + +There are two main components involved with openEO and Xarray: +1. [Process Graph Parser for Python](#process-graph-parser-for-python) +2. [Python Processes for openEO](#python-processes-for-openeo) + +## Process Graph Parser for Python + +* Repository: [openeo-pg-parser-networkx](https://github.com/Open-EO/openeo-pg-parser-networkx) + +This pg-parser parses openEO process graphs from raw JSON into fully traversable networkx graph objects. + +The `ProcessRegistry` can be imported from the pg-parser and includes `Process` objects that include a +* spec: Process definition (e.g.
https://github.com/Open-EO/openeo-processes) +* implementation: Callable process implementation (https://github.com/Open-EO/openeo-processes-dask/tree/main/openeo_processes_dask/process_implementations) +* namespace + +The `ProcessRegistry` automatically maps from the name of a process to the `spec` and to the `implementation`. +Every `Process` in the `ProcessRegistry` requires a `spec`, while `implementation` and `namespace` are optional. + +An example of how to use the pg-parser can be found [here](https://github.com/Open-EO/openeo-pg-parser-networkx/blob/main/examples/01_minibackend_demo.ipynb). + +## Python Processes for openEO + +* Repository: [openeo-processes-dask](https://github.com/Open-EO/openeo-processes-dask) + +This package includes the implementations of openEO processes, using Xarray and Dask. Currently, the `load_collection` and `save_result` processes are not included as these implementations can differ widely for different back-ends. + +The `specs` can be found in the `openeo-processes-dask` as a submodule. That way, the specification and the implementation are stored close to each other. + +## The load_collection and save_result process + +As mentioned before, the `load_collection` and `save_result` processes are back-end-specific and therefore not included in [openeo-processes-dask](https://github.com/Open-EO/openeo-processes-dask). The [load_collection](https://processes.openeo.org/#load_collection) process should return a `raster-cube` object - to be compliant with the `openeo-processes-dask` implementations, this should be realized by an `xarray.DataArray` loaded with `dask`. + +### Connection to ODC and STAC + +For testing purposes with `DataArrays` - which can be loaded from one file - the `xarray.open_dataarray()` function can be used to implement a basic version of `load_collection`. + +Large data sets can be organised as `opendatacube Products` or as `STAC Collections`.
+ +* `opendatacube Products`: The implementation of `load_collection` can include the `opendatacube` function `datacube.Datacube.load()`. It is recommended to use the `dask_chunks` parameter when loading the data. The function returns an `xarray Dataset`; to be compliant with `openeo-processes-dask`, it can be converted to a `DataArray` using the `Dataset.to_array(dim='bands')` function. A sample `load_collection` process using Open Data Cube [can be found here](https://github.com/Open-EO/openeo_odc_driver/blob/c197387c10f8fef7d5573270a35961a278a18e1d/openeo_odc_driver/processing.py#L38). + +* `STAC Collections`: Alternatively, the `load_collection` process can be implemented using the `odc.stac.load()` function. To make use of `dask`, the `chunks` parameter must be set. Just as in the previous case, the resulting `xarray Dataset` can be converted to a `DataArray` with `Dataset.to_array(dim='bands')`. A similar implementation is the one of the `load_stac` process [available here](https://github.com/Open-EO/openeo-processes-dask/blob/9267e4ccffbbbf755cb7b8a43ba80d9483398314/openeo_processes_dask/process_implementations/cubes/load.py#L83). + +## openEO Client Side Processing + +The client-side processing functionality allows testing and using openEO with its processes locally, i.e. without any connection to an openEO back-end. +It relies on the projects [openeo-pg-parser-networkx](https://github.com/Open-EO/openeo-pg-parser-networkx), which provides an openEO process graph parsing tool, and [openeo-processes-dask](https://github.com/Open-EO/openeo-processes-dask), which provides an Xarray and Dask implementation of most openEO processes. + +You can find more information and usage examples in the openEO Python client documentation [available here](https://open-eo.github.io/openeo-python-client/cookbook/localprocessing.html).
+ +## Adding a new process + +To add a new process, there are changes required in the [openeo-processes-dask](https://github.com/Open-EO/openeo-processes-dask). + +1. Add the process spec +2. Add the process implementation + +The HTTP REST interface should have a `processes` endpoint that reflects the process specs from `openeo-processes-dask`. + +### Add the process spec + +Currently, [openeo-processes-dask](https://github.com/Open-EO/openeo-processes-dask) includes the process definitions as a `submodule` in the `openeo-processes-dask/specs`. The submodule can be found under https://github.com/eodcgmbh/openeo-processes, which is a fork of https://github.com/Open-EO/openeo-processes to reflect which processes (with their implementations) are actually available in `openeo-processes-dask`. + +### Add the process implementation + +1. Select a process from [processes.openeo.org](https://processes.openeo.org/) which does not yet have an + implementation in [openeo-processes-dask](https://github.com/Open-EO/openeo-processes-dask). +2. Clone [openeo-processes-dask](https://github.com/Open-EO/openeo-processes-dask), check out a new branch, and start implementing the missing process. Make sure you properly handle all parameters defined for this process. Add a test for your process in `openeo-processes-dask/tests`, ideally using dask. The `create_fake_rastercube` from the `openeo-processes-dask/tests/mockdata` can be used for testing, with the `backend` parameter set to `numpy` or `dask`. +3. Push your code and open a PR. + +## HTTP REST Interface + +The next step would be to set up an HTTP REST interface (i.e. an implementation of the openEO HTTP API) for the new openEO environment. +It must be available in front of the process implementations to properly answer openEO client requests.
+Currently, the [EODC](https://openeo.eodc.eu/v1.0) and [Eurac Research](https://openeo.eurac.edu/) back-ends use Xarray and Dask and thus +are the first implementations of back-ends to look at. + +- EODC is using a Python implementation, the [openeo-fastapi](https://github.com/eodcgmbh/openeo-fastapi). +- Eurac Research relies on a Java-based implementation, the [openeo-spring-driver](https://github.com/Open-EO/openeo-spring-driver). + +If you have any questions, please [contact us](../../../../contact.md).