Skip to content

Commit

Permalink
Create samples of data to reduce the volume of download
Browse files Browse the repository at this point in the history
  • Loading branch information
Thomas Rieutord committed Dec 4, 2023
1 parent 26978ed commit 56cf264
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 8 deletions.
17 changes: 9 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,17 @@ data
```

To download the data, use the following links, uncompress it and store it as indicated above.
Due to the large volume of data, we recommend to **download it the day before**.

#### All together (recommended)
#### Download zipped archives (recommended)

Several zipped archives are available for download, depending on your capacity and what you want to do.
* The [full data](https://drive.proton.me/urls/9NSPARVBHG#sdRkPZNng72D) contains weights, training data and land cover maps over all Europe (so-called EURAT domain): 45GB downloaded, 160GB uncompressed.
* The [sample data](https://drive.proton.me/urls/4JK0X0BQ2R#rdnXGeWbqWYj) contains weights, training data and land cover maps over Ireland: 2GB downloaded, 100GB uncompressed.
* The [sample TIF files](https://drive.proton.me/urls/GTKE99CVB4#IMZcMUYyvgJa) contain only land cover maps over Ireland: 150MB downloaded, 15GB uncompressed.

Using this [link](https://drive.proton.me/urls/9NSPARVBHG#sdRkPZNng72D), you can download all the data you need to use the package over Europe (so-called EURAT domain).
The downloaded volume is 45GB and you will need 160GB locally after decompression.
Due to the large volume of data, we recommend to **download it the day before**.
The data is already organised as explained earlier.
In case you want to focus on some specific part of the data, we provide more detailed information on how to download it separately.
If you download all the data together, you can skip the next subsections of [Data](#data) and go directly to [Check the installation](#check-the-installation).
Except for the sample TIF files, all zipped archives are already organised, so you can skip the next subsections of [Data](#data) and go directly to [Check the installation](#check-the-installation).

#### Landcovers

Expand All @@ -76,7 +78,7 @@ Here is the [link](https://drive.proton.me/urls/DWJ3ATQS9G#i4GptzWdUnC5) to down

#### Training data

Here is the [link](https://drive.proton.me/urls/AA5KJRYPCC#PD5E1XElNMpG) to download the HDF5 files used in the training (1.8 GB)
Here is the [link](https://drive.proton.me/urls/AA5KJRYPCC#PD5E1XElNMpG) to download the HDF5 files used in the training (1.8 GB downloaded, 85GB uncompressed)


### Check the installation
Expand All @@ -91,7 +93,6 @@ python tests/is_data_there.py [--tiff] [--weights] [--hdf5] [--all]
```



Usage
------

Expand Down
117 changes: 117 additions & 0 deletions drafts/create_sample_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Multiple land-cover/land-use Maps Translation (MMT)
Create test dataset
"""

import os
import sys

import numpy as np
import rasterio
import rasterio.warp  # bare "import rasterio" does not reliably expose rasterio.warp

from torchgeo import samplers

from mmt import _repopath_ as mmt_repopath
from mmt.datasets import landcovers
from mmt.datasets import transforms as mmt_transforms
from mmt.utils import domains


# Config
#--------
# Name of the geographical domain to extract; must match an attribute of mmt.utils.domains.
domainname = "ireland"
# Root directory where the sample data will be written.
# NOTE(review): dump_dir appears unused below — output paths are derived from
# each land cover's own `path` attribute instead. Confirm whether it is needed.
dump_dir = os.path.join(mmt_repopath, "sample-data")


# Land cover loading
#--------------------
# Instantiate every land cover reader at its native CRS and resolution.
esawc = landcovers.ESAWorldCover()
ecosg = landcovers.EcoclimapSG()
esgp = landcovers.EcoclimapSGplus()
esgml = landcovers.EcoclimapSGML()
qflags = landcovers.QualityFlagsECOSGplus()
print("Landcovers loaded with native CRS and resolution")  # was an f-string with no placeholders (F541)


# Extract and save data
#-----------------------
# Write one GeoTIFF per coarse land cover (ECOSG family + quality flags),
# cropped to the query domain, mirroring the original directory layout
# under sample-data/.
qdomain = getattr(domains, domainname)
qb = qdomain.to_tgbox()

for lc in [ecosg, esgp, esgml, qflags]:

    # Mirror the on-disk layout of the full dataset under sample-data/.
    tiffiledir = lc.path.replace("/data/", "/sample-data/")
    # exist_ok=True avoids the check-then-create race of `if not exists: makedirs`.
    os.makedirs(tiffiledir, exist_ok=True)

    tiffilename = os.path.join(
        tiffiledir,
        ".".join([lc.__class__.__name__, domainname, "tif"])
    )

    print(f"Extracting {lc.__class__.__name__} over {domainname} in {tiffilename}")
    x = lc[qb]["mask"].squeeze().numpy()

    # Domain bounds are expressed in lon/lat (EPSG:4326); reproject them to
    # the land cover's native CRS before building the affine transform.
    xmin, ymin, xmax, ymax = rasterio.warp.transform_bounds(
        rasterio.crs.CRS.from_epsg(4326), lc.crs, *qdomain.to_lbrt()
    )
    width = x.shape[1]
    height = x.shape[0]
    transform = rasterio.transform.from_bounds(
        xmin, ymin, xmax, ymax, width, height
    )
    kwargs = {
        "driver": "GTiff",  # canonical GDAL short name (was "gTiff")
        # Labels are small positive ints; NOTE(review): int8 requires SIGNEDBYTE
        # support in GDAL — confirm uint8 would not be safer here.
        "dtype": "int8",
        "nodata": 0,
        "count": 1,
        "crs": lc.crs,
        "transform": transform,
        "width": width,
        "height": height,
    }
    # Context manager guarantees the dataset is flushed and closed.
    with rasterio.open(tiffilename, "w", **kwargs) as dst:
        dst.write(x, 1)

# Extract ESA World Cover
#-------------------------
# ESA WorldCover is too large to export as a single TIF, so it is tiled with
# a grid sampler: 9000-px tiles with an 8000-px stride (1000-px overlap
# between neighbouring tiles), one GeoTIFF per tile.
tiffiledir = esawc.path.replace("/data/", "/sample-data/")
# exist_ok=True avoids the check-then-create race of `if not exists: makedirs`.
os.makedirs(tiffiledir, exist_ok=True)

print(f"Extracting {esawc.__class__.__name__} over {domainname} in {tiffiledir}")

sampler = samplers.GridGeoSampler(
    esawc, size=9000, stride=8000, roi=qb
)
# `enumerate` iterates the sampler directly; the extra `iter()` was redundant.
for i, iqb in enumerate(sampler):
    tiffilename = os.path.join(
        tiffiledir,
        ".".join([esawc.__class__.__name__, domainname, f"i{i}", "tif"])
    )
    if i % 10 == 0:
        # Progress report every 10 tiles.
        print(f"  [{i}/{len(sampler)}] tiffilename={tiffilename}")

    x = esawc[iqb]["mask"].squeeze().numpy()

    # Tile bounds come from the sampler's bounding box, already in the
    # land cover's CRS — no reprojection needed here.
    xmin = iqb.minx
    ymin = iqb.miny
    xmax = iqb.maxx
    ymax = iqb.maxy
    width = x.shape[1]
    height = x.shape[0]
    transform = rasterio.transform.from_bounds(
        xmin, ymin, xmax, ymax, width, height
    )
    kwargs = {
        "driver": "GTiff",  # canonical GDAL short name (was "gTiff")
        "dtype": "int16",
        "nodata": 0,
        "count": 1,
        "crs": esawc.crs,
        "transform": transform,
        "width": width,
        "height": height,
    }
    # Context manager guarantees the dataset is flushed and closed.
    with rasterio.open(tiffilename, "w", **kwargs) as dst:
        dst.write(x, 1)

0 comments on commit 56cf264

Please sign in to comment.