Skip to content

Commit

Permalink
Create samples of data to reduce the volume of download
Browse files Browse the repository at this point in the history
  • Loading branch information
Thomas Rieutord committed Dec 4, 2023
1 parent 26978ed commit 56cf264
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 8 deletions.
17 changes: 9 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,17 @@ data
```

To download the data, use the following links, uncompress it and store it as indicated above.
Due to the large volume of data, we recommend to **download it the day before**.

#### All together (recommended)
#### Download zipped archives (recommended)

Several zipped archives are available for download, depending on your capacity and what you want to do.
* The [full data](https://drive.proton.me/urls/9NSPARVBHG#sdRkPZNng72D) contains weights, training data and land cover maps over all Europe (so-called EURAT domain): 45GB downloaded, 160GB uncompressed.
* The [sample data](https://drive.proton.me/urls/4JK0X0BQ2R#rdnXGeWbqWYj) contains weights, training data and land cover maps over Ireland: 2GB downloaded, 100GB uncompressed.
* The [sample TIF files](https://drive.proton.me/urls/GTKE99CVB4#IMZcMUYyvgJa) contain only land cover maps over Ireland: 150MB downloaded, 15GB uncompressed.

Using this [link](https://drive.proton.me/urls/9NSPARVBHG#sdRkPZNng72D), you can download all the data you need to use the package over Europe (so-called EURAT domain).
The downloaded volume is 45GB and you will need 160GB locally after decompression.
Due to the large volume of data, we recommend to **download it the day before**.
The data is already organised as explained earlier.
In case you want to focus on some specific part of the data, we provide more detailed information on how to download it separately.
If you download all the data together, you can skip the next subsections of [Data](#data) and go directly to [Check the installation](#check-the-installation).
Except for the sample TIF files, all zipped archives are already organised, so you can skip the next subsections of [Data](#data) and go directly to [Check the installation](#check-the-installation).

#### Landcovers

Expand All @@ -76,7 +78,7 @@ Here is the [link](https://drive.proton.me/urls/DWJ3ATQS9G#i4GptzWdUnC5) to down

#### Training data

Here is the [link](https://drive.proton.me/urls/AA5KJRYPCC#PD5E1XElNMpG) to download the HDF5 files used in the training (1.8 GB)
Here is the [link](https://drive.proton.me/urls/AA5KJRYPCC#PD5E1XElNMpG) to download the HDF5 files used in the training (1.8 GB downloaded, 85GB uncompressed)


### Check the installation
Expand All @@ -91,7 +93,6 @@ python tests/is_data_there.py [--tiff] [--weights] [--hdf5] [--all]
```



Usage
------

Expand Down
117 changes: 117 additions & 0 deletions drafts/create_sample_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Multiple land-cover/land-use Maps Translation (MMT)
Create test dataset
"""

import os
import sys

import numpy as np
import rasterio
import rasterio.warp  # bare "import rasterio" does not reliably expose rasterio.warp

from torchgeo import samplers

from mmt import _repopath_ as mmt_repopath
from mmt.datasets import landcovers
from mmt.datasets import transforms as mmt_transforms
from mmt.utils import domains


# Config
#--------
# Name of the geographical domain to extract; must match an attribute of mmt.utils.domains.
domainname = "ireland"
# Root directory where the sample data will be written.
# NOTE(review): dump_dir appears unused below — output paths are derived from
# each land cover's own `path` attribute instead. Confirm whether it is needed.
dump_dir = os.path.join(mmt_repopath, "sample-data")


# Land cover loading
#--------------------
# Instantiate every land cover reader at its native CRS and resolution.
esawc = landcovers.ESAWorldCover()
ecosg = landcovers.EcoclimapSG()
esgp = landcovers.EcoclimapSGplus()
esgml = landcovers.EcoclimapSGML()
qflags = landcovers.QualityFlagsECOSGplus()
print("Landcovers loaded with native CRS and resolution")  # was an f-string with no placeholders (F541)


# Extract and save data
#-----------------------
# Write one GeoTIFF per coarse land cover (ECOSG family + quality flags),
# cropped to the query domain, mirroring the original directory layout
# under sample-data/.
qdomain = getattr(domains, domainname)
qb = qdomain.to_tgbox()

for lc in [ecosg, esgp, esgml, qflags]:

    # Mirror the on-disk layout of the full dataset under sample-data/.
    tiffiledir = lc.path.replace("/data/", "/sample-data/")
    # exist_ok=True avoids the check-then-create race of `if not exists: makedirs`.
    os.makedirs(tiffiledir, exist_ok=True)

    tiffilename = os.path.join(
        tiffiledir,
        ".".join([lc.__class__.__name__, domainname, "tif"])
    )

    print(f"Extracting {lc.__class__.__name__} over {domainname} in {tiffilename}")
    x = lc[qb]["mask"].squeeze().numpy()

    # Domain bounds are expressed in lon/lat (EPSG:4326); reproject them to
    # the land cover's native CRS before building the affine transform.
    xmin, ymin, xmax, ymax = rasterio.warp.transform_bounds(
        rasterio.crs.CRS.from_epsg(4326), lc.crs, *qdomain.to_lbrt()
    )
    width = x.shape[1]
    height = x.shape[0]
    transform = rasterio.transform.from_bounds(
        xmin, ymin, xmax, ymax, width, height
    )
    kwargs = {
        "driver": "GTiff",  # canonical GDAL short name (was "gTiff")
        # Labels are small positive ints; NOTE(review): int8 requires SIGNEDBYTE
        # support in GDAL — confirm uint8 would not be safer here.
        "dtype": "int8",
        "nodata": 0,
        "count": 1,
        "crs": lc.crs,
        "transform": transform,
        "width": width,
        "height": height,
    }
    # Context manager guarantees the dataset is flushed and closed.
    with rasterio.open(tiffilename, "w", **kwargs) as dst:
        dst.write(x, 1)

# Extract ESA World Cover
#-------------------------
# ESA WorldCover is too large to export as a single TIF, so it is tiled with
# a grid sampler: 9000-px tiles with an 8000-px stride (1000-px overlap
# between neighbouring tiles), one GeoTIFF per tile.
tiffiledir = esawc.path.replace("/data/", "/sample-data/")
# exist_ok=True avoids the check-then-create race of `if not exists: makedirs`.
os.makedirs(tiffiledir, exist_ok=True)

print(f"Extracting {esawc.__class__.__name__} over {domainname} in {tiffiledir}")

sampler = samplers.GridGeoSampler(
    esawc, size=9000, stride=8000, roi=qb
)
# `enumerate` iterates the sampler directly; the extra `iter()` was redundant.
for i, iqb in enumerate(sampler):
    tiffilename = os.path.join(
        tiffiledir,
        ".".join([esawc.__class__.__name__, domainname, f"i{i}", "tif"])
    )
    if i % 10 == 0:
        # Progress report every 10 tiles.
        print(f"  [{i}/{len(sampler)}] tiffilename={tiffilename}")

    x = esawc[iqb]["mask"].squeeze().numpy()

    # Tile bounds come from the sampler's bounding box, already in the
    # land cover's CRS — no reprojection needed here.
    xmin = iqb.minx
    ymin = iqb.miny
    xmax = iqb.maxx
    ymax = iqb.maxy
    width = x.shape[1]
    height = x.shape[0]
    transform = rasterio.transform.from_bounds(
        xmin, ymin, xmax, ymax, width, height
    )
    kwargs = {
        "driver": "GTiff",  # canonical GDAL short name (was "gTiff")
        "dtype": "int16",
        "nodata": 0,
        "count": 1,
        "crs": esawc.crs,
        "transform": transform,
        "width": width,
        "height": height,
    }
    # Context manager guarantees the dataset is flushed and closed.
    with rasterio.open(tiffilename, "w", **kwargs) as dst:
        dst.write(x, 1)

0 comments on commit 56cf264

Please sign in to comment.