Skip to content

Commit

Permalink
Update selection of vintages
Browse files Browse the repository at this point in the history
  • Loading branch information
tgrandje committed Sep 25, 2024
1 parent fee9e23 commit c816c77
Showing 1 changed file with 67 additions and 65 deletions.
132 changes: 67 additions & 65 deletions argo-pipeline/src/select_downstream_vintage_to_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,97 +14,99 @@
import json

from cartiflette.config import (
BUCKET,
PATH_WITHIN_BUCKET,
FS,
DATASETS_HIGH_RESOLUTION,
)
from cartiflette.pipeline_constants import COG_TERRITOIRE, IRIS

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

logger.info("=" * 50)
logger.info("\n" + __doc__)
logger.info("\n%s", __doc__)
logger.info("=" * 50)

parser = argparse.ArgumentParser(description="Select vintage to update")
parser.add_argument(
"--download_results",
type=str,
default=None,
default="{}",
help="Results of download pipeline",
)

args = parser.parse_args()
download_results = args.download_results

# TODO : add bucket/path_within_bucket/fs to pipeline args
download_results = json.loads(download_results)

# Example of download_results
# {"IGN": {"ADMINEXPRESS": {"EXPRESS-COG-TERRITOIRE": {"guadeloupe": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=5490/origin=raw/vectorfile_format=shp/territory=guadeloupe/simplification=0/COMMUNE.shp"]}}}, "martinique": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=5490/origin=raw/vectorfile_format=shp/territory=martinique/simplification=0/COMMUNE.shp"]}}}, "guyane": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=2972/origin=raw/vectorfile_format=shp/territory=guyane/simplification=0/COMMUNE.shp"]}}}, "reunion": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=2975/origin=raw/vectorfile_format=shp/territory=reunion/simplification=0/COMMUNE.shp"]}}}, "mayotte": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=4326/origin=raw/vectorfile_format=shp/territory=mayotte/simplification=0/COMMUNE.shp"]}}}, "metropole": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=2154/origin=raw/vectorfile_format=shp/territory=metropole/simplification=0/COMMUNE.shp"]}}}}}}, "Insee": {"COG": {"DEPARTEMENT": {"france_entiere": {"2024": {"downloaded": false, "paths": null}}}, "REGION": {"france_entiere": {"2024": {"downloaded": false, "paths": null}}}}, "TAGC": 
{"APPARTENANCE": {"france_entiere": {"2024": {"downloaded": true, "paths": {"table-appartenance-geo-communes-2024": ["projet-cartiflette/test/provider=Insee/dataset_family=TAGC/source=APPARTENANCE/year=2024/administrative_level=None/crs=None/origin=raw/vectorfile_format=xlsx/territory=france_entiere/simplification=0/table-appartenance-geo-communes-2024.xlsx"]}}}}}}}

download_results = json.loads(download_results)

if os.environ.get("ENVIRONMENT", None) == "dev":
logging.warning("dev environment -> force generation of each vintage")
json_md5 = f"{BUCKET}/{PATH_WITHIN_BUCKET}/md5.json"
with FS.open(json_md5, "r") as f:
all_md5 = json.load(f)
datasets = all_md5["IGN"]["ADMINEXPRESS"]["EXPRESS-COG-CARTO-TERRITOIRE"]
years = list(
{
year
for (_territory, vintaged_datasets) in datasets.items()
for year in vintaged_datasets.keys()
}
)
logging.warning("dev environment -> force generation of only 2023 & 2024")

# Example of download_results
# {"IGN": {"ADMINEXPRESS": {"EXPRESS-COG-TERRITOIRE": {"guadeloupe": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=5490/origin=raw/vectorfile_format=shp/territory=guadeloupe/simplification=0/COMMUNE.shp"]}}}, "martinique": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=5490/origin=raw/vectorfile_format=shp/territory=martinique/simplification=0/COMMUNE.shp"]}}}, "guyane": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=2972/origin=raw/vectorfile_format=shp/territory=guyane/simplification=0/COMMUNE.shp"]}}}, "reunion": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=2975/origin=raw/vectorfile_format=shp/territory=reunion/simplification=0/COMMUNE.shp"]}}}, "mayotte": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=4326/origin=raw/vectorfile_format=shp/territory=mayotte/simplification=0/COMMUNE.shp"]}}}, "metropole": {"2024": {"downloaded": true, "paths": {"COMMUNE": ["projet-cartiflette/test/provider=IGN/dataset_family=ADMINEXPRESS/source=EXPRESS-COG-TERRITOIRE/year=2024/administrative_level=None/crs=2154/origin=raw/vectorfile_format=shp/territory=metropole/simplification=0/COMMUNE.shp"]}}}}}}, "Insee": {"COG": {"DEPARTEMENT": {"france_entiere": {"2024": {"downloaded": false, "paths": null}}}, "REGION": {"france_entiere": {"2024": {"downloaded": false, "paths": null}}}}, "TAGC": 
{"APPARTENANCE": {"france_entiere": {"2024": {"downloaded": true, "paths": {"table-appartenance-geo-communes-2024": ["projet-cartiflette/test/provider=Insee/dataset_family=TAGC/source=APPARTENANCE/year=2024/administrative_level=None/crs=None/origin=raw/vectorfile_format=xlsx/territory=france_entiere/simplification=0/table-appartenance-geo-communes-2024.xlsx"]}}}}}}}

# Only need to update geodata if IGN files from EXPRESS-COG-TERRITOIRE have been
# updated
years_geodata = set()
try:
raw_geodatasets = download_results["IGN"]["ADMINEXPRESS"][
"EXPRESS-COG-CARTO-TERRITOIRE"
]
except KeyError:
years_geodata = []
else:
for _territory, dict_results in raw_geodatasets.items():
for year, dict_results_this_year in dict_results.items():
if dict_results_this_year["downloaded"]:
years_geodata.add(year)

finally:
years_geodata = sorted(list(years_geodata))
def store_to_json(name, years):
    """
    Write a selection of vintages to a JSON file (used for argo results).

    Parameters
    ----------
    name : str
        Path of the JSON file to write; an existing file is overwritten.
    years : list
        Selected vintages; must be JSON-serializable.

    Returns
    -------
    list
        The `years` argument, returned unchanged so that callers can
        store and return the selection in a single statement.
    """
    with open(name, "w", encoding="utf8") as stream:
        json.dump(years, stream)
    return years


def filter_geodata(results):
"filter the downloaded vintages of geodatasets"
if os.environ.get("ENVIRONMENT", None) == "dev":
years_geodata = years

with open("geodatasets_years.json", "w") as out:
json.dump(years_geodata, out)
logger.info("selected downstream geodatasets : %s", years_geodata)


years_metadata = set()
try:
raw_datasets = download_results["Insee"]["COG"]
except KeyError:
years_metadata = []
else:
targets = ["DEPARTEMENT", "REGION", "TAGC"]
for target in targets:
try:
for _territory, dict_results in raw_datasets[target].items():
return store_to_json("geodatasets_years.json", [2023, 2024])

years = set()
keys_geo = (
("ADMINEXPRESS", COG_TERRITOIRE[DATASETS_HIGH_RESOLUTION]),
("IRIS", IRIS[DATASETS_HIGH_RESOLUTION]),
)
try:
raw = [results["IGN"][family][geo] for family, geo in keys_geo]
except KeyError:
years = []
else:
for dset in raw:
for dict_results in dset.values():
for year, dict_results_this_year in dict_results.items():
if dict_results_this_year["downloaded"]:
years_metadata.add(year)
except KeyError:
pass
finally:
years_metadata = sorted(list(years_metadata))
years.add(year)

years = sorted(list(years))
logger.info("selected downstream geodatasets : %s", years)
return store_to_json("geodatasets_years.json", years)

if os.environ.get("ENVIRONMENT", None) == "dev":
years_metadata = years

with open("metadata_years.json", "w") as out:
json.dump(years_metadata, out)
logger.info("selected downstream metadata : %s", years_metadata)
def filter_metadata(results):
    """
    Select the vintages of metadata datasets which have been downloaded.

    Every provider except "IGN" is scanned. A vintage is selected as soon as
    at least one of its entries is flagged as downloaded. The selection is
    written to "metadata_years.json" through `store_to_json`.

    When the ENVIRONMENT environment variable is set to "dev", `results` is
    ignored and the selection is forced to [2023, 2024].

    Parameters
    ----------
    results : dict
        Parsed results of the download pipeline, nested as
        provider -> dataset family -> source -> territory -> year ->
        {"downloaded": bool, "paths": ...}.

    Returns
    -------
    list
        Sorted selected vintages (as returned by `store_to_json`).

    Raises
    ------
    KeyError
        If a vintage entry has no "downloaded" key.
    """
    if os.environ.get("ENVIRONMENT", None) == "dev":
        return store_to_json("metadata_years.json", [2023, 2024])

    # Note: iterating over `results.items()` cannot raise a KeyError, so no
    # try/except is needed to filter out the IGN provider.
    years = set()
    for provider, families in results.items():
        if provider == "IGN":
            # IGN geodatasets are not metadata: skipped here
            continue
        for sources in families.values():
            for territories in sources.values():
                for vintages in territories.values():
                    for year, result_this_year in vintages.items():
                        if result_this_year["downloaded"]:
                            years.add(year)

    years = sorted(years)
    logger.info("selected downstream metadatasets : %s", years)
    return store_to_json("metadata_years.json", years)


# Script entry point: compute both vintage selections from the parsed
# --download_results payload. Each call stores its selection in its own
# JSON file (geodatasets_years.json and metadata_years.json respectively).
if __name__ == "__main__":
filter_geodata(download_results)
filter_metadata(download_results)

0 comments on commit c816c77

Please sign in to comment.