Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Overwriting files that were lazily loaded results in permission error… #241

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion hydromt_sfincs/quadtree.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import xugrid as xu
from pyproj import CRS, Transformer

from hydromt_sfincs.utils import xu_open_dataset
from hydromt_sfincs.utils import xu_open_dataset, check_exists_and_lazy

# optional dependency
try:
Expand Down Expand Up @@ -125,6 +125,8 @@ def write(self, file_name: Union[str, Path] = "sfincs.nc", version: int = 0):
)

ds.attrs = attrs
# before writing, check if the file already exists while data is still lazily loaded
check_exists_and_lazy(ds, file_name)
ds.to_netcdf(file_name)

def map_overlay(self, file_name, xlim=None, ylim=None, color="black", width=800):
Expand Down
2 changes: 2 additions & 0 deletions hydromt_sfincs/sfincs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3337,6 +3337,8 @@ def write_forcing(self, data_vars: Union[List, str] = None, fmt: str = "%7.2f"):
self.write_vector(variables=f"forcing.{list(rename.keys())[0]}")
# write 2D gridded timeseries
else:
# before writing, check if the file already exists while data is still lazily loaded
utils.check_exists_and_lazy(ds, fn)
ds.to_netcdf(fn, encoding=encoding)

def read_states(self):
Expand Down
2 changes: 2 additions & 0 deletions hydromt_sfincs/subgrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,8 @@ def write(self, file_name):
# fix names to match SFINCS convention
# ds = ds.rename_vars({"uv_navg": "uv_navg_w", "uv_ffit": "uv_fnfit"})

# before writing, check if the file already exists while data is still lazily loaded
utils.check_exists_and_lazy(ds, file_name)
ds.to_netcdf(file_name)


Expand Down
29 changes: 29 additions & 0 deletions hydromt_sfincs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import copy
import io
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Union
Expand Down Expand Up @@ -53,6 +54,8 @@
"rotated_grid",
"build_overviews",
"find_uv_indices",
"xu_open_dataset",
"check_exists_and_lazy",
]

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -1338,3 +1341,29 @@ def xu_open_dataset(*args, **kwargs):
"""
with xr.open_dataset(*args, **kwargs) as ds:
return xu.UgridDataset(ds)


def check_exists_and_lazy(ds, file_name):
    """Load ``ds`` into memory if it lazily references an existing target file.

    A dataset opened lazily from ``file_name`` keeps an open handle on that
    file, so writing to the same path can fail with a ``PermissionError``
    (notably on Windows). If ``file_name`` already exists and any variable of
    ``ds`` is not yet in memory, the whole dataset is loaded so the file can
    safely be overwritten afterwards.

    Parameters
    ----------
    ds : xarray.Dataset, xu.UgridDataset
        The dataset to be written to a netcdf file.
    file_name : str
        The path to the netcdf file.
    """
    # If the target does not exist yet there can be no open handle on it
    # from a lazy read, so nothing needs to be done.
    if not os.path.exists(file_name):
        return

    # A variable that is not `_in_memory` still references the file on disk.
    # NOTE(review): ds.load() is reported not to work for xugrid.UgridDataset
    # objects -- confirm this path is only reached with plain xarray Datasets.
    if any(not data_array._in_memory for data_array in ds.data_vars.values()):
        ds.load()  # pull all lazily referenced data into memory before writing
Comment on lines +1346 to +1369
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make sure to call ds.close() on the file after loading the contents

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We discussed earlier that ds.close() does not work for xugrid.UgridDatasets right?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes, I keep forgetting.
Unfortunately, for the same reason ds.load() will also not work then :s

34 changes: 34 additions & 0 deletions tests/test_quadtree.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from os.path import join, dirname, abspath
import numpy as np
import os
from pyproj import CRS
import shutil

from hydromt_sfincs import utils
from hydromt_sfincs.quadtree import QuadtreeGrid

TESTDATADIR = join(dirname(abspath(__file__)), "data")
Expand Down Expand Up @@ -35,3 +38,34 @@ def test_quadtree_io(tmpdir):
assert np.sum(qtr2.data["msk"].values) == 4298
# assert the dep variable is the same
assert np.sum(qtr.data["dep"].values) == np.sum(qtr2.data["dep"].values)


def test_overwrite_quadtree_nc(tmpdir):
    """Overwriting a lazily read netcdf only works after loading it in memory."""
    import pytest  # local import: only this test needs it

    ncfile = join(TESTDATADIR, "sfincs_test_quadtree", "sfincs.nc")
    nc_copy = join(str(tmpdir), "sfincs.nc")

    # Work on a copy so the test data itself is never modified
    shutil.copy(ncfile, nc_copy)

    # xu_open_dataset opens the file lazily, keeping a handle on nc_copy.
    # Use a distinct name per object so neither is garbage collected early,
    # which could silently release the file handle and mask the failure.
    uds = utils.xu_open_dataset(nc_copy)
    ds = uds.ugrid.to_dataset()

    # Writing to the same path must fail while the data still lazily
    # references the file (PermissionError on Windows; behaviour on POSIX
    # to be confirmed -- see PR discussion).
    with pytest.raises(PermissionError):
        ds.to_netcdf(nc_copy)

    # After loading the lazy data into memory, the overwrite should succeed
    utils.check_exists_and_lazy(ds, nc_copy)
    ds.to_netcdf(nc_copy)

    # Remove the copied file
    os.remove(nc_copy)
Loading