diff --git a/h5pyd/_apps/utillib.py b/h5pyd/_apps/utillib.py
index dd0b966..8ccc098 100755
--- a/h5pyd/_apps/utillib.py
+++ b/h5pyd/_apps/utillib.py
@@ -338,7 +338,7 @@ def copy_array(src_arr, ctx):
 
     if has_reference(src_arr.dtype):
         # flatten array to simplify iteration
-        count = np.product(src_arr.shape)
+        count = int(np.prod(src_arr.shape))
         tgt_arr_flat = tgt_arr.reshape((count,))
         src_arr_flat = src_arr.reshape((count,))
         for i in range(count):
diff --git a/h5pyd/_hl/dataset.py b/h5pyd/_hl/dataset.py
index 5077bfb..ac07e37 100644
--- a/h5pyd/_hl/dataset.py
+++ b/h5pyd/_hl/dataset.py
@@ -1350,17 +1350,16 @@ def __setitem__(self, args, val):
                 # Attempt to directly convert the input array of vlen data to its base class
                 val = numpy.asarray(val, dtype=vlen_base_class)
-            except ValueError as ve:
+            except (ValueError, TypeError):
                 # Failed to convert input array to vlen base class directly, instead create a new array where
                 # each element is an array of the Dataset's dtype
-                self.log.debug(f"asarray ValueError: {ve}")
                 try:
                     # Force output shape
                     tmp = numpy.empty(shape=val.shape, dtype=self.dtype)
                     tmp[:] = [numpy.array(x, dtype=self.dtype) for x in val]
                     val = tmp
-                except ValueError as e:
-                    msg = f"ValueError converting value element by element: {e}"
+                except (ValueError, TypeError):
+                    msg = "Error converting value element by element"
                     self.log.debug(msg)
 
             if vlen_base_class == val.dtype:
@@ -1589,19 +1588,21 @@ def write_direct(self, source, source_sel=None, dest_sel=None):
             data = source.__getitem__(slices)
         self.__setitem__(dest_sel, data)
 
-    def __array__(self, dtype=None):
-        """Create a Numpy array containing the whole dataset. DON'T THINK
-        THIS MEANS DATASETS ARE INTERCHANGABLE WITH ARRAYS. For one thing,
-        you have to read the whole dataset everytime this method is called.
- """ - arr = numpy.empty(self._shape, dtype=self.dtype if dtype is None else dtype) + def __array__(self, dtype=None, copy=True): + if copy is False: + raise ValueError( + f"AstypeWrapper.__array__ received {copy=} " + f"but memory allocation cannot be avoided on read" + ) # Special case for (0,)*-shape datasets if self._shape is None or numpy.prod(self._shape) == 0: - return arr + return numpy.empty(self._shape, dtype=self.dtype if dtype is None else dtype) - self.read_direct(arr) - return arr + data = self[:] + if dtype is not None: + return data.astype(dtype, copy=False) + return data def __repr__(self): if not self: diff --git a/pyproject.toml b/pyproject.toml index cfdbdd6..8f23017 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ requires-python = ">=3.8" version = "0.18.0" dependencies = [ - "numpy >= 1.17.3, < 2.0.0", + "numpy >=2.0.0, <3", "requests_unixsocket", "pytz", "pyjwt", diff --git a/test/hl/test_attribute.py b/test/hl/test_attribute.py index 4c524c7..21d2290 100644 --- a/test/hl/test_attribute.py +++ b/test/hl/test_attribute.py @@ -26,7 +26,7 @@ class TestAttribute(TestCase): def test_create(self): - filename = self.getFileName("create_attribfute") + filename = self.getFileName("create_attribute") print("filename:", filename) f = h5py.File(filename, 'w') @@ -61,7 +61,7 @@ def test_create(self): self.assertEqual(value, "Hello HDF") # create attribute with as a fixed length string - g1.attrs.create('d1', np.string_("This is a numpy string")) + g1.attrs.create('d1', np.bytes_("This is a numpy string")) value = g1.attrs['d1'] self.assertEqual(value, b"This is a numpy string") @@ -89,7 +89,7 @@ def test_create(self): self.assertEqual(arr[i], 1) # array of strings - g1.attrs['strings'] = [np.string_("Hello"), np.string_("Good-bye")] + g1.attrs['strings'] = [np.bytes_("Hello"), np.bytes_("Good-bye")] arr = g1.attrs['strings'] self.assertEqual(arr.shape, (2,)) self.assertEqual(arr[0], b"Hello") diff --git a/test/hl/test_dataset.py b/test/hl/test_dataset.py index d10c304..30a101b 100644 --- a/test/hl/test_dataset.py +++ b/test/hl/test_dataset.py @@ -985,7 +985,7 @@ def test_vlen_unicode(self): def test_string_fixed(self): """ Assignment of fixed-length byte string produces a fixed-length ascii dataset """ - self.f['x'] = np.string_("Hello there") + self.f['x'] = np.bytes_("Hello there") ds = self.f['x'] self.assert_string_type(ds, 'H5T_CSET_ASCII', variable=False) if self.is_hsds(): @@ -1278,7 +1278,7 @@ def test_roundtrip_fixed_bytes(self): data = b"Hello\xef" ds[0] = data out = ds[0] - self.assertEqual(type(out), np.string_) + self.assertEqual(type(out), np.bytes_) self.assertEqual(out, data) def test_retrieve_vlen_unicode(self): @@ -1727,7 +1727,8 @@ def test_convert(self): self.assertArrayEqual(ds[0], np.array([1, 1])) self.assertArrayEqual(ds[1], np.array([1])) self.assertArrayEqual(ds[2], np.array([1, 2, 3])) - ds[0:2] = np.array([[0.1, 1.1, 2.1, 3.1, 4], np.arange(4)], dtype=object) + test_arr = np.array([[0.1, 1.1, 2.1, 3.1, 4], np.arange(4)], dtype=object) + ds[0:2] = test_arr self.assertArrayEqual(ds[0], np.arange(5)) self.assertArrayEqual(ds[1], np.arange(4)) ds[0:2] = np.array([np.array([0.1, 1.2, 2.2]), @@ -1987,494 +1988,6 @@ def test_basetype_commutative(self,): assert (val != dset) == (dset != val) -@ut.skipIf(config.get('use_h5py'), "h5py does not support MultiManager") -class TestMultiManager(BaseDataset): - def test_multi_read_scalar_dataspaces(self): - """ - Test reading from multiple datasets with scalar dataspaces - """ - shape = () - count 
= 3 - dt = np.int32 - - # Create datasets - data_in = np.array(1, dtype=dt) - datasets = [] - - for i in range(count): - dset = self.f.create_dataset("data" + str(i), shape, - dtype=dt, data=(data_in + i)) - datasets.append(dset) - - mm = MultiManager(datasets) - - # Select via empty tuple - data_out = mm[()] - - self.assertEqual(len(data_out), count) - - for i in range(count): - np.testing.assert_array_equal(data_out[i], data_in + i) - - # Select via Ellipsis - data_out = mm[...] - - self.assertEqual(len(data_out), count) - - for i in range(count): - np.testing.assert_array_equal(data_out[i], data_in + i) - - def test_multi_read_non_scalar_dataspaces(self): - """ - Test reading from multiple datasets with non-scalar dataspaces - """ - shape = (10, 10, 10) - count = 3 - dt = np.int32 - - # Create datasets - data_in = np.reshape(np.arange(np.prod(shape)), shape) - datasets = [] - - for i in range(count): - dset = self.f.create_dataset("data" + str(i), shape, - dtype=dt, data=(data_in + i)) - datasets.append(dset) - - mm = MultiManager(datasets) - data_out = mm[...] - - self.assertEqual(len(data_out), count) - - for i in range(count): - np.testing.assert_array_equal(data_out[i], data_in + i) - - # Partial Read - data_out = mm[:, :, 0] - - self.assertEqual(len(data_out), count) - - for i in range(count): - np.testing.assert_array_equal(data_out[i], (data_in + i)[:, :, 0]) - - def test_multi_read_mixed_dataspaces(self): - """ - Test reading from multiple datasets with scalar and - non-scalar dataspaces - """ - scalar_shape = () - shape = (10, 10, 10) - count = 3 - dt = np.int32 - - # Create datasets - data_scalar_in = np.array(1) - data_nonscalar_in = np.reshape(np.arange(np.prod(shape)), shape) - data_in = [data_scalar_in, data_nonscalar_in, - data_nonscalar_in, data_nonscalar_in] - datasets = [] - - for i in range(count): - if i == 0: - dset = self.f.create_dataset("data" + str(0), scalar_shape, - dtype=dt, data=data_scalar_in) - else: - dset = self.f.create_dataset("data" + str(i), shape, - dtype=dt, data=(data_nonscalar_in + i)) - datasets.append(dset) - - # Set up MultiManager for read - mm = MultiManager(datasets=datasets) - - # Select via empty tuple - data_out = mm[()] - - self.assertEqual(len(data_out), count) - - for i in range(count): - if i == 0: - np.testing.assert_array_equal(data_out[i], data_in[i]) - else: - np.testing.assert_array_equal(data_out[i], data_in[i] + i) - - # Select via Ellipsis - data_out = mm[...] - - self.assertEqual(len(data_out), count) - - for i in range(count): - if i == 0: - np.testing.assert_array_equal(data_out[i], data_in[i]) - else: - np.testing.assert_array_equal(data_out[i], data_in[i] + i) - - def test_multi_read_mixed_types(self): - """ - Test reading from multiple datasets with different types - """ - shape = (10, 10, 10) - count = 4 - dts = [np.int32, np.int64, np.float64, np.dtype("S10")] - - # Create datasets - data_in = np.reshape(np.arange(np.prod(shape)), shape) - data_in_fixed_str = np.full(shape, "abcdefghij", dtype=dts[3]) - datasets = [] - - for i in range(count): - if i < 3: - dset = self.f.create_dataset("data" + str(i), shape, - dtype=dts[i], data=(data_in + i)) - else: - dset = self.f.create_dataset("data" + str(i), shape, - dtype=dts[i], data=data_in_fixed_str) - - datasets.append(dset) - - # Set up MultiManager for read - mm = MultiManager(datasets=datasets) - - # Perform read - data_out = mm[...] 
-
-        self.assertEqual(len(data_out), count)
-
-        for i in range(count):
-            if i < 3:
-                np.testing.assert_array_equal(data_out[i], np.array(data_in + i, dtype=dts[i]))
-            else:
-                np.testing.assert_array_equal(data_out[i], data_in_fixed_str)
-
-            self.assertEqual(data_out[i].dtype, dts[i])
-
-    def test_multi_read_vlen_str(self):
-        """
-        Test reading from multiple datasets with a vlen string type
-        """
-        shape = (10, 10, 10)
-        count = 3
-        dt = h5py.string_dtype(encoding='utf-8')
-        data_in = np.full(shape, "abcdefghij", dt)
-        datasets = []
-
-        for i in range(count):
-            dset = self.f.create_dataset("data" + str(i), shape=shape,
-                                         data=data_in, dtype=dt)
-            datasets.append(dset)
-
-        mm = MultiManager(datasets=datasets)
-        out = mm[...]
-
-        self.assertEqual(len(out), count)
-
-        for i in range(count):
-            self.assertEqual(out[i].dtype, dt)
-            out[i] = np.reshape(out[i], newshape=np.prod(shape))
-            out[i] = np.reshape(np.array([s.decode() for s in out[i]], dtype=dt),
-                                newshape=shape)
-            np.testing.assert_array_equal(out[i], data_in)
-
-    def test_multi_read_mixed_shapes(self):
-        """
-        Test reading a selection from multiple datasets with different shapes
-        """
-        shapes = [(150), (10, 15), (5, 5, 6)]
-        count = 3
-        dt = np.int32
-        data = np.arange(150, dtype=dt)
-        data_in = [np.reshape(data, newshape=s) for s in shapes]
-        datasets = []
-        sel_idx = 2
-
-        for i in range(count):
-            dset = self.f.create_dataset("data" + str(i), shape=shapes[i],
-                                         dtype=dt, data=data_in[i])
-            datasets.append(dset)
-
-        mm = MultiManager(datasets=datasets)
-        # Perform multi read with selection
-        out = mm[sel_idx]
-
-        # Verify
-        for i in range(count):
-            np.testing.assert_array_equal(out[i], data_in[i][sel_idx])
-
-    def test_multi_write_scalar_dataspaces(self):
-        """
-        Test writing to multiple scalar datasets
-        """
-        shape = ()
-        count = 3
-        dt = np.int32
-
-        # Create datasets
-        zeros = np.zeros(shape, dtype=dt)
-        data_in = []
-        datasets = []
-
-        for i in range(count):
-            dset = self.f.create_dataset("data" + str(i), shape,
-                                         dtype=dt, data=zeros)
-            datasets.append(dset)
-
-            data_in.append(np.array([i]))
-
-        mm = MultiManager(datasets)
-        # Perform write
-        mm[...] = data_in
-
-        # Read back and check
-        for i in range(count):
-            data_out = self.f["data" + str(i)][...]
-            np.testing.assert_array_equal(data_out, data_in[i])
-
-    def test_multi_write_non_scalar_dataspaces(self):
-        """
-        Test writing to multiple non-scalar datasets
-        """
-        shape = (10, 10, 10)
-        count = 3
-        dt = np.int32
-
-        # Create datasets
-        zeros = np.zeros(shape, dtype=dt)
-        data_in = []
-        datasets = []
-
-        for i in range(count):
-            dset = self.f.create_dataset("data" + str(i), shape,
-                                         dtype=dt, data=zeros)
-            datasets.append(dset)
-
-            d_in = np.array(np.reshape(np.arange(np.prod(shape)), shape) + i, dtype=dt)
-            data_in.append(d_in)
-
-        mm = MultiManager(datasets)
-        # Perform write
-        mm[...] = data_in
-
-        # Read back and check
-        for i in range(count):
-            data_out = np.array(self.f["data" + str(i)][...], dtype=dt)
-            np.testing.assert_array_equal(data_out, data_in[i])
-
-    def test_multi_write_mixed_dataspaces(self):
-        """
-        Test writing to multiple scalar and non-scalar datasets
-        """
-        scalar_shape = ()
-        shape = (10, 10, 10)
-        count = 3
-        dt = np.int32
-
-        # Create datasets
-        data_in = []
-        data_scalar_in = np.array(1, dtype=dt)
-        data_nonscalar_in = np.array(np.reshape(np.arange(np.prod(shape)), shape), dtype=dt)
-        datasets = []
-
-        for i in range(count):
-            if i == 0:
-                dset = self.f.create_dataset("data" + str(0), scalar_shape,
-                                             dtype=dt, data=np.array(0, dtype=dt))
-                data_in.append(data_scalar_in)
-            else:
-                dset = self.f.create_dataset("data" + str(i), shape,
-                                             dtype=dt, data=np.zeros(shape))
-                data_in.append(data_nonscalar_in)
-            datasets.append(dset)
-
-        # Set up MultiManager for write
-        mm = MultiManager(datasets=datasets)
-
-        # Select via empty tuple
-        mm[()] = data_in
-
-        for i in range(count):
-            data_out = self.f["data" + str(i)][...]
-            np.testing.assert_array_equal(data_out, data_in[i])
-
-        # Reset datasets
-        for i in range(count):
-            if i == 0:
-                zeros = np.array([0])
-            else:
-                zeros = np.zeros(shape)
-            self.f["data" + str(i)][...] = zeros
-
-        # Select via Ellipsis
-        mm[...] = data_in
-
-        for i in range(count):
-            data_out = self.f["data" + str(i)][...]
-
-            if i == 0:
-                np.testing.assert_array_equal(data_out, data_in[i])
-            else:
-                np.testing.assert_array_equal(data_out, data_in[i])
-
-    def test_multi_write_vlen_str(self):
-        """
-        Test writing to multiple datasets with a vlen string type
-        """
-        shape = (10, 10, 10)
-        count = 3
-        dt = h5py.string_dtype(encoding='utf-8')
-        data_initial_vlen = np.full(shape, "aaaabbbbcc", dtype=dt)
-        data_in_vlen = np.full(shape, "abcdefghij", dtype=dt)
-        datasets = []
-
-        for i in range(count):
-            dset = self.f.create_dataset("data" + str(i), shape=shape,
-                                         data=data_initial_vlen, dtype=dt)
-            datasets.append(dset)
-
-        mm = MultiManager(datasets=datasets)
-        # Perform write
-        mm[...] = [data_in_vlen, data_in_vlen, data_in_vlen]
-
-        # Verify
-        for i in range(count):
-            out = self.f["data" + str(i)][...]
-            self.assertEqual(out.dtype, dt)
-
-            out = np.reshape(out, newshape=np.prod(shape))
-            out = np.reshape(np.array([s.decode() for s in out], dtype=dt),
-                             newshape=shape)
-            np.testing.assert_array_equal(out, data_in_vlen)
-
-    def test_multi_write_mixed_shapes(self):
-        """
-        Test writing to a selection in multiple datasets with different shapes
-        """
-        shapes = [(50, 5), (15, 10), (20, 15)]
-        count = 3
-        dt = np.int32
-        data_in = 99
-        datasets = []
-        sel_idx = 2
-
-        for i in range(count):
-            dset = self.f.create_dataset("data" + str(i), shape=shapes[i],
-                                         dtype=dt, data=np.zeros(shapes[i], dtype=dt))
-            datasets.append(dset)
-
-        mm = MultiManager(datasets=datasets)
-        # Perform multi write with selection
-        mm[sel_idx, sel_idx] = [data_in, data_in + 1, data_in + 2]
-
-        # Verify
-        for i in range(count):
-            out = self.f["data" + str(i)][...]
-            np.testing.assert_array_equal(out[sel_idx, sel_idx], data_in + i)
-
-    def test_multi_selection(self):
-        """
-        Test using a different selection
-        for each dataset in a MultiManager
-        """
-        shape = (10, 10, 10)
-        count = 3
-        dt = np.int32
-
-        # Create datasets
-        data_in = np.reshape(np.arange(np.prod(shape), dtype=dt), shape)
-        data_in_original = data_in.copy()
-        datasets = []
-
-        for i in range(count):
-            dset = self.f.create_dataset("data" + str(i), shape=shape,
-                                         dtype=dt, data=data_in)
-            datasets.append(dset)
-
-        mm = h5py.MultiManager(datasets=datasets)
-
-        # Selections to read from
-        sel = [np.s_[0:10, 0:10, 0:10], np.s_[0:5, 5:10, 1:4:2], np.s_[4, 5, 6]]
-        data_out = mm[sel]
-
-        for i in range(count):
-            np.testing.assert_array_equal(data_out[i], data_in[sel[i]])
-
-        # If selection list has only a single element, apply it to all dsets
-        sel = [np.s_[0:10, 0:10, 0:10]]
-        data_out = mm[sel]
-
-        for d in data_out:
-            np.testing.assert_array_equal(d, data_in[sel[0]])
-
-        # Selections to write to
-        sel = [np.s_[0:10, 0:10, 0:10], np.s_[0:5, 0:5, 0:5], np.s_[0, 0, 0]]
-        data_in = [np.zeros_like(data_in), np.ones_like(data_in), np.full_like(data_in, 2)]
-        mm[sel] = [data_in[i][sel[i]] for i in range(count)]
-
-        for i in range(count):
-            np.testing.assert_array_equal(self.f["data" + str(i)][sel[i]], data_in[i][sel[i]])
-
-        # Check that unselected regions are unmodified
-        np.testing.assert_array_equal(self.f["data1"][5:, 5:, 5:], data_in_original[5:, 5:, 5:])
-        np.testing.assert_array_equal(self.f["data2"][1:, 1:, 1:], data_in_original[1:, 1:, 1:])
-
-        # Save for later comparison
-        data_in_original = mm[...]
-
-        # If selection list has only a single element, apply it to all dsets
-        sel = [np.s_[0:6, 0:6, 0:6]]
-        data_in = np.full(shape, 3, dtype=dt)
-        mm[sel] = [data_in[sel[0]]] * count
-
-        for i in range(count):
-            np.testing.assert_array_equal(self.f["data" + str(i)][sel[0]], data_in[sel[0]])
-
-        # Check that unselected regions are unmodified
-        data_out = mm[...]
-
-        for i in range(count):
-            np.testing.assert_array_equal(data_out[i][6:, 6:, 6:], data_in_original[i][6:, 6:, 6:])
-
-    def test_multi_field_selection(self):
-        """
-        Test reading/writing to a field selection on multiple datasets
-        """
-        dt = np.dtype([('a', np.float32), ('b', np.int32), ('c', np.float32)])
-        shape = (100,)
-        data = np.ones(shape, dtype=dt)
-        count = 3
-        datasets = []
-
-        for i in range(count):
-            dset = self.f.create_dataset("data" + str(i), shape=shape,
-                                         data=np.zeros(shape, dtype=dt),
-                                         dtype=dt)
-            datasets.append(dset)
-
-        # Perform read from field 'b'
-        mm = MultiManager(datasets=datasets)
-        out = mm[..., 'b']
-
-        # Verify data returned
-        for i in range(count):
-            np.testing.assert_array_equal(out[i], np.zeros(shape, dtype=dt['b']))
-
-        # Perform write to field 'b'
-        mm = MultiManager(datasets=datasets)
-        mm[..., 'b'] = [data['b'], data['b'], data['b']]
-
-        for i in range(count):
-            out = np.array(self.f["data" + str(i)], dtype=dt)
-            np.testing.assert_array_equal(out['a'], np.zeros(shape, dtype=dt['a']))
-            np.testing.assert_array_equal(out['b'], data['b'])
-            np.testing.assert_array_equal(out['c'], np.zeros(shape, dtype=dt['c']))
-
-        # Test writing to entire compound type
-        data = np.zeros(shape, dtype=dt)
-        mm[...] = [data, data, data]
-
-        for i in range(count):
-            out = np.array(self.f["data" + str(i)], dtype=dt)
-            np.testing.assert_array_equal(out, data)
-
-
 if __name__ == '__main__':
     loglevel = logging.ERROR
     logging.basicConfig(format='%(asctime)s %(message)s', level=loglevel)
diff --git a/test/hl/test_dataset_create.py b/test/hl/test_dataset_create.py
index 287015c..c68f577 100644
--- a/test/hl/test_dataset_create.py
+++ b/test/hl/test_dataset_create.py
@@ -99,7 +99,8 @@ def test_create_float16_dset(self):
         dset[...] = arr
         arr = dset[...]  # read back
 
-        val = arr[2, 4]  # test one value
+        # test one value
+        val = float(arr[2, 4])  # convert to float since np.float16 values are not directly comparable
         self.assertTrue(val > 20.4 - 0.01)
         self.assertTrue(val < 20.4 + 0.01)
 
diff --git a/test/hl/test_dataset_fancyselect.py b/test/hl/test_dataset_fancyselect.py
index e1ed8d1..3e1fd3d 100644
--- a/test/hl/test_dataset_fancyselect.py
+++ b/test/hl/test_dataset_fancyselect.py
@@ -104,6 +104,39 @@ def test_dset_3d(self):
 
         f.close()
 
+    def test_bigdset(self):
+        filename = self.getFileName("fancy_select_bigdset")
+        print("filename:", filename)
+        f = h5py.File(filename, "w")
+        # create a dataset
+        dset = f.create_dataset("dset", (5, 1000, 1000), dtype="i4", compression="gzip")
+        print(dset.id.id)
+        # write some values to the dataset
+        dset[:, 1, 10] = [95, 96, 97, 98, 99]
+        dset[:, 10, 100] = [195, 196, 197, 198, 199]
+        dset[:, 100, 500] = [295, 296, 297, 298, 299]
+
+        # single coordinate, increasing
+        arr = dset[:, 10, [10, 100, 500]]
+        self.assertEqual(arr.shape, (5, 3))
+        self.assertTrue((arr[:, 0] == [0, 0, 0, 0, 0]).all())
+        self.assertTrue((arr[:, 1] == [195, 196, 197, 198, 199]).all())
+        self.assertTrue((arr[:, 2] == [0, 0, 0, 0, 0]).all())
+
+        # non-increasing indexes
+        arr = dset[:, 10, [100, 10, 500]]
+        self.assertEqual(arr.shape, (5, 3))
+        self.assertTrue((arr[:, 0] == [195, 196, 197, 198, 199]).all())
+        self.assertTrue((arr[:, 1] == [0, 0, 0, 0, 0]).all())
+        self.assertTrue((arr[:, 2] == [0, 0, 0, 0, 0]).all())
+
+        # test multiple coordinates
+        arr = dset[:, [1, 10, 100], [10, 100, 500]]
+        self.assertEqual(arr.shape, (5, 3))
+        self.assertTrue((arr[:, 0] == [95, 96, 97, 98, 99]).all())
+        self.assertTrue((arr[:, 1] == [195, 196, 197, 198, 199]).all())
+        self.assertTrue((arr[:, 2] == [295, 296, 297, 298, 299]).all())
+
 if __name__ == '__main__':
     ut.main()
diff --git a/testall.py b/testall.py
index fe7e4ef..10efbf1 100755
--- a/testall.py
+++ b/testall.py
@@ -23,8 +23,9 @@
             'test_dataset_create',
             'test_dataset_extend',
             'test_dataset_fancyselect',
-            'test_dataset_objref',
             'test_dataset_getitem',
+            'test_dataset_multi',
+            'test_dataset_objref',
             'test_dataset_pointselect',
             'test_dataset_scalar',
             'test_dataset_setitem',
@@ -32,11 +33,11 @@
             'test_datatype',
             'test_dimscale',
             'test_file',
+            'test_folder',
             'test_group',
             'test_table',
             'test_visit',
-            'test_vlentype',
-            'test_folder')
+            'test_vlentype',)
 
 app_tests = ('test_hsinfo', 'test_tall_inspect', 'test_diamond_inspect',