# test.py
import zarr
import xarray
import datatree
import shutil
import os


def get_all_array_paths(group, path=''):
    """
    Recursively find all array paths in a Zarr group.

    Parameters:
    - group: The Zarr group to traverse.
    - path: The current path in the traversal.

    Returns:
    - A list of all array paths.
    """
    array_paths = []
    for name, item in group.items():
        current_path = f"{path}/{name}" if path else name
        if isinstance(item, zarr.core.Array):
            array_paths.append(current_path)
        elif isinstance(item, zarr.hierarchy.Group):
            # Recursively get arrays in sub-groups
            array_paths.extend(get_all_array_paths(item, current_path))
    return array_paths


def get_leaf_group_paths(group, path=''):
    """
    Recursively find all leaf group paths in a Zarr group.

    Parameters:
    - group: The Zarr group to traverse.
    - path: The current path in the traversal.

    Returns:
    - A list of all leaf group paths.
    """
    paths = []
    has_subgroups = False
    for name, item in group.items():
        if isinstance(item, zarr.hierarchy.Group):
            has_subgroups = True
            current_path = f"{path}/{name}" if path else name
            # Recursively get sub-groups
            paths.extend(get_leaf_group_paths(item, current_path))
    # If no sub-groups were found, this is a leaf group
    if not has_subgroups:
        paths.append(path)
    return paths


def get_all_subgroup_paths(group, path=''):
    """
    Recursively find all subgroup paths in a Zarr group.

    Parameters:
    - group: The Zarr group to traverse.
    - path: The current path in the traversal.

    Returns:
    - A list of all subgroup paths.
    """
    paths = []
    for name, item in group.items():
        if isinstance(item, zarr.hierarchy.Group):
            current_path = f"{path}/{name}" if path else name
            paths.append(current_path)
            # Recursively get sub-groups
            paths.extend(get_all_subgroup_paths(item, current_path))
    return paths
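

# A minimal sketch (not part of the original script) of how the three traversal
# helpers behave, using a throwaway in-memory group. The 'em/fibsem-uint8' layout
# below is a made-up stand-in for the janelia hierarchy, not real data.
# demo = zarr.group()
# demo.create_group('em').create_group('fibsem-uint8').create_dataset('s0', shape=(2, 2), dtype='uint8')
# print(get_all_array_paths(demo))     # ['em/fibsem-uint8/s0']
# print(get_all_subgroup_paths(demo))  # ['em', 'em/fibsem-uint8']
# print(get_leaf_group_paths(demo))    # ['em/fibsem-uint8']
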
BUCKET_URL = 's3://janelia-cosem-datasets'
REFORMATTED_CACHE = './janelia/janelia-reformatted'
shutil.rmtree(REFORMATTED_CACHE, ignore_errors=True)


def reformat_zarr_group(original_group, new_group):
    """
    Recursively walk original_group; for each leaf group, print basic info about
    its full-resolution 's0' array. The commented-out lines below sketch the
    intended copy of that array (plus xarray-style axis metadata) into new_group.
    """
    subgroups = list(original_group.groups())
    if len(subgroups):
        return {
            gname: reformat_zarr_group(g, new_group) for gname, g in subgroups
        }
    else:
        name = original_group.name.split('/')[-1]
        arr = original_group['s0']  # in janelia's schema, s0 is the full-resolution data
        # print(dir(arr))
        print(arr.dtype, arr.chunks, arr.shape, dict(arr.attrs))
        # print(arr[0])
        # zarr.copy(
        #     arr,
        #     new_group,
        #     name=name,
        #     shallow=True,
        # )
        # new_group.attrs.update({
        #     '_ARRAY_DIMENSIONS': ['x', 'y', 'z'],  # axes are ordered 'xyz' instead of 'zyx'
        # })


def cache_reformatted(original_group, dataset_name):
    """
    Reformat original_group into a local DirectoryStore under REFORMATTED_CACHE,
    skipping the work if a cached copy already exists; returns the cache path.
    """
    reformatted_path = f'{REFORMATTED_CACHE}/{dataset_name}'
    if not os.path.exists(reformatted_path):
        new_group = zarr.open(zarr.DirectoryStore(reformatted_path))
        reformat_zarr_group(original_group, new_group)
        # zarr.consolidate_metadata(new_group)
    return reformatted_path


dataset_urls = [
    'aic_desmosome-1/aic_desmosome-1.n5/'
]

"""
for dataset_url in dataset_urls:
    original_group = zarr.open(zarr.N5FSStore(f'{BUCKET_URL}/{dataset_url}', anon=True))
    arrs = list(original_group['/em/fibsem-uint8/'].arrays())
    vararray = arrs[5][1]
    #arrays = get_all_array_paths(original_group)
    #print(arrays)
    x = xarray.DataArray(vararray, name="s5")
    print(x)
    reformatted_path = cache_reformatted(original_group, dataset_url.replace("/", "_"))
    print(reformatted_path)
    new_group = zarr.open(reformatted_path)
    print("Printing new group")
    print(new_group)
    print("Printing new group tree")
    print(zarr.tree(new_group))
"""

import fsspec
import dask.array as da # we import dask to help us manage parallel access to the big dataset
#group = zarr.open(zarr.N5FSStore('s3://janelia-cosem-datasets/jrc_hela-2/jrc_hela-2.n5', anon=True)) # access the root of the n5 container
group = zarr.open(zarr.N5FSStore('s3://janelia-cosem-datasets/aic_desmosome-3/aic_desmosome-3.n5', anon=True)) # access the root of the n5 container
zdata = group['em/fibsem-uint8/s4']  # s4 is a downsampled scale level of this volume (s0 is the full-resolution data)
ddata = da.from_array(zdata, chunks=zdata.chunks)
result = ddata.compute()  # load the entire (downsampled) volume into memory as a numpy array
print(result, result.shape)
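

# A hedged sketch (not in the original script): dask lets us pull just one slice
# instead of materialising the whole volume, and xarray can attach dimension names.
# The 3-D 'zyx' ordering and the DataArray name below are assumptions, not values
# read from the dataset's metadata.
# first_slice = ddata[0].compute()  # loads only the chunks that touch slice 0
# labelled = xarray.DataArray(ddata, dims=['z', 'y', 'x'], name='fibsem-uint8_s4')
# print(first_slice.shape, labelled.dims)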