Skip to content

Commit

Permalink
Update documentation
Browse files: browse the repository at this point in the history
  • Loading branch information
actions-user committed Aug 15, 2023
1 parent 199d7d7 commit b237718
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 11 deletions.
28 changes: 23 additions & 5 deletions _sources/advanced.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -156,16 +156,34 @@ one go and may be faster, if you have a Dask cluster available.
from kerchunk import hdf, combine, df
import fsspec.implementations.reference
from fsspec.implementations.reference import LazyReferenceMapper
from tempfile import TemporaryDirectory
import xarray as xr
files = fsspec.open(location_of_data)
# Create LazyReferenceMapper to pass to MultiZarrToZarr
fs = fsspec.filesystem("file")
os.makedirs("combined.parq")
out = LazyReferenceMapper.create(1000, "combined.parq", fs)
# Create references from input files
single_ref_sets = [hdf.SingleHdf5ToZarr(_).translate() for _ in files]
out_dict = combine.MultiZarrToZarr(single_ref_sets, concat_dims=["time"]).translate()
os.mkdir("combined.parq")
df.refs_to_dataframe(out_dict, "combined.parq", partition=True)
fs = fsspec.implementations.reference.DFReferenceFileSystem(
"combined.parq", lazy=True)
out_dict = MultiZarrToZarr(
single_ref_sets,
remote_protocol="memory",
concat_dims=["time"],
out=out).translate()
out.flush()
df.refs_to_dataframe(out_dict, "combined.parq")
fs = fsspec.implementations.reference.ReferenceFileSystem(
"combined.parq", remote_protocol="s3", target_protocol="file", lazy=True)
ds = xr.open_dataset(
fs.get_mapper(), engine="zarr",
backend_kwargs={"consolidated": False}
Expand Down
28 changes: 23 additions & 5 deletions advanced.html
Original file line number Diff line number Diff line change
Expand Up @@ -219,16 +219,34 @@ <h2>Parquet Storage<a class="headerlink" href="#parquet-storage" title="Permalin
one go and may be faster, if you have a Dask cluster available.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">kerchunk</span> <span class="kn">import</span> <span class="n">hdf</span><span class="p">,</span> <span class="n">combine</span><span class="p">,</span> <span class="n">df</span>
<span class="kn">import</span> <span class="nn">fsspec.implementations.reference</span>
<span class="kn">from</span> <span class="nn">fsspec.implementations.reference</span> <span class="kn">import</span> <span class="n">LazyReferenceMapper</span>
<span class="kn">from</span> <span class="nn">tempfile</span> <span class="kn">import</span> <span class="n">TemporaryDirectory</span>

<span class="kn">import</span> <span class="nn">xarray</span> <span class="k">as</span> <span class="nn">xr</span>

<span class="n">files</span> <span class="o">=</span> <span class="n">fsspec</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">location_of_data</span><span class="p">)</span>

<span class="c1"># Create LazyReferenceMapper to pass to MultiZarrToZarr</span>
<span class="n">fs</span> <span class="o">=</span> <span class="n">fsspec</span><span class="o">.</span><span class="n">filesystem</span><span class="p">(</span><span class="s2">&quot;file&quot;</span><span class="p">)</span>

<span class="n">os</span><span class="o">.</span><span class="n">makedirs</span><span class="p">(</span><span class="s2">&quot;combined.parq&quot;</span><span class="p">)</span>
<span class="n">out</span> <span class="o">=</span> <span class="n">LazyReferenceMapper</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="mi">1000</span><span class="p">,</span> <span class="s2">&quot;combined.parq&quot;</span><span class="p">,</span> <span class="n">fs</span><span class="p">)</span>

<span class="c1"># Create references from input files</span>
<span class="n">single_ref_sets</span> <span class="o">=</span> <span class="p">[</span><span class="n">hdf</span><span class="o">.</span><span class="n">SingleHdf5ToZarr</span><span class="p">(</span><span class="n">_</span><span class="p">)</span><span class="o">.</span><span class="n">translate</span><span class="p">()</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="n">files</span><span class="p">]</span>
<span class="n">out_dict</span> <span class="o">=</span> <span class="n">combine</span><span class="o">.</span><span class="n">MultiZarrToZarr</span><span class="p">(</span><span class="n">single_ref_sets</span><span class="p">,</span> <span class="n">concat_dims</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;time&quot;</span><span class="p">])</span><span class="o">.</span><span class="n">translate</span><span class="p">()</span>
<span class="n">os</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="s2">&quot;combined.parq&quot;</span><span class="p">)</span>
<span class="n">df</span><span class="o">.</span><span class="n">refs_to_dataframe</span><span class="p">(</span><span class="n">out_dict</span><span class="p">,</span> <span class="s2">&quot;combined.parq&quot;</span><span class="p">,</span> <span class="n">partition</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>

<span class="n">fs</span> <span class="o">=</span> <span class="n">fsspec</span><span class="o">.</span><span class="n">implementations</span><span class="o">.</span><span class="n">reference</span><span class="o">.</span><span class="n">DFReferenceFileSystem</span><span class="p">(</span>
<span class="s2">&quot;combined.parq&quot;</span><span class="p">,</span> <span class="n">lazy</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">out_dict</span> <span class="o">=</span> <span class="n">MultiZarrToZarr</span><span class="p">(</span>
<span class="n">single_ref_sets</span><span class="p">,</span>
<span class="n">remote_protocol</span><span class="o">=</span><span class="s2">&quot;memory&quot;</span><span class="p">,</span>
<span class="n">concat_dims</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;time&quot;</span><span class="p">],</span>
<span class="n">out</span><span class="o">=</span><span class="n">out</span><span class="p">)</span><span class="o">.</span><span class="n">translate</span><span class="p">()</span>

<span class="n">out</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>

<span class="n">df</span><span class="o">.</span><span class="n">refs_to_dataframe</span><span class="p">(</span><span class="n">out_dict</span><span class="p">,</span> <span class="s2">&quot;combined.parq&quot;</span><span class="p">)</span>

<span class="n">fs</span> <span class="o">=</span> <span class="n">fsspec</span><span class="o">.</span><span class="n">implementations</span><span class="o">.</span><span class="n">reference</span><span class="o">.</span><span class="n">ReferenceFileSystem</span><span class="p">(</span>
<span class="s2">&quot;combined.parq&quot;</span><span class="p">,</span> <span class="n">remote_protocol</span><span class="o">=</span><span class="s2">&quot;s3&quot;</span><span class="p">,</span> <span class="n">target_protocol</span><span class="o">=</span><span class="s2">&quot;file&quot;</span><span class="p">,</span> <span class="n">lazy</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">ds</span> <span class="o">=</span> <span class="n">xr</span><span class="o">.</span><span class="n">open_dataset</span><span class="p">(</span>
<span class="n">fs</span><span class="o">.</span><span class="n">get_mapper</span><span class="p">(),</span> <span class="n">engine</span><span class="o">=</span><span class="s2">&quot;zarr&quot;</span><span class="p">,</span>
<span class="n">backend_kwargs</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;consolidated&quot;</span><span class="p">:</span> <span class="kc">False</span><span class="p">}</span>
Expand Down
Loading

0 comments on commit b237718

Please sign in to comment.