diff --git a/python_scripts/README.md b/python_scripts/README.md index 5a462e8c1..acda923b7 100644 --- a/python_scripts/README.md +++ b/python_scripts/README.md @@ -5,15 +5,8 @@ You will likely develop more customized, robust, and flexible scripts for your o These simple scripts here are intended to help you understand the basics of the generated data from Cholla. ## Merging HDF5 files -Multi-processor runs generate HDF5 files per-timestep per-processor. -To treat each timestep together we want to merge those per-processor HDF5 files. -| Script | Concatenate | -| ------ | ----------- | -`cat_dset_3d.py` | 3D HDF5 datasets -`cat_projection.py` | The on-axis projection data created when the -DPROJECTION flag is turned on -`cat_rotated_projection.py` | The rotated projection data created when the -DROTATED_PROJECTION flag is turned on -`cat_slice.py` | The on-axis slice data created when the -DSLICES flag is turned on +Multi-processor runs generate HDF5 files per-timestep per-processor. Merging these per process output into a single file can be done with the concatenation scripts detailed in the "Outputs" section of the wiki. ## Plotting data We here present simple Python matplotlib-based scripts to plot density, velocity, energy, and pressure. diff --git a/python_scripts/cat.py b/python_scripts/cat.py deleted file mode 100755 index dc840c570..000000000 --- a/python_scripts/cat.py +++ /dev/null @@ -1,406 +0,0 @@ -# Utils for concat cholla output - -import h5py -import numpy as np -import os - -verbose = True - -def parse(argv): - # Determine prefix - if 'h5' in argv: - preprefix = argv.split('.h5')[0] - prefix = preprefix +'.h5' - - else: - prefix = './{}.h5'.format(argv) - - # Check existing - firstfile = prefix+'.0' - if not os.path.isfile(firstfile): - print(firstfile,' is missing') - exit() - - # Set dirnames - dnamein = os.path.dirname(firstfile)+'/' - dnameout = os.path.dirname(firstfile) + '/' - return dnamein,dnameout - -def hydro(n,dnamein,dnameout,double=True): - """ - n: integer, output number of file - dnamein: string, directory name of input files, should include '/' at end or leave blank for current directory - dnameout: string, directory name of output files, should include '/' at end or leave blank for current directory - double: optional bool, double precision (float64) if True, single precision (float32) if False - - Reads files of form dnamein{n}.h5.{rank}, looping over rank, outputting to file dnameout{n}.h5. 
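For context, the removed `cat.py` helpers above were usually chained together in a short driver script; a minimal sketch (the file prefix and output range are placeholders, not values from this patch) could look like:
```
# Hypothetical driver for the removed cat.py helpers; the prefix and the
# output range below are placeholders, not values taken from this patch.
import cat

# parse() only uses the prefix to resolve the input/output directories
dnamein, dnameout = cat.parse('./raw/out.h5')

# Concatenate hydro outputs 0 through 10, keeping 64-bit precision
for n in range(0, 11):
    cat.hydro(n, dnamein, dnameout, double=True)
```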
- """ - - fileout = h5py.File(dnameout+str(n)+'.h5', 'a') - - i = -1 - # loops over all files - while True: - i += 1 - - fileinname = dnamein+str(n)+'.h5.'+str(i) - - if not os.path.isfile(fileinname): - break - print('Load:',fileinname,flush=True) - - # open the input file for reading - filein = h5py.File(fileinname,'r') - - # read in the header data from the input file - head = filein.attrs - - # if it's the first input file, write the header attributes - # and create the datasets in the output file - if (i == 0): - nx = head['dims'][0] - ny = head['dims'][1] - nz = head['dims'][2] - nxl = head['dims_local'][0] - nyl = head['dims_local'][1] - nzl = head['dims_local'][2] - fileout.attrs['dims'] = [nx, ny, nz] - fileout.attrs['gamma'] = [head['gamma'][0]] - fileout.attrs['t'] = [head['t'][0]] - fileout.attrs['dt'] = [head['dt'][0]] - fileout.attrs['n_step'] = [head['n_step'][0]] - - units = ['time_unit', 'mass_unit', 'length_unit', 'energy_unit', 'velocity_unit', 'densit\ -y_unit'] - for unit in units: - fileout.attrs[unit] = [head[unit][0]] - keys = list(filein.keys()) - #['density','momentum_x','momentum_y','momentum_z','Energy','GasEnergy','scalar0'] - - for key in keys: - if key not in fileout: - # WARNING: If you don't set dataset dtype it will default to 32-bit, but CHOLLA likes to be 64-bit - if double: - dtype = filein[key].dtype - else: - dtype = None - if nz > 1: - fileout.create_dataset(key, (nx, ny, nz), chunks=(nxl,nyl,nzl), dtype=dtype) - elif ny > 1: - fileout.create_dataset(key, (nx, ny), chunks=(nxl,nyl), dtype=dtype) - elif nx > 1: - fileout.create_dataset(key, (nx,), chunks=(nxl,), dtype=dtype) - #fileout.create_dataset(key, (nx, ny, nz)) - - # write data from individual processor file to - # correct location in concatenated file - nxl = head['dims_local'][0] - nyl = head['dims_local'][1] - nzl = head['dims_local'][2] - xs = head['offset'][0] - ys = head['offset'][1] - zs = head['offset'][2] - for key in keys: - if key in filein: - if nz > 1: - fileout[key][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein[key] - elif ny > 1: - fileout[key][xs:xs+nxl,ys:ys+nyl] = filein[key] - elif nx > 1: - fileout[key][xs:xs+nxl] = filein[key] - filein.close() - - # end loop over all files - fileout.close() - - -def projection(n,dnamein,dnameout): - """ - n: integer, output number of file - dnamein: string, directory name of input files, should include '/' at end or leave blank for current directory - dnameout: string, directory name of output files, should include '/' at end or leave blank for current directory - double: optional bool, double precision (float64) if True, single precision (float32) if False - - Reads files of form dnamein{n}.h5.{rank}, looping over rank, outputting to file dnameout{n}.h5. 
- """ - - # open the output file for writing - fileout = h5py.File(dnameout+str(n)+'_proj.h5', 'w') - i = -1 - while True: - i += 1 - - fileinname = dnamein+str(n)+'_proj.h5.'+str(i) - - if not os.path.isfile(fileinname): - break - - if verbose: - print(fileinname) - # open the input file for reading - filein = h5py.File(fileinname,'r') - # read in the header data from the input file - head = filein.attrs - - # if it's the first input file, write the header attributes - # and create the datasets in the output file - if (i == 0): - nx = head['dims'][0] - ny = head['dims'][1] - nz = head['dims'][2] - fileout.attrs['dims'] = [nx, ny, nz] - fileout.attrs['gamma'] = [head['gamma'][0]] - fileout.attrs['t'] = [head['t'][0]] - fileout.attrs['dt'] = [head['dt'][0]] - fileout.attrs['n_step'] = [head['n_step'][0]] - - dxy = np.zeros((nx,ny)) - dxz = np.zeros((nx,nz)) - Txy = np.zeros((nx,ny)) - Txz = np.zeros((nx,nz)) - - # write data from individual processor file to - # correct location in concatenated file - nxl = head['dims_local'][0] - nyl = head['dims_local'][1] - nzl = head['dims_local'][2] - xs = head['offset'][0] - ys = head['offset'][1] - zs = head['offset'][2] - - dxy[xs:xs+nxl,ys:ys+nyl] += filein['d_xy'] - dxz[xs:xs+nxl,zs:zs+nzl] += filein['d_xz'] - Txy[xs:xs+nxl,ys:ys+nyl] += filein['T_xy'] - Txz[xs:xs+nxl,zs:zs+nzl] += filein['T_xz'] - - filein.close() - - # write out the new datasets - fileout.create_dataset('d_xy', data=dxy) - fileout.create_dataset('d_xz', data=dxz) - fileout.create_dataset('T_xy', data=Txy) - fileout.create_dataset('T_xz', data=Txz) - - fileout.close() - return - -def slice(n,dnamein,dnameout): - """ - n: integer, output number of file - dnamein: string, directory name of input files, should include '/' at end or leave blank for current directory - dnameout: string, directory name of output files, should include '/' at end or leave blank for current directory - double: optional bool, double precision (float64) if True, single precision (float32) if False - - Reads files of form dnamein{n}_slice.h5.{rank}, looping over rank, outputting to file dnameout{n}_slice.h5. 
- """ - - # open the output file for writing - fileout = h5py.File(dnameout+str(n)+'_slice.h5', 'w') - - i = -1 - while True: - # loop over files for a given output time - i += 1 - - fileinname = dnamein+str(n)+'_slice.h5.'+str(i) - if not os.path.isfile(fileinname): - break - - if verbose: - print(fileinname) - # open the input file for reading - filein = h5py.File(fileinname,'r') - # read in the header data from the input file - head = filein.attrs - - # Detect DE - DE = 'GE_xy' in filein - SCALAR = 'scalar_xy' in filein - - # if it's the first input file, write the header attributes - # and create the datasets in the output file - if (i == 0): - gamma = head['gamma'] - t = head['t'] - dt = head['dt'] - n_step = head['n_step'] - nx = head['dims'][0] - ny = head['dims'][1] - nz = head['dims'][2] - fileout.attrs['gamma'] = gamma - fileout.attrs['t'] = t - fileout.attrs['dt'] = dt - fileout.attrs['n_step'] = n_step - fileout.attrs['dims'] = [nx, ny, nz] - - d_xy = np.zeros((nx,ny)) - d_xz = np.zeros((nx,nz)) - d_yz = np.zeros((ny,nz)) - mx_xy = np.zeros((nx,ny)) - mx_xz = np.zeros((nx,nz)) - mx_yz = np.zeros((ny,nz)) - my_xy = np.zeros((nx,ny)) - my_xz = np.zeros((nx,nz)) - my_yz = np.zeros((ny,nz)) - mz_xy = np.zeros((nx,ny)) - mz_xz = np.zeros((nx,nz)) - mz_yz = np.zeros((ny,nz)) - E_xy = np.zeros((nx,ny)) - E_xz = np.zeros((nx,nz)) - E_yz = np.zeros((ny,nz)) - if DE: - GE_xy = np.zeros((nx,ny)) - GE_xz = np.zeros((nx,nz)) - GE_yz = np.zeros((ny,nz)) - if SCALAR: - scalar_xy = np.zeros((nx,ny)) - scalar_xz = np.zeros((nx,nz)) - scalar_yz = np.zeros((ny,nz)) - - # write data from individual processor file to - # correct location in concatenated file - nxl = head['dims_local'][0] - nyl = head['dims_local'][1] - nzl = head['dims_local'][2] - xs = head['offset'][0] - ys = head['offset'][1] - zs = head['offset'][2] - - d_xy[xs:xs+nxl,ys:ys+nyl] += filein['d_xy'] - d_xz[xs:xs+nxl,zs:zs+nzl] += filein['d_xz'] - d_yz[ys:ys+nyl,zs:zs+nzl] += filein['d_yz'] - mx_xy[xs:xs+nxl,ys:ys+nyl] += filein['mx_xy'] - mx_xz[xs:xs+nxl,zs:zs+nzl] += filein['mx_xz'] - mx_yz[ys:ys+nyl,zs:zs+nzl] += filein['mx_yz'] - my_xy[xs:xs+nxl,ys:ys+nyl] += filein['my_xy'] - my_xz[xs:xs+nxl,zs:zs+nzl] += filein['my_xz'] - my_yz[ys:ys+nyl,zs:zs+nzl] += filein['my_yz'] - mz_xy[xs:xs+nxl,ys:ys+nyl] += filein['mz_xy'] - mz_xz[xs:xs+nxl,zs:zs+nzl] += filein['mz_xz'] - mz_yz[ys:ys+nyl,zs:zs+nzl] += filein['mz_yz'] - E_xy[xs:xs+nxl,ys:ys+nyl] += filein['E_xy'] - E_xz[xs:xs+nxl,zs:zs+nzl] += filein['E_xz'] - E_yz[ys:ys+nyl,zs:zs+nzl] += filein['E_yz'] - if DE: - GE_xy[xs:xs+nxl,ys:ys+nyl] += filein['GE_xy'] - GE_xz[xs:xs+nxl,zs:zs+nzl] += filein['GE_xz'] - GE_yz[ys:ys+nyl,zs:zs+nzl] += filein['GE_yz'] - if SCALAR: - scalar_xy[xs:xs+nxl,ys:ys+nyl] += filein['scalar_xy'] - scalar_xz[xs:xs+nxl,zs:zs+nzl] += filein['scalar_xz'] - scalar_yz[ys:ys+nyl,zs:zs+nzl] += filein['scalar_yz'] - - filein.close() - - # wrte out the new datasets - fileout.create_dataset('d_xy', data=d_xy) - fileout.create_dataset('d_xz', data=d_xz) - fileout.create_dataset('d_yz', data=d_yz) - fileout.create_dataset('mx_xy', data=mx_xy) - fileout.create_dataset('mx_xz', data=mx_xz) - fileout.create_dataset('mx_yz', data=mx_yz) - fileout.create_dataset('my_xy', data=my_xy) - fileout.create_dataset('my_xz', data=my_xz) - fileout.create_dataset('my_yz', data=my_yz) - fileout.create_dataset('mz_xy', data=mz_xy) - fileout.create_dataset('mz_xz', data=mz_xz) - fileout.create_dataset('mz_yz', data=mz_yz) - fileout.create_dataset('E_xy', data=E_xy) - 
fileout.create_dataset('E_xz', data=E_xz) - fileout.create_dataset('E_yz', data=E_yz) - if DE: - fileout.create_dataset('GE_xy', data=GE_xy) - fileout.create_dataset('GE_xz', data=GE_xz) - fileout.create_dataset('GE_yz', data=GE_yz) - if SCALAR: - fileout.create_dataset('scalar_xy', data=scalar_xy) - fileout.create_dataset('scalar_xz', data=scalar_xz) - fileout.create_dataset('scalar_yz', data=scalar_yz) - - fileout.close() - return - -def rot_proj(n,dnamein,dnameout): - """ - n: integer, output number of file - dnamein: string, directory name of input files, should include '/' at end or leave blank for current directory - dnameout: string, directory name of output files, should include '/' at end or leave blank for current directory - double: optional bool, double precision (float64) if True, single precision (float32) if False - - Reads files of form dnamein{n}_rot_proj.h5.{rank}, looping over rank, outputting to file dnameout{n}_rot_proj.h5. - """ - - fileout = h5py.File(dnameout+str(n)+'_rot_proj.h5', 'w') - i = -1 - - while True: - # loop over files for a given output time - i += 1 - fileinname = dnamein+str(n)+'_rot_proj.h5.'+str(i) - if not os.path.isfile(fileinname): - break - - if verbose: - print(fileinname) - - filein = h5py.File(dnamein+fileinname,'r') - head = filein.attrs - # if it's the first input file, write the header attributes - # and create the arrays to hold the output data - if (i == 0): - - nxr = int(head['nxr']) - nzr = int(head['nzr']) - Lx = head['Lx'] - Lz = head['Lz'] - delta = head['delta'] - theta = head['theta'] - phi = head['phi'] - gamma = head['gamma'] - t = head['t'] - dt = head['dt'] - n_step = head['n_step'] - fileout.attrs['nxr'] = nxr - fileout.attrs['nzr'] = nzr - fileout.attrs['Lx'] = Lx - fileout.attrs['Lz'] = Lz - fileout.attrs['delta'] = delta - fileout.attrs['theta'] = theta - fileout.attrs['phi'] = phi - fileout.attrs['gamma'] = gamma - fileout.attrs['t'] = t - fileout.attrs['dt'] = dt - fileout.attrs['n_step'] = n_step - - d_xzr = np.zeros((nxr, nzr)) - vx_xzr = np.zeros((nxr, nzr)) - vy_xzr = np.zeros((nxr, nzr)) - vz_xzr = np.zeros((nxr, nzr)) - T_xzr = np.zeros((nxr, nzr)) - - # end first input file - - # write data from individual processor file to - # correct location in concatenated file - nx_min = int(head['nx_min']) - nx_max = int(head['nx_max']) - nz_min = int(head['nz_min']) - nz_max = int(head['nz_max']) - - d_xzr[nx_min:nx_max,nz_min:nz_max] += filein['d_xzr'][:] - vx_xzr[nx_min:nx_max,nz_min:nz_max] += filein['vx_xzr'][:] - vy_xzr[nx_min:nx_max,nz_min:nz_max] += filein['vy_xzr'][:] - vz_xzr[nx_min:nx_max,nz_min:nz_max] += filein['vz_xzr'][:] - T_xzr[nx_min:nx_max,nz_min:nz_max] += filein['T_xzr'][:] - - filein.close() - # end while loop - - # write out the new datasets - fileout.create_dataset("d_xzr", data=d_xzr) - fileout.create_dataset("vx_xzr", data=vx_xzr) - fileout.create_dataset("vy_xzr", data=vy_xzr) - fileout.create_dataset("vz_xzr", data=vz_xzr) - fileout.create_dataset("T_xzr", data=T_xzr) - - fileout.close() diff --git a/python_scripts/cat_dset_3D.py b/python_scripts/cat_dset_3D.py deleted file mode 100755 index 4cff6dc9a..000000000 --- a/python_scripts/cat_dset_3D.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -""" -Python script for concatenating 3D hdf5 datasets. Includes a CLI for concatenating Cholla HDF5 datasets and can be -imported into other scripts where the `concat_3d` function can be used to concatenate the datasets. 
- -Generally the easiest way to import this script is to add the `python_scripts` directory to your python path in your -script like this: -``` -import sys -sys.path.append('/PATH/TO/CHOLLA/python_scripts') -import cat_dset_3D -``` -""" - -import h5py -import numpy as np -import argparse -import pathlib - -def main(): - """This function handles the CLI argument parsing and is only intended to be used when this script is invoked from the - command line. If you're importing this file then use the `concat_3d` function directly. - """ - # Argument handling - cli = argparse.ArgumentParser() - # Required Arguments - cli.add_argument('-s', '--start_num', type=int, required=True, help='The first output step to concatenate') - cli.add_argument('-e', '--end_num', type=int, required=True, help='The last output step to concatenate') - cli.add_argument('-n', '--num_processes', type=int, required=True, help='The number of processes that were used') - # Optional Arguments - cli.add_argument('-i', '--input_dir', type=pathlib.Path, default=pathlib.Path.cwd(), help='The input directory.') - cli.add_argument('-o', '--output_dir', type=pathlib.Path, default=pathlib.Path.cwd(), help='The output directory.') - args = cli.parse_args() - - # Perform the concatenation - concat_3d(start_num=args.start_num, - end_num=args.end_num, - num_processes=args.num_processes, - input_dir=args.input_dir, - output_dir=args.output_dir) - - -# ====================================================================================================================== -def concat_3d(start_num: int, - end_num: int, - num_processes: int, - input_dir: pathlib.Path = pathlib.Path.cwd(), - output_dir: pathlib.Path = pathlib.Path.cwd()): - """Concatenate 3D HDF5 Cholla datasets. i.e. take the single files generated per process and concatenate them into a - single, large file. All outputs from start_num to end_num will be concatenated. - - Args: - start_num (int): The first output step to concatenate - end_num (int): The last output step to concatenate - num_processes (int): The number of processes that were used - input_dir (pathlib.Path, optional): The input directory. Defaults to pathlib.Path.cwd(). - output_dir (pathlib.Path, optional): The output directory. Defaults to pathlib.Path.cwd(). 
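For reference, the removed `concat_3d` entry point could also be driven directly from Python rather than through its CLI; a minimal sketch (the paths and the process count are placeholders) might be:
```
# Hypothetical direct call to the removed cat_dset_3D.concat_3d helper; the
# paths and the process count are placeholders, not values from this patch.
import sys
import pathlib

sys.path.append('/PATH/TO/CHOLLA/python_scripts')
import cat_dset_3D

cat_dset_3D.concat_3d(start_num=0,
                      end_num=10,
                      num_processes=4,
                      input_dir=pathlib.Path('./raw'),
                      output_dir=pathlib.Path('./catted'))
```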
- """ - - # Error checking - assert start_num >= 0, 'start_num must be greater than or equal to 0' - assert end_num >= 0, 'end_num must be greater than or equal to 0' - assert start_num <= end_num, 'end_num should be greater than or equal to start_num' - assert num_processes > 1, 'num_processes must be greater than 1' - - # loop over outputs - for n in range(start_num, end_num+1): - - # loop over files for a given output - for i in range(0, num_processes): - - # open the output file for writing (don't overwrite if exists) - fileout = h5py.File(output_dir / f'{n}.h5', 'a') - # open the input file for reading - filein = h5py.File(input_dir / f'{n}.h5.{i}', 'r') - # read in the header data from the input file - head = filein.attrs - - # if it's the first input file, write the header attributes - # and create the datasets in the output file - if (i == 0): - nx = head['dims'][0] - ny = head['dims'][1] - nz = head['dims'][2] - fileout.attrs['dims'] = [nx, ny, nz] - fileout.attrs['gamma'] = [head['gamma'][0]] - fileout.attrs['t'] = [head['t'][0]] - fileout.attrs['dt'] = [head['dt'][0]] - fileout.attrs['n_step'] = [head['n_step'][0]] - - units = ['time_unit', 'mass_unit', 'length_unit', 'energy_unit', 'velocity_unit', 'density_unit'] - for unit in units: - fileout.attrs[unit] = [head[unit][0]] - - d = fileout.create_dataset("density", (nx, ny, nz), chunks=True, dtype=filein['density'].dtype) - mx = fileout.create_dataset("momentum_x", (nx, ny, nz), chunks=True, dtype=filein['momentum_x'].dtype) - my = fileout.create_dataset("momentum_y", (nx, ny, nz), chunks=True, dtype=filein['momentum_y'].dtype) - mz = fileout.create_dataset("momentum_z", (nx, ny, nz), chunks=True, dtype=filein['momentum_z'].dtype) - E = fileout.create_dataset("Energy", (nx, ny, nz), chunks=True, dtype=filein['Energy'].dtype) - try: - GE = fileout.create_dataset("GasEnergy", (nx, ny, nz), chunks=True, dtype=filein['GasEnergy'].dtype) - except KeyError: - print('No Dual energy data present'); - try: - bx = fileout.create_dataset("magnetic_x", (nx+1, ny, nz), chunks=True, dtype=filein['magnetic_x'].dtype) - by = fileout.create_dataset("magnetic_y", (nx, ny+1, nz), chunks=True, dtype=filein['magnetic_y'].dtype) - bz = fileout.create_dataset("magnetic_z", (nx, ny, nz+1), chunks=True, dtype=filein['magnetic_z'].dtype) - except KeyError: - print('No magnetic field data present'); - - # write data from individual processor file to - # correct location in concatenated file - nxl = head['dims_local'][0] - nyl = head['dims_local'][1] - nzl = head['dims_local'][2] - xs = head['offset'][0] - ys = head['offset'][1] - zs = head['offset'][2] - fileout['density'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['density'] - fileout['momentum_x'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['momentum_x'] - fileout['momentum_y'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['momentum_y'] - fileout['momentum_z'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['momentum_z'] - fileout['Energy'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['Energy'] - try: - fileout['GasEnergy'][xs:xs+nxl,ys:ys+nyl,zs:zs+nzl] = filein['GasEnergy'] - except KeyError: - print('No Dual energy data present'); - try: - fileout['magnetic_x'][xs:xs+nxl+1, ys:ys+nyl, zs:zs+nzl] = filein['magnetic_x'] - fileout['magnetic_y'][xs:xs+nxl, ys:ys+nyl+1, zs:zs+nzl] = filein['magnetic_y'] - fileout['magnetic_z'][xs:xs+nxl, ys:ys+nyl, zs:zs+nzl+1] = filein['magnetic_z'] - except KeyError: - print('No magnetic field data present'); - - filein.close() - - fileout.close() -# 
====================================================================================================================== - -if __name__ == '__main__': - main() diff --git a/python_scripts/cat_particles.py b/python_scripts/cat_particles.py deleted file mode 100644 index 03cbcd71c..000000000 --- a/python_scripts/cat_particles.py +++ /dev/null @@ -1,91 +0,0 @@ -# Example file for concatenating particle data - -import h5py -import numpy as np - -ns = 0 -ne = 300 -n_procs = 4 # number of processors that did the cholla calculation -dnamein = '/gpfs/alpine/proj-shared/csc380/orlandow/o_cholla/out.21Sep20-Mon-12.49-356588-SOR_ONLY_PARTICLES_DISK/raw/' -dnameout = '/gpfs/alpine/proj-shared/csc380/orlandow/o_cholla/out.21Sep20-Mon-12.49-356588-SOR_ONLY_PARTICLES_DISK/particles_cat/' - -# loop over the output times -for n in range(ns, ne+1): - - # open the output file for writing - fileout = h5py.File(dnameout+str(n)+'_particles.h5', 'w') - - if (n % 10 == 0): print(str(n)) - - # loop over files for a given output time - for i in range(0, n_procs): - - # open the input file for reading - filein = h5py.File(dnamein+str(n)+'_particles.h5.'+str(i), 'r') - # read in the header data from the input file - head = filein.attrs - - # if it's the first input file, write the header attributes - # and create the datasets in the output file - if (i == 0): - gamma = head['gamma'] - t = head['t'] - dt = head['dt'] - n_step = head['n_step'] - nx = head['dims'][0] - ny = head['dims'][1] - nz = head['dims'][2] - fileout.attrs['gamma'] = gamma - fileout.attrs['t'] = t - fileout.attrs['dt'] = dt - fileout.attrs['n_step'] = n_step - fileout.attrs['dims'] = [nx, ny, nz] - fileout.attrs['velocity_unit'] = head['velocity_unit'] - fileout.attrs['length_unit'] = head['length_unit'] - fileout.attrs['particle_mass'] = head['particle_mass'] - fileout.attrs['density_unit'] = head['density_unit'] - - x = np.array([]) - y = np.array([]) - z = np.array([]) - vx = np.array([]) - vy = np.array([]) - vz = np.array([]) - particle_ids = np.array([]) - density = np.zeros((nx, ny, nz)) - n_total_particles = 0 - - - # write data from individual processor file to - # correct location in concatenated file - nxl = head['dims_local'][0] - nyl = head['dims_local'][1] - nzl = head['dims_local'][2] - xs = head['offset'][0] - ys = head['offset'][1] - zs = head['offset'][2] - - n_total_particles += head['n_particles_local'] - density[xs:xs+nxl, ys:ys+nyl, zs:zs+nzl] += filein['density'] - x = np.append(x, filein['pos_x']) - y = np.append(y, filein['pos_y']) - z = np.append(z, filein['pos_z']) - vx = np.append(vx, filein['vel_x']) - vy = np.append(vy, filein['vel_y']) - vz = np.append(vz, filein['vel_z']) - particle_ids = np.append(particle_ids, filein['particle_IDs']) - - filein.close() - - # write out the new datasets - fileout.create_dataset('x', data=x) - fileout.create_dataset('y', data=y) - fileout.create_dataset('z', data=z) - fileout.create_dataset('vx', data=vx) - fileout.create_dataset('vy', data=vy) - fileout.create_dataset('vz', data=vz) - fileout.create_dataset('particle_ids', data=particle_ids) - fileout.create_dataset('density', data=density) - fileout.attrs['n_total_particles'] = n_total_particles - - fileout.close() diff --git a/python_scripts/cat_projection.py b/python_scripts/cat_projection.py deleted file mode 100755 index 29b56a416..000000000 --- a/python_scripts/cat_projection.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python3 -# Example file for concatenating on-axis projection data -# created when the -DPROJECTION flag is 
turned on - -import h5py -import numpy as np - -ns = 0 -ne = 0 -n_procs = 16 # number of processors that did the cholla calculation -dnamein = './hdf5/raw/' -dnameout = './hdf5/' - -# loop over the output times -for n in range(ns, ne+1): - - # open the output file for writing - fileout = h5py.File(dnameout+str(n)+'_proj.h5', 'w') - - # loop over files for a given output time - for i in range(0, n_procs): - - # open the input file for reading - filein = h5py.File(dnamein+str(n)+'_proj.h5.'+str(i), 'r') - # read in the header data from the input file - head = filein.attrs - - # if it's the first input file, write the header attributes - # and create the datasets in the output file - if (i == 0): - nx = head['dims'][0] - ny = head['dims'][1] - nz = head['dims'][2] - fileout.attrs['dims'] = [nx, ny, nz] - fileout.attrs['gamma'] = [head['gamma'][0]] - fileout.attrs['t'] = [head['t'][0]] - fileout.attrs['dt'] = [head['dt'][0]] - fileout.attrs['n_step'] = [head['n_step'][0]] - - dxy = np.zeros((nx,ny)) - dxz = np.zeros((nx,nz)) - Txy = np.zeros((nx,ny)) - Txz = np.zeros((nx,nz)) - - # write data from individual processor file to - # correct location in concatenated file - nxl = head['dims_local'][0] - nyl = head['dims_local'][1] - nzl = head['dims_local'][2] - xs = head['offset'][0] - ys = head['offset'][1] - zs = head['offset'][2] - - dxy[xs:xs+nxl,ys:ys+nyl] += filein['d_xy'] - dxz[xs:xs+nxl,zs:zs+nzl] += filein['d_xz'] - Txy[xs:xs+nxl,ys:ys+nyl] += filein['T_xy'] - Txz[xs:xs+nxl,zs:zs+nzl] += filein['T_xz'] - - filein.close() - - # write out the new datasets - fileout.create_dataset('d_xy', data=dxy) - fileout.create_dataset('d_xz', data=dxz) - fileout.create_dataset('T_xy', data=Txy) - fileout.create_dataset('T_xz', data=Txz) - - fileout.close() diff --git a/python_scripts/cat_rotated_projection.py b/python_scripts/cat_rotated_projection.py deleted file mode 100755 index 6e769ce55..000000000 --- a/python_scripts/cat_rotated_projection.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python3 -# Example file for concatenating rotated projection data -# created when the -DROTATED_PROJECTION flag is turned on - -import h5py -import numpy as np - -ns = 0 -ne = 0 -n_procs = 16 # number of processors that did the cholla calculation -dnamein = './hdf5/raw/' -dnameout = './hdf5/' - -# loop over the output times -for n in range(ns, ne+1): - - # open the output file for writing - fileout = h5py.File(dnameout+str(n)+'_rot_proj.h5', 'w') - - # loop over files for a given output time - for i in range(0, n_procs): - - # open the input file for reading - filein = h5py.File(dnamein+str(n)+'_rot_proj.h5.'+str(i), 'r') - # read in the header data from the input file - head = filein.attrs - - # if it's the first input file, write the header attributes - # and create the arrays to hold the output data - if (i == 0): - nxr = int(head['nxr']) - nzr = int(head['nzr']) - Lx = head['Lx'] - Lz = head['Lz'] - delta = head['delta'] - theta = head['theta'] - phi = head['phi'] - gamma = head['gamma'] - t = head['t'] - dt = head['dt'] - n_step = head['n_step'] - fileout.attrs['nxr'] = nxr - fileout.attrs['nzr'] = nzr - fileout.attrs['Lx'] = Lx - fileout.attrs['Lz'] = Lz - fileout.attrs['delta'] = delta - fileout.attrs['theta'] = theta - fileout.attrs['phi'] = phi - fileout.attrs['gamma'] = gamma - fileout.attrs['t'] = t - fileout.attrs['dt'] = dt - fileout.attrs['n_step'] = n_step - - d_xzr = np.zeros((nxr, nzr)) - vx_xzr = np.zeros((nxr, nzr)) - vy_xzr = np.zeros((nxr, nzr)) - vz_xzr = np.zeros((nxr, nzr)) - T_xzr = 
np.zeros((nxr, nzr)) - - # write data from individual processor file to - # correct location in concatenated file - nx_min = int(head['nx_min']) - nx_max = int(head['nx_max']) - nz_min = int(head['nz_min']) - nz_max = int(head['nz_max']) - - d_xzr[nx_min:nx_max,nz_min:nz_max] += filein['d_xzr'][:] - vx_xzr[nx_min:nx_max,nz_min:nz_max] += filein['vx_xzr'][:] - vy_xzr[nx_min:nx_max,nz_min:nz_max] += filein['vy_xzr'][:] - vz_xzr[nx_min:nx_max,nz_min:nz_max] += filein['vz_xzr'][:] - T_xzr[nx_min:nx_max,nz_min:nz_max] += filein['T_xzr'][:] - - filein.close() - - # write out the new datasets - fileout.create_dataset("d_xzr", data=d_xzr) - fileout.create_dataset("vx_xzr", data=vx_xzr) - fileout.create_dataset("vy_xzr", data=vy_xzr) - fileout.create_dataset("vz_xzr", data=vz_xzr) - fileout.create_dataset("T_xzr", data=T_xzr) - - fileout.close() - - - diff --git a/python_scripts/cat_slice.py b/python_scripts/cat_slice.py deleted file mode 100644 index 7b6d15e12..000000000 --- a/python_scripts/cat_slice.py +++ /dev/null @@ -1,130 +0,0 @@ -# Example file for concatenating on-axis slice data -# created when the -DSLICES flag is turned on - -import h5py -import numpy as np - -ns = 0 -ne = 2 -n_procs = 4 # number of processors that did the cholla calculation -dnamein = '/gpfs/alpine/proj-shared/csc380/orlandow/o_cholla/out.21Sep20-Mon-14.17-357075-SOR_HYDRO_DISK/raw/' -dnameout = '/gpfs/alpine/proj-shared/csc380/orlandow/o_cholla/out.21Sep20-Mon-14.17-357075-SOR_HYDRO_DISK/catted_files' - -DE = True # set to True if Dual Energy flag was used -SCALAR = False # set to True if Scalar was used - -# loop over the output times -for n in range(ns, ne+1): - - # open the output file for writing - fileout = h5py.File(dnameout+str(n)+'_slice.h5', 'w') - - # loop over files for a given output time - for i in range(0, n_procs): - - # open the input file for reading - filein = h5py.File(dnamein+str(n)+'_slice.h5.'+str(i), 'r') - # read in the header data from the input file - head = filein.attrs - - # if it's the first input file, write the header attributes - # and create the datasets in the output file - if (i == 0): - gamma = head['gamma'] - t = head['t'] - dt = head['dt'] - n_step = head['n_step'] - nx = head['dims'][0] - ny = head['dims'][1] - nz = head['dims'][2] - fileout.attrs['gamma'] = gamma - fileout.attrs['t'] = t - fileout.attrs['dt'] = dt - fileout.attrs['n_step'] = n_step - fileout.attrs['dims'] = [nx, ny, nz] - - d_xy = np.zeros((nx,ny)) - d_xz = np.zeros((nx,nz)) - d_yz = np.zeros((ny,nz)) - mx_xy = np.zeros((nx,ny)) - mx_xz = np.zeros((nx,nz)) - mx_yz = np.zeros((ny,nz)) - my_xy = np.zeros((nx,ny)) - my_xz = np.zeros((nx,nz)) - my_yz = np.zeros((ny,nz)) - mz_xy = np.zeros((nx,ny)) - mz_xz = np.zeros((nx,nz)) - mz_yz = np.zeros((ny,nz)) - E_xy = np.zeros((nx,ny)) - E_xz = np.zeros((nx,nz)) - E_yz = np.zeros((ny,nz)) - if DE: - GE_xy = np.zeros((nx,ny)) - GE_xz = np.zeros((nx,nz)) - GE_yz = np.zeros((ny,nz)) - if SCALAR: - scalar_xy = np.zeros((nx,ny)) - scalar_xz = np.zeros((nx,nz)) - scalar_yz = np.zeros((ny,nz)) - - # write data from individual processor file to - # correct location in concatenated file - nxl = head['dims_local'][0] - nyl = head['dims_local'][1] - nzl = head['dims_local'][2] - xs = head['offset'][0] - ys = head['offset'][1] - zs = head['offset'][2] - - d_xy[xs:xs+nxl,ys:ys+nyl] += filein['d_xy'] - d_xz[xs:xs+nxl,zs:zs+nzl] += filein['d_xz'] - d_yz[ys:ys+nyl,zs:zs+nzl] += filein['d_yz'] - mx_xy[xs:xs+nxl,ys:ys+nyl] += filein['mx_xy'] - mx_xz[xs:xs+nxl,zs:zs+nzl] += filein['mx_xz'] - 
mx_yz[ys:ys+nyl,zs:zs+nzl] += filein['mx_yz'] - my_xy[xs:xs+nxl,ys:ys+nyl] += filein['my_xy'] - my_xz[xs:xs+nxl,zs:zs+nzl] += filein['my_xz'] - my_yz[ys:ys+nyl,zs:zs+nzl] += filein['my_yz'] - mz_xy[xs:xs+nxl,ys:ys+nyl] += filein['mz_xy'] - mz_xz[xs:xs+nxl,zs:zs+nzl] += filein['mz_xz'] - mz_yz[ys:ys+nyl,zs:zs+nzl] += filein['mz_yz'] - E_xy[xs:xs+nxl,ys:ys+nyl] += filein['E_xy'] - E_xz[xs:xs+nxl,zs:zs+nzl] += filein['E_xz'] - E_yz[ys:ys+nyl,zs:zs+nzl] += filein['E_yz'] - if DE: - GE_xy[xs:xs+nxl,ys:ys+nyl] += filein['GE_xy'] - GE_xz[xs:xs+nxl,zs:zs+nzl] += filein['GE_xz'] - GE_yz[ys:ys+nyl,zs:zs+nzl] += filein['GE_yz'] - if SCALAR: - scalar_xy[xs:xs+nxl,ys:ys+nyl] += filein['scalar_xy'] - scalar_xz[xs:xs+nxl,zs:zs+nzl] += filein['scalar_xz'] - scalar_yz[ys:ys+nyl,zs:zs+nzl] += filein['scalar_yz'] - - filein.close() - - # wrte out the new datasets - fileout.create_dataset('d_xy', data=d_xy) - fileout.create_dataset('d_xz', data=d_xz) - fileout.create_dataset('d_yz', data=d_yz) - fileout.create_dataset('mx_xy', data=mx_xy) - fileout.create_dataset('mx_xz', data=mx_xz) - fileout.create_dataset('mx_yz', data=mx_yz) - fileout.create_dataset('my_xy', data=my_xy) - fileout.create_dataset('my_xz', data=my_xz) - fileout.create_dataset('my_yz', data=my_yz) - fileout.create_dataset('mz_xy', data=mz_xy) - fileout.create_dataset('mz_xz', data=mz_xz) - fileout.create_dataset('mz_yz', data=mz_yz) - fileout.create_dataset('E_xy', data=E_xy) - fileout.create_dataset('E_xz', data=E_xz) - fileout.create_dataset('E_yz', data=E_yz) - if DE: - fileout.create_dataset('GE_xy', data=GE_xy) - fileout.create_dataset('GE_xz', data=GE_xz) - fileout.create_dataset('GE_yz', data=GE_yz) - if SCALAR: - fileout.create_dataset('scalar_xy', data=scalar_xy) - fileout.create_dataset('scalar_xz', data=scalar_xz) - fileout.create_dataset('scalar_yz', data=scalar_yz) - - fileout.close() diff --git a/python_scripts/concat_2d_data.py b/python_scripts/concat_2d_data.py new file mode 100755 index 000000000..5cf6fde55 --- /dev/null +++ b/python_scripts/concat_2d_data.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +""" +Python script for concatenating 2D hdf5 datasets for when -DSLICES, +-DPROJECTION, or -DROTATED_PROJECTION is turned on in Cholla. Includes a CLI for +concatenating Cholla HDF5 datasets and can be imported into other scripts where +the `concat_2d_dataset` function can be used to concatenate the HDF5 files. + +Generally the easiest way to import this script is to add the `python_scripts` +directory to your python path in your script like this: +``` +import sys +sys.path.append('/PATH/TO/CHOLLA/python_scripts') +import concat_2d_data +``` +""" + +import h5py +import pathlib +import numpy as np + +import concat_internals + +# ============================================================================== +def concat_2d_dataset(source_directory: pathlib.Path, + output_directory: pathlib.Path, + num_processes: int, + output_number: int, + dataset_kind: str, + concat_xy: bool = True, + concat_yz: bool = True, + concat_xz: bool = True, + skip_fields: list = [], + destination_dtype: np.dtype = None, + compression_type: str = None, + compression_options: str = None, + chunking = None) -> None: + """Concatenate 2D HDF5 Cholla datasets. i.e. take the single files + generated per process and concatenate them into a single, large file. This + function concatenates a single output time and can be called multiple times, + potentially in parallel, to concatenate multiple output times. 
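Since each output time is written to its own file, the per-output calls are independent and can be farmed out to a process pool; a minimal sketch of such a parallel driver (the directories, rank count, and output range are placeholders) might be:
```
# Hypothetical parallel driver for concat_2d_dataset; the paths, the rank
# count, and the output range are placeholders, not values from this patch.
import pathlib
from concurrent.futures import ProcessPoolExecutor

import concat_2d_data

def concat_one(output_number):
    # Concatenate a single projection output; each call touches its own files
    concat_2d_data.concat_2d_dataset(source_directory=pathlib.Path('./raw'),
                                     output_directory=pathlib.Path('./catted'),
                                     num_processes=16,
                                     output_number=output_number,
                                     dataset_kind='proj')

with ProcessPoolExecutor(max_workers=4) as pool:
    # list() forces the lazy map so any exceptions surface here
    list(pool.map(concat_one, range(0, 11)))
```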
+ + Parameters + ---------- + source_directory : pathlib.Path + The directory containing the unconcatenated files + output_directory : pathlib.Path + The directory containing the new concatenated files + num_processes : int + The number of ranks that Cholla was run with + output_number : int + The output number to concatenate + dataset_kind : str + The type of 2D dataset to concatenate. Can be 'slice', 'proj', or 'rot_proj'. + concat_xy : bool + If True then concatenate the XY slices/projections. Defaults to True. + concat_yz : bool + If True then concatenate the YZ slices/projections. Defaults to True. + concat_xz : bool + If True then concatenate the XZ slices/projections. Defaults to True. + skip_fields : list + List of fields to skip concatenating. Defaults to []. + destination_dtype : np.dtype + The data type of the output datasets. Accepts most numpy types. Defaults to the same as the input datasets. + compression_type : str + What kind of compression to use on the output data. Defaults to None. + compression_options : str + What compression settings to use if compressing. Defaults to None. + chunking : bool or tuple + Whether or not to use chunking and the chunk size. Defaults to None. + source_directory: pathlib.Path : + + output_directory: pathlib.Path : + + num_processes: int : + + output_number: int : + + dataset_kind: str : + + concat_xy: bool : + (Default value = True) + concat_yz: bool : + (Default value = True) + concat_xz: bool : + (Default value = True) + skip_fields: list : + (Default value = []) + destination_dtype: np.dtype : + (Default value = None) + compression_type: str : + (Default value = None) + compression_options: str : + (Default value = None) + + Returns + ------- + + """ + + # Error checking + assert num_processes > 1, 'num_processes must be greater than 1' + assert output_number >= 0, 'output_number must be greater than or equal to 0' + assert dataset_kind in ['slice', 'proj', 'rot_proj'], '`dataset_kind` can only be one of "slice", "proj", "rot_proj".' 
+ + # Open destination file + destination_file = concat_internals.destination_safe_open(output_directory / f'{output_number}_{dataset_kind}.h5') + + # Setup the destination file + with h5py.File(source_directory / f'{output_number}_{dataset_kind}.h5.0', 'r') as source_file: + # Copy over header + destination_file = concat_internals.copy_header(source_file, destination_file) + + # Get a list of all datasets in the source file + datasets_to_copy = list(source_file.keys()) + + # Filter the datasets to only include those that need to be copied + if not concat_xy: + datasets_to_copy = [dataset for dataset in datasets_to_copy if not 'xy' in dataset] + if not concat_yz: + datasets_to_copy = [dataset for dataset in datasets_to_copy if not 'yz' in dataset] + if not concat_xz: + datasets_to_copy = [dataset for dataset in datasets_to_copy if not 'xz' in dataset] + datasets_to_copy = [dataset for dataset in datasets_to_copy if not dataset in skip_fields] + + # Create the datasets in the destination file + zero_array = np.zeros(1) + for dataset in datasets_to_copy: + dtype = source_file[dataset].dtype if (destination_dtype == None) else destination_dtype + + dataset_shape = __get_2d_dataset_shape(source_file, dataset) + + # Create array to initialize data to zero, this is required for projections + if zero_array.shape != dataset_shape: + zero_array = np.zeros(dataset_shape) + + destination_file.create_dataset(name=dataset, + shape=dataset_shape, + data=zero_array, + dtype=dtype, + chunks=chunking, + compression=compression_type, + compression_opts=compression_options) + + # Copy data + for rank in range(num_processes): + # Open source file + source_file = h5py.File(source_directory / f'{output_number}_{dataset_kind}.h5.{rank}', 'r') + + # Loop through and copy datasets + for dataset in datasets_to_copy: + # Determine locations and shifts for writing + (i0_start, i0_end, i1_start, i1_end), file_in_slice = __write_bounds_2d_dataset(source_file, dataset) + + # If this is a slice dataset we can skip loading the source file if that + # file isn't in the slice + if dataset_kind == 'slice' and not file_in_slice: + continue + + # Copy the data, the summation is required for projections but not slices + destination_file[dataset][i0_start:i0_end, + i1_start:i1_end] += source_file[dataset] + + # Now that the copy is done we close the source file + source_file.close() + + # Close destination file now that it is fully constructed + destination_file.close() +# ============================================================================== + +# ============================================================================== +def __get_2d_dataset_shape(source_file: h5py.File, dataset: str) -> tuple: + """Determine the shape of the full 2D dataset + + Args: + source_file (h5py.File): The source file the get the shape information from + dataset (str): The dataset to get the shape of + + Raises: + ValueError: If the dataset name isn't a 2D dataset name + + Returns: + tuple: The dimensions of the dataset + """ + + if 'xzr' in dataset: + return (source_file.attrs['nxr'][0], source_file.attrs['nzr'][0]) + + nx, ny, nz = source_file.attrs['dims'] + if 'xy' in dataset: + dimensions = (nx, ny) + elif 'yz' in dataset: + dimensions = (ny, nz) + elif 'xz' in dataset: + dimensions = (nx, nz) + else: + raise ValueError(f'Dataset "{dataset}" is not a slice.') + + return dimensions +# ============================================================================== + +# 
============================================================================== +def __write_bounds_2d_dataset(source_file: h5py.File, dataset: str) -> tuple: + """Determine the bounds of the concatenated file to write to + + Args: + source_file (h5py.File): The source file to read from + dataset (str): The name of the dataset to read from the source file + + Raises: + ValueError: If the dataset name isn't a 2D dataset name + + Returns: + tuple: The write bounds for the concatenated file to be used like + `output_file[dataset][return[0]:return[1], return[2]:return[3]]` followed by a bool to indicate if the file is + in the slice if concatenating a slice + """ + + if 'xzr' in dataset: + return (source_file.attrs['nx_min'][0], source_file.attrs['nx_max'][0], + source_file.attrs['nz_min'][0], source_file.attrs['nz_max'][0]), True + + nx, ny, nz = source_file.attrs['dims'] + nx_local, ny_local, nz_local = source_file.attrs['dims_local'] + x_start, y_start, z_start = source_file.attrs['offset'] + + if 'xy' in dataset: + file_in_slice = z_start <= nz//2 <= z_start+nz_local + bounds = (x_start, x_start+nx_local, y_start, y_start+ny_local) + elif 'yz' in dataset: + file_in_slice = x_start <= nx//2 <= x_start+nx_local + bounds = (y_start, y_start+ny_local, z_start, z_start+nz_local) + elif 'xz' in dataset: + file_in_slice = y_start <= ny//2 <= y_start+ny_local + bounds = (x_start, x_start+nx_local, z_start, z_start+nz_local) + else: + raise ValueError(f'Dataset "{dataset}" is not a slice or projection.') + + return bounds, file_in_slice +# ============================================================================== + +if __name__ == '__main__': + from timeit import default_timer + start = default_timer() + + cli = concat_internals.common_cli() + cli.add_argument('-d', '--dataset-kind', type=str, required=True, help='What kind of 2D dataset to concatnate. Options are "slice", "proj", and "rot_proj"') + cli.add_argument('--disable-xy', default=True, action='store_false', help='Disables concating the XY datasets.') + cli.add_argument('--disable-yz', default=True, action='store_false', help='Disables concating the YZ datasets.') + cli.add_argument('--disable-xz', default=True, action='store_false', help='Disables concating the XZ datasets.') + args = cli.parse_args() + + # Perform the concatenation + for output in args.concat_outputs: + concat_2d_dataset(source_directory=args.source_directory, + output_directory=args.output_directory, + num_processes=args.num_processes, + output_number=output, + dataset_kind=args.dataset_kind, + concat_xy=args.disable_xy, + concat_yz=args.disable_yz, + concat_xz=args.disable_xz, + skip_fields=args.skip_fields, + destination_dtype=args.dtype, + compression_type=args.compression_type, + compression_options=args.compression_opts, + chunking=args.chunking) + + print(f'\nTime to execute: {round(default_timer()-start,2)} seconds') diff --git a/python_scripts/concat_3d_data.py b/python_scripts/concat_3d_data.py new file mode 100755 index 000000000..599a4a8d1 --- /dev/null +++ b/python_scripts/concat_3d_data.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +""" +Python script for concatenating 3D hdf5 datasets. Includes a CLI for concatenating Cholla HDF5 datasets and can be +imported into other scripts where the `concat_3d_dataset` function can be used to concatenate the datasets. 
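A typical single-output call, once the module is importable (the import note just below shows the usual way to put `python_scripts` on the path), might look like this sketch; the directories, rank count, and output number are placeholders:
```
# Hypothetical single-output call to concat_3d_dataset; the directories,
# rank count, and output number are placeholders, not values from this patch.
import pathlib
import concat_3d_data

concat_3d_data.concat_3d_dataset(source_directory=pathlib.Path('./raw'),
                                 output_directory=pathlib.Path('./catted'),
                                 num_processes=16,
                                 output_number=0)
```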
+ +Generally the easiest way to import this script is to add the `python_scripts` directory to your python path in your +script like this: +``` +import sys +sys.path.append('/PATH/TO/CHOLLA/python_scripts') +import concat_3d_data +``` +""" + +import h5py +import numpy as np +import pathlib + +import concat_internals + +# ============================================================================== +def concat_3d_dataset(source_directory: pathlib.Path, + output_directory: pathlib.Path, + num_processes: int, + output_number: int, + skip_fields: list = [], + destination_dtype: np.dtype = None, + compression_type: str = None, + compression_options: str = None, + chunking = None) -> None: + """Concatenate a single 3D HDF5 Cholla dataset. i.e. take the single files + generated per process and concatenate them into a single, large file. + + Parameters + ---------- + source_directory : pathlib.Path + The directory containing the unconcatenated files + output_directory : pathlib.Path + The directory containing the new concatenated files + num_processes : int + The number of ranks that Cholla was run with + output_number : int + The output number to concatenate + skip_fields : list + List of fields to skip concatenating. Defaults to []. + destination_dtype : np.dtype + The data type of the output datasets. Accepts most numpy types. Defaults to the same as the input datasets. + compression_type : str + What kind of compression to use on the output data. Defaults to None. + compression_options : str + What compression settings to use if compressing. Defaults to None. + chunking : bool or tuple + Whether or not to use chunking and the chunk size. Defaults to None. + source_directory: pathlib.Path : + + output_directory: pathlib.Path : + + num_processes: int : + + output_number: int : + + skip_fields: list : + (Default value = []) + destination_dtype: np.dtype : + (Default value = None) + compression_type: str : + (Default value = None) + compression_options: str : + (Default value = None) + + Returns + ------- + + """ + + # Error checking + assert num_processes > 1, 'num_processes must be greater than 1' + assert output_number >= 0, 'output_number must be greater than or equal to 0' + + # Open the output file for writing + destination_file = concat_internals.destination_safe_open(output_directory / f'{output_number}.h5') + + # Setup the output file + with h5py.File(source_directory / f'{output_number}.h5.0', 'r') as source_file: + # Copy header data + destination_file = concat_internals.copy_header(source_file, destination_file) + + # Create the datasets in the output file + datasets_to_copy = list(source_file.keys()) + datasets_to_copy = [dataset for dataset in datasets_to_copy if not dataset in skip_fields] + + for dataset in datasets_to_copy: + dtype = source_file[dataset].dtype if (destination_dtype == None) else destination_dtype + + data_shape = source_file.attrs['dims'] + + if dataset == 'magnetic_x': data_shape[0] += 1 + if dataset == 'magnetic_y': data_shape[1] += 1 + if dataset == 'magnetic_z': data_shape[2] += 1 + + destination_file.create_dataset(name=dataset, + shape=data_shape, + dtype=dtype, + chunks=chunking, + compression=compression_type, + compression_opts=compression_options) + + # loop over files for a given output + for i in range(0, num_processes): + # open the input file for reading + source_file = h5py.File(source_directory / f'{output_number}.h5.{i}', 'r') + + # Compute the offset slicing + nx_local, ny_local, nz_local = source_file.attrs['dims_local'] + x_start, y_start, 
z_start = source_file.attrs['offset'] + x_end, y_end, z_end = x_start+nx_local, y_start+ny_local, z_start+nz_local + + # write data from individual processor file to correct location in concatenated file + for dataset in datasets_to_copy: + magnetic_offset = [0,0,0] + if dataset == 'magnetic_x': magnetic_offset[0] = 1 + if dataset == 'magnetic_y': magnetic_offset[1] = 1 + if dataset == 'magnetic_z': magnetic_offset[2] = 1 + + destination_file[dataset][x_start:x_end+magnetic_offset[0], + y_start:y_end+magnetic_offset[1], + z_start:z_end+magnetic_offset[2]] = source_file[dataset] + + # Now that the copy is done we close the source file + source_file.close() + + # Close destination file now that it is fully constructed + destination_file.close() +# ============================================================================== + +if __name__ == '__main__': + from timeit import default_timer + start = default_timer() + + cli = concat_internals.common_cli() + args = cli.parse_args() + + # Perform the concatenation + for output in args.concat_outputs: + concat_3d_dataset(source_directory=args.source_directory, + output_directory=args.output_directory, + num_processes=args.num_processes, + output_number=output, + skip_fields=args.skip_fields, + destination_dtype=args.dtype, + compression_type=args.compression_type, + compression_options=args.compression_opts, + chunking=args.chunking) + + print(f'\nTime to execute: {round(default_timer()-start,2)} seconds') diff --git a/python_scripts/concat_internals.py b/python_scripts/concat_internals.py new file mode 100755 index 000000000..6f90f0211 --- /dev/null +++ b/python_scripts/concat_internals.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +""" +Contains all the common tools for the various concatnation functions/scipts +""" + +import h5py +import argparse +import pathlib + +# ============================================================================== +def destination_safe_open(filename: pathlib.Path) -> h5py.File: + """Opens a HDF5 file safely and provides useful error messages for some common failure modes + + Parameters + ---------- + filename : pathlib.Path + + The full path and name of the file to open : + + filename: pathlib.Path : + + + Returns + ------- + h5py.File + + The opened HDF5 file object + """ + + try: + destination_file = h5py.File(filename, 'w-') + except FileExistsError: + # It might be better for this to simply print the error message and return + # rather than exiting. 
That way if a single call fails in a parallel + # environment it doesn't take down the entire job + raise FileExistsError(f'File "{filename}" already exists and will not be overwritten, skipping.') + + return destination_file +# ============================================================================== + +# ============================================================================== +def copy_header(source_file: h5py.File, destination_file: h5py.File) -> h5py.File: + """Copy the attributes of one HDF5 file to another, skipping all fields that are specific to an individual rank + + Parameters + ---------- + source_file : h5py.File + The source file + destination_file : h5py.File + The destination file + source_file: h5py.File : + + destination_file: h5py.File : + + + Returns + ------- + h5py.File + The destination file with the new header attributes + """ + fields_to_skip = ['dims_local', 'offset', 'n_particles_local'] + + for attr_key in source_file.attrs.keys(): + if attr_key not in fields_to_skip: + destination_file.attrs[attr_key] = source_file.attrs[attr_key] + + return destination_file +# ============================================================================== + +# ============================================================================== +def common_cli() -> argparse.ArgumentParser: + """This function provides the basis for the common CLI amongst the various concatenation scripts. It returns an + `argparse.ArgumentParser` object to which additional arguments can be passed before the final `.parse_args()` method + is used. + + Parameters + ---------- + + Returns + ------- + argparse.ArgumentParser + The common components of the CLI for the concatenation scripts + """ + + # ============================================================================ + def concat_output(raw_argument: str) -> list: + """Function used to parse the `--concat-output` argument + """ + # Check if the string is empty + if len(raw_argument) < 1: + raise ValueError('The --concat-output argument must not be of length zero.') + + # Strip unneeded characters + cleaned_argument = raw_argument.replace(' ', '') + cleaned_argument = cleaned_argument.replace('[', '') + cleaned_argument = cleaned_argument.replace(']', '') + + # Check that it only has the allowed characters + allowed_charaters = set('0123456789,-') + if not set(cleaned_argument).issubset(allowed_charaters): + raise ValueError("Argument contains incorrect characters. 
Should only contain '0-9', ',', and '-'.") + + # Split on commas + cleaned_argument = cleaned_argument.split(',') + + # Generate the final list + iterable_argument = set() + for arg in cleaned_argument: + if '-' not in arg: + if int(arg) < 0: + raise ValueError() + iterable_argument.add(int(arg)) + else: + start, end = arg.split('-') + start, end = int(start), int(end) + if end < start: + raise ValueError('The end of a range must be larger than the start of the range.') + if start < 0: + raise ValueError() + iterable_argument = iterable_argument.union(set(range(start, end+1))) + + return iterable_argument + # ============================================================================ + + # ============================================================================ + def positive_int(raw_argument: str) -> int: + arg = int(raw_argument) + if arg < 0: + raise ValueError('Argument must be 0 or greater.') + + return arg + # ============================================================================ + + # ============================================================================ + def skip_fields(raw_argument: str) -> list: + # Strip unneeded characters + cleaned_argument = raw_argument.replace(' ', '') + cleaned_argument = cleaned_argument.replace('[', '') + cleaned_argument = cleaned_argument.replace(']', '') + cleaned_argument = cleaned_argument.split(',') + + return cleaned_argument + # ============================================================================ + + # ============================================================================ + def chunk_arg(raw_argument: str) -> tuple: + # Strip unneeded characters + cleaned_argument = raw_argument.replace(' ', '') + cleaned_argument = cleaned_argument.replace('(', '') + cleaned_argument = cleaned_argument.replace(')', '') + + # Check that it only has the allowed characters + allowed_charaters = set('0123456789,') + if not set(cleaned_argument).issubset(allowed_charaters): + raise ValueError("Argument contains incorrect characters. Should only contain '0-9', ',', and '-'.") + + # Convert to a tuple and return + return tuple([int(i) for i in cleaned_argument.split(',')]) + # ============================================================================ + + # Initialize the CLI + cli = argparse.ArgumentParser() + + # Required Arguments + cli.add_argument('-s', '--source-directory', type=pathlib.Path, required=True, help='The path to the directory for the source HDF5 files.') + cli.add_argument('-o', '--output-directory', type=pathlib.Path, required=True, help='The path to the directory to write out the concatenated HDF5 files.') + cli.add_argument('-n', '--num-processes', type=positive_int, required=True, help='The number of processes that were used') + cli.add_argument('-c', '--concat-outputs', type=concat_output, required=True, help='Which outputs to concatenate. Can be a single number (e.g. 8), a range (e.g. 2-9), or a list (e.g. [1,2,3]). Ranges are inclusive') + + # Optional Arguments + cli.add_argument('--skip-fields', type=skip_fields, default=[], help='List of fields to skip concatenating. Defaults to empty.') + cli.add_argument('--dtype', type=str, default=None, help='The data type of the output datasets. Accepts most numpy types. Defaults to the same as the input datasets.') + cli.add_argument('--compression-type', type=str, default=None, help='What kind of compression to use on the output data. 
Defaults to None.') + cli.add_argument('--compression-opts', type=str, default=None, help='What compression settings to use if compressing. Defaults to None.') + cli.add_argument('--chunking', type=chunk_arg, default=None, nargs='?', const=True, help='Enable chunking of the output file. Default is `False`. If set without an argument then the chunk size will be automatically chosen or a tuple can be passed to indicate the chunk size desired.') + + return cli +# ============================================================================== diff --git a/python_scripts/concat_particles.py b/python_scripts/concat_particles.py new file mode 100755 index 000000000..8a916f08e --- /dev/null +++ b/python_scripts/concat_particles.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +""" +Python script for concatenating particle hdf5 datasets. Includes a CLI for concatenating Cholla HDF5 datasets and can be +imported into other scripts where the `concat_particles_dataset` function can be used to concatenate the datasets. + +Generally the easiest way to import this script is to add the `python_scripts` directory to your python path in your +script like this: +``` +import sys +sys.path.append('/PATH/TO/CHOLLA/python_scripts') +import concat_particles +``` +""" + +import h5py +import numpy as np +import pathlib + +import concat_internals + +# ====================================================================================================================== +def concat_particles_dataset(source_directory: pathlib.Path, + output_directory: pathlib.Path, + num_processes: int, + output_number: int, + skip_fields: list = [], + destination_dtype: np.dtype = None, + compression_type: str = None, + compression_options: str = None, + chunking = None) -> None: + """Concatenate a single particle HDF5 Cholla dataset. i.e. take the single + files generated per process and concatenate them into a single, large file. + + Parameters + ---------- + source_directory : pathlib.Path + The directory containing the unconcatenated files + output_directory : pathlib.Path + The directory containing the new concatenated files + num_processes : int + The number of ranks that Cholla was run with + output_number : int + The output number to concatenate + skip_fields : list + List of fields to skip concatenating. Defaults to []. + destination_dtype : np.dtype + The data type of the output datasets. Accepts most numpy types. Defaults to the same as the input datasets. + compression_type : str + What kind of compression to use on the output data. Defaults to None. + compression_options : str + What compression settings to use if compressing. Defaults to None. + chunking : bool or tuple + Whether or not to use chunking and the chunk size. Defaults to None. 
+
+  Returns
+  -------
+  None
+  """
+
+  # Error checking
+  assert num_processes > 1, 'num_processes must be greater than 1'
+  assert output_number >= 0, 'output_number must be greater than or equal to 0'
+
+  # Open the output file for writing
+  destination_file = concat_internals.destination_safe_open(output_directory / f'{output_number}_particles.h5')
+
+  # Setup the output file
+  # Note that the call to `__get_num_particles` is potentially expensive as it
+  # opens every single file to read the number of particles in that file
+  num_particles = __get_num_particles(source_directory, num_processes, output_number)
+  destination_file = __setup_destination_file(source_directory,
+                                              destination_file,
+                                              output_number,
+                                              num_particles,
+                                              skip_fields,
+                                              destination_dtype,
+                                              compression_type,
+                                              compression_options,
+                                              chunking)
+
+  # loop over files for a given output
+  particles_offset = 0
+  for i in range(0, num_processes):
+    # open the input file for reading
+    source_file = h5py.File(source_directory / f'{output_number}_particles.h5.{i}', 'r')
+
+    # Compute the offset slicing for the 3D data
+    nx_local, ny_local, nz_local = source_file.attrs['dims_local']
+    x_start, y_start, z_start = source_file.attrs['offset']
+    x_end, y_end, z_end = x_start+nx_local, y_start+ny_local, z_start+nz_local
+
+    # Get the local number of particles
+    num_particles_local = source_file.attrs['n_particles_local'][0]
+
+    # write data from individual processor file to correct location in concatenated file
+    for dataset in list(destination_file.keys()):
+
+      # 'density' is the grid-based particle density and uses the 3D offset
+      # slicing; all other datasets are per-particle arrays and use the
+      # running particle offset
+      if dataset == 'density':
+        destination_file[dataset][x_start:x_end,
+                                  y_start:y_end,
+                                  z_start:z_end] = source_file[dataset]
+      else:
+        start = particles_offset
+        end   = particles_offset + num_particles_local
+        destination_file[dataset][start:end] = source_file[dataset]
+
+    # Update the particles offset
+    particles_offset += num_particles_local
+
+    # Now that the copy is done we close the source file
+    source_file.close()
+
+  # Close destination file now that it is fully constructed
+  destination_file.close()
+# ==============================================================================
+
+# ==============================================================================
+def __get_num_particles(source_directory: pathlib.Path,
+                        num_processes: int,
+                        output_number: int) -> int:
+  """Get the total number of particles in the output. This function is heavily
+  I/O bound and might benefit from utilizing threads.
+
+  Parameters
+  ----------
+  source_directory : pathlib.Path
+      The directory of the unconcatenated files
+  num_processes : int
+      The number of processes
+  output_number : int
+      The output number to get data from
+
+  Returns
+  -------
+  int
+      The total number of particles in the output
+  """
+  # loop over files for a given output
+  num_particles = 0
+  for i in range(0, num_processes):
+    # open the input file for reading
+    with h5py.File(source_directory / f'{output_number}_particles.h5.{i}', 'r') as source_file:
+      num_particles += source_file.attrs['n_particles_local']
+
+  return num_particles
+# ==============================================================================
+
+# ==============================================================================
+def __setup_destination_file(source_directory: pathlib.Path,
+                             destination_file: h5py.File,
+                             output_number: int,
+                             num_particles: int,
+                             skip_fields: list,
+                             destination_dtype: np.dtype,
+                             compression_type: str,
+                             compression_options: str,
+                             chunking) -> h5py.File:
+  """Setup the destination file by copying the header and setting up the datasets
+
+  Parameters
+  ----------
+  source_directory : pathlib.Path
+      The directory containing the unconcatenated files
+  destination_file : h5py.File
+      The destination file
+  output_number : int
+      The output number to concatenate
+  num_particles : int
+      The total number of particles in the output
+  skip_fields : list
+      List of fields to skip concatenating.
+  destination_dtype : np.dtype
+      The data type of the output datasets. Accepts most numpy types.
+  compression_type : str
+      What kind of compression to use on the output data.
+  compression_options : str
+      What compression settings to use if compressing.
+  chunking : bool or tuple
+      Whether or not to use chunking and the chunk size.
+
+  Returns
+  -------
+  h5py.File
+      The fully set up destination file
+  """
+  with h5py.File(source_directory / f'{output_number}_particles.h5.0', 'r') as source_file:
+    # Copy header data
+    destination_file = concat_internals.copy_header(source_file, destination_file)
+
+    # Make list of datasets to copy
+    datasets_to_copy = list(source_file.keys())
+    datasets_to_copy = [dataset for dataset in datasets_to_copy if dataset not in skip_fields]
+
+    # Create the datasets in the output file
+    for dataset in datasets_to_copy:
+      dtype = source_file[dataset].dtype if (destination_dtype is None) else destination_dtype
+
+      # Determine the shape of the dataset
+      if dataset == 'density':
+        data_shape = source_file.attrs['dims']
+      else:
+        data_shape = num_particles
+
+      # Create the dataset
+      destination_file.create_dataset(name=dataset,
+                                      shape=data_shape,
+                                      dtype=dtype,
+                                      chunks=chunking,
+                                      compression=compression_type,
+                                      compression_opts=compression_options)
+
+  return destination_file
+# ==============================================================================
+
+if __name__ == '__main__':
+  from timeit import default_timer
+  start = default_timer()
+
+  cli = concat_internals.common_cli()
+  args = cli.parse_args()
+
+  # Perform the concatenation
+  for output in args.concat_outputs:
+    concat_particles_dataset(source_directory=args.source_directory,
+                             output_directory=args.output_directory,
+                             num_processes=args.num_processes,
+                             output_number=output,
+                             skip_fields=args.skip_fields,
+                             destination_dtype=args.dtype,
+                             compression_type=args.compression_type,
+                             compression_options=args.compression_opts,
+                             chunking=args.chunking)
+
+  print(f'\nTime to execute: {round(default_timer()-start,2)} seconds')
diff --git a/python_scripts/dask_distributed_template.py b/python_scripts/dask_distributed_template.py
new file mode 100755
index 000000000..ac40294b2
--- /dev/null
+++ b/python_scripts/dask_distributed_template.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""
+This is the skeleton for how to run a Dask script on Andes at the OLCF. The
+batch/CLI commands required are in this docstring, the major Dask steps are in
+functions, and `main` is mostly empty with a clear area showing where to do
+your computations.
+
+Requirements:
+- Verified working with Dask v2023.6.0
+- Install graphviz for python
+  - `conda install -c conda-forge python-graphviz graphviz`
+- Make sure your version of msgpack-python is at least v1.0.5; v1.0.3 had a bug
+  - `conda install -c conda-forge msgpack-python=1.0.5`
+
+Notes:
+- This is entirely focused on getting Dask to run on Andes, Crusher, and
+  Frontier. Other systems will likely need similar, but not identical, steps.
+- Between each python script the Dask scheduler and workers need to be
+  restarted.
+- "--interface ib0" does not seem to be required but likely does improve
+  transfer speeds. On Crusher it throws an error; just omit it there.
+- It likes to spit out lots of ugly messages on shutdown that look like
+  something failed. Odds are that it worked fine and just didn't shut down
+  gracefully.
+- On OLCF systems Dask seems to hang on setup if you use more than 256
+  processes. I haven't dug too deeply into it, but for now it might be better to
+  limit jobs to that size and run them longer, or run multiple jobs, potentially
+  an array job.
+- On OLCF systems it doesn't always end the job properly and the job will just
+  keep running and do nothing. Either set short walltimes so it times out or
+  just keep an eye on it.
+  Maybe end with the script sending an exit command.
+
+################################################################################
+#!/usr/bin/env bash
+
+#SBATCH -A 
+#SBATCH -J 
+#SBATCH -o /%x-%j.out
+#SBATCH -t 04:00:00
+#SBATCH -p batch
+#SBATCH -N 32
+#SBATCH --mail-user=
+#SBATCH --mail-type=ALL
+
+# Setup some parameters
+DASK_SCHEDULE_FILE=$(pwd)/dask_schedule_file.json
+DASK_NUM_WORKERS=$((SLURM_JOB_NUM_NODES*8))
+
+# Add any scripts that you're importing to the PYTHONPATH, even ones in the same
+# directory. The worker tasks have their own directories and won't find any of
+# your scripts unless they're in the PYTHONPATH
+export PYTHONPATH="${PYTHONPATH}:/your/path/here"
+
+INTERFACE='--interface ib0'  # For Andes
+# INTERFACE=''               # For Crusher
+
+srun --exclusive --ntasks=1 dask scheduler $INTERFACE --scheduler-file $DASK_SCHEDULE_FILE --no-dashboard --no-show &
+
+# Wait for the dask-scheduler to start
+sleep 30
+
+srun --exclusive --ntasks=$DASK_NUM_WORKERS dask worker --scheduler-file $DASK_SCHEDULE_FILE --memory-limit='auto' --worker-class distributed.Worker $INTERFACE --no-dashboard --local-directory  &
+
+# Wait for workers to start
+sleep 10
+
+python -u ./dask_distributed_template.py --scheduler-file $DASK_SCHEDULE_FILE --num-workers $DASK_NUM_WORKERS
+
+wait
+################################################################################
+"""
+
+import dask
+import dask.array as da
+import dask.dataframe as dd
+from dask.distributed import Client
+from dask import graph_manipulation
+
+import pathlib
+import argparse
+
+# ==============================================================================
+def main():
+  # Get command line arguments
+  cli = argparse.ArgumentParser()
+  # Required Arguments
+  cli.add_argument('-N', '--num-workers', type=int, required=True, help='The number of workers to use')
+  cli.add_argument('-s', '--scheduler-file', type=pathlib.Path, required=True, help='The path to the scheduler file')
+  # Optional Arguments
+  # none yet, feel free to add your own
+  args = cli.parse_args()
+
+  # Setup the Dask cluster
+  client = startup_dask(args.scheduler_file, args.num_workers)
+
+  # Perform your computation
+  # ...
+  # ...
+  # ...
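+  # As an illustrative placeholder only (not part of any Cholla workflow), a
+  # computation here typically builds lazy dask collections and then calls
+  # `.compute()`, which runs on the distributed Client created above, e.g.:
+  # x = da.random.random((100_000, 10_000), chunks=(10_000, 1_000))
+  # result = (x - x.mean(axis=0)).std(axis=0).compute()
+  # print('example result shape:', result.shape)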
+  # Some suggestions:
+  # - If you're using Delayed then append all tasks to a list and execute them with `dask.compute(*command_list)`
+  # - Visualize the task tree with `dask.visualize(*command_list, filename=str('filename.pdf'))`
+  # - Add dependencies manually with `dask.graph_manipulation.bind(dependent_task, list_of_dependencies)`
+  # End of Computation
+
+  # Shutdown the Dask cluster
+  shutdown_dask(client)
+# ==============================================================================
+
+# ==============================================================================
+def startup_dask(scheduler_file, num_workers):
+  # Connect to the dask-cluster
+  client = Client(scheduler_file=scheduler_file)
+  print('client information ', client)
+
+  # Block until num_workers are ready
+  print(f'Waiting for {num_workers} workers...')
+  client.wait_for_workers(n_workers=num_workers)
+
+  num_connected_workers = len(client.scheduler_info()['workers'])
+  print(f'{num_connected_workers} workers connected')
+
+  return client
+# ==============================================================================
+
+# ==============================================================================
+def shutdown_dask(client):
+  print('Shutting down the cluster')
+  workers_list = list(client.scheduler_info()['workers'])
+  client.retire_workers(workers_list, close_workers=True)
+  client.shutdown()
+# ==============================================================================
+
+if __name__ == '__main__':
+  main()
diff --git a/python_scripts/dask_single_machine_template.py b/python_scripts/dask_single_machine_template.py
new file mode 100755
index 000000000..7816ec791
--- /dev/null
+++ b/python_scripts/dask_single_machine_template.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+"""
+================================================================================
+ Written by Robert Caddy.
+
+ A simple template for Dask scripts running on a single machine
+================================================================================
+"""
+
+import dask
+import dask.array as da
+import dask.dataframe as dd
+from dask import graph_manipulation
+
+import argparse
+import pathlib
+
+# ==============================================================================
+def main():
+  cli = argparse.ArgumentParser()
+  # Required Arguments
+  # Optional Arguments
+  cli.add_argument('-n', '--num-workers', type=int, default=8, help='The number of workers to use.')
+  args = cli.parse_args()
+
+  # Set the scheduler type. Options are 'threads', 'processes', 'single-threaded', and 'distributed'.
+  # - 'threads' uses threads that share memory, often fastest on single machines, but can run into issues with the GIL
+  # - 'processes' uses multiple processes that do not share memory, which can be used to get around issues with the GIL
+  # - 'single-threaded' is great for debugging
+  dask.config.set(scheduler='processes', num_workers=args.num_workers)
+
+  # Perform your computation
+  # ...
+  # ...
+  # ...
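+  # As an illustrative placeholder only (not part of any Cholla workflow), a
+  # computation here usually builds lazy dask collections and then calls
+  # `.compute()`, which runs on the local scheduler configured above, e.g.:
+  # x = da.random.random((10_000, 10_000), chunks=(1_000, 1_000))
+  # total = x.sum().compute()
+  # print('example sum:', total)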
+  # Some suggestions:
+  # - If you're using Delayed then append all tasks to a list and execute them with `dask.compute(*command_list)`
+  # - Visualize the task tree with `dask.visualize(*command_list, filename=str('filename.pdf'))`
+  # - Add dependencies manually with `dask.graph_manipulation.bind(dependent_task, list_of_dependencies)`
+  # End of Computation
+# ==============================================================================
+
+if __name__ == '__main__':
+  from timeit import default_timer
+  start = default_timer()
+  main()
+  print(f'\nTime to execute: {round(default_timer()-start,2)} seconds')