diff --git a/README.rst b/README.rst index e97f337..000bd78 100644 --- a/README.rst +++ b/README.rst @@ -18,7 +18,7 @@ CESM simulations. :AUTHORS: Haiying Xu, Allison Baker, Daniel Milroy, Dorit Hammerling -:COPYRIGHT: 2015-2022 University Corporation for Atmospheric Research +:COPYRIGHT: 2015-2023 University Corporation for Atmospheric Research :LICENSE: Apache 2.0 diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/der_test_pyEnsSumMPAS.sh b/der_test_pyEnsSumMPAS.sh new file mode 100644 index 0000000..f086c34 --- /dev/null +++ b/der_test_pyEnsSumMPAS.sh @@ -0,0 +1,13 @@ +#!/bin/bash -l +#PBS -A NTDD0004 +#PBS -N ensSumM +#PBS -q main +#PBS -l select=1:ncpus=36:mpiprocs=36 +#PBS -l walltime=0:20:00 +#PBS -j oe +#PBS -M abaker@ucar.edu + +module load conda +conda activate npl + +mpiexec -n 36 -ppn 36 python pyEnsSumMPAS.py --esize 100 --indir /glade/p/cisl/asap/abaker/mpas/large_ens --sumfile mpas_sumt4.nc --tslice 4 --tag v7.1 --model mpas --mach cheyenne --verbose --jsonfile empty_excluded.json diff --git a/docs/source/pyEnsSum.rst b/docs/source/pyEnsSum.rst index 4c14e69..f11213a 100644 --- a/docs/source/pyEnsSum.rst +++ b/docs/source/pyEnsSum.rst @@ -60,7 +60,7 @@ To use pyEnsSum: --tslice : the index into the time dimension (default = 1) --mach : Machine name used in the metadata (default = cheyenne) --jsonfile : Jsonfile to provide that a list of variables that will be excluded - or included (default = exclude_empty.json) + (default = exclude_empty.json) --mpi_disable : Disable mpi mode to run in serial (off by default) --fIndex : Use this to start at ensemble member instead of 000 (so ensembles with numbers less than are excluded from summary file) @@ -98,21 +98,20 @@ Notes: 7. You must specify a json file (via ``--jsonfile``) that indicates the variables in the ensemble - output files that you want to include or exclude from the summary file - statistics (see the example json files). We recommend excluding variables, as - this is typically less work and pyEnsSum will let you know if you have not + output files that you want to exclude from the summary file + statistics (see the example json files). The pyEnsSum routine + will let you know if you have not listed variables that need to be excluded (see next note). Keep in mind that you must have *fewer* variables included than ensemble members. 8. *IMPORTANT:* If there are variables that need to be excluded (that are not in - the .json file already), pyEnsSum will exit early and provide a list of the - variables to exclude in the output. These should be added to your exclude - variable list (or removed from an include list), and then pyEnsSum can - be re-run. Note that additional problematic variables may be found by - pyEnsSum as variables are detected in three stages. (First any variables that - are constant across the ensemble are identified. Once these are removed, - linearly dependant variables are indentified for removal. Finally, variables - that are not constant but have very few unique values are identified.) + the .json file already) for the summary to be generated, pyEnsSum will list these + variables in the output. These variables will also be added to a copy of + your exclude variable list (prefixed with "NEW.") for future reference and use. + The summary file will be geberated with all listed variables excluded. + Note that the following types of variables will be removed: any variables that + are constant across the ensemble, are not floating-point (e.g., integer), + are linearly dependant, or have very few (< 3%) unique values. Example: diff --git a/empty_included.json b/empty_included.json deleted file mode 100644 index f2169ba..0000000 --- a/empty_included.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "IncludedVar": [] -} diff --git a/mpas_ex_with_pv_vars.json b/mpas_ex_with_pv_vars.json new file mode 100644 index 0000000..ac9f7f4 --- /dev/null +++ b/mpas_ex_with_pv_vars.json @@ -0,0 +1,31 @@ +{ + "ExcludedVar": [ + "i_rainc", + "i_rainnc", + "xice", + "xtime", + "xland", + "vegfra", + "depv_dt_diab", + "kpbl", + "iLev_DT", + "u", + "sst", + "u_pv", + "v_pv", + "theta_pv", + "vort_pv", + "depv_dt_lw", + "depv_dt_lw", + "depv_dt_sw", + "depv_dt_bl", + "depv_dt_cu", + "depv_dt_mix", + "dtheta_dt_mp", + "depv_dt_mp", + "depv_dt_fric", + "depv_dt_diab_pv", + "depv_dt_fric_pv", + "ertel_pv" + ] +} diff --git a/notebooks/ab_mpas_data.ipynb b/notebooks/ab_mpas_data.ipynb index dc89fab..599f63f 100644 --- a/notebooks/ab_mpas_data.ipynb +++ b/notebooks/ab_mpas_data.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "92b3b345-f06f-4752-9a66-50739f398946", "metadata": {}, "outputs": [], @@ -12,7 +12,8 @@ "import netCDF4 as nc\n", "import numpy as np\n", "\n", - "filedir = '/glade/scratch/abaker/mpas_hist/'" + "filedir = '/glade/scratch/abaker/mpas_hist/'\n", + "pydir = '/glade/u/home/abaker/repos/PyCECT/'" ] }, { @@ -146,11 +147,58 @@ "in_files" ] }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8217d907-6824-46bb-b5ea-ef170dc835dd", + "metadata": {}, + "outputs": [], + "source": [ + "sumfile_o = nc.Dataset(pydir + 'mpas_sum_ts12.nc', 'r')\n", + "sumfile_n = nc.Dataset(pydir + 'mpas_sum_ts12_new.nc', 'r')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "0fd2e582-a39a-4409-b4da-cd13aa15e4d5", + "metadata": {}, + "outputs": [], + "source": [ + "vars_n = sumfile_n.variables\n", + "vars_o = sumfile_o.variables\n", + "gm_n = vars_n[\"global_mean\"][:, :]\n", + "gm_o = vars_o[\"global_mean\"][:, :]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "478930f9-c1b6-4ab4-82d1-e5eaa849d955", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.max(gm_n - gm_o)" + ] + }, { "cell_type": "code", "execution_count": 4, "id": "f4ea1560-03d7-4f4a-a34d-12fea6b2b3a4", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "myfile = nc.Dataset(filedir + in_files[0], 'r')\n", @@ -521,9 +569,9 @@ ], "metadata": { "kernelspec": { - "display_name": "NPL (conda)", + "display_name": "NPL 2023b", "language": "python", - "name": "npl-conda" + "name": "npl-2023b" }, "language_info": { "codemirror_mode": { @@ -535,7 +583,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.12" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/pyCECT.py b/pyCECT.py index bd300fa..783f4ac 100755 --- a/pyCECT.py +++ b/pyCECT.py @@ -28,7 +28,7 @@ def main(argv): minPCFail= minRunFail= numRunFile= printVars popens mpas pop cam jsonfile= mpi_enable nbin= minrange= maxrange= outfile= casejson= npick= pepsi_gm pop_tol= web_enabled - base_year= pop_threshold= printStdMean fIndex= lev= eet= saveResults json_case= """ + base_year= pop_threshold= printStdMean fIndex= lev= eet= saveResults json_case= savePCAMat= saveEET=""" optkeys = s.split() try: opts, args = getopt.getopt(argv, 'h', optkeys) @@ -72,18 +72,24 @@ def main(argv): opts_dict['web_enabled'] = False opts_dict['saveResults'] = False opts_dict['base_year'] = 1 + opts_dict['savePCAMat'] = '' + opts_dict['saveEET'] = '' # Call utility library getopt_parseconfig to parse the option keys # and save to the dictionary caller = 'CECT' opts_dict = pyEnsLib.getopt_parseconfig(opts, optkeys, caller, opts_dict) + print(opts_dict) + # ens type # cam = opts_dict['cam'] popens = opts_dict['popens'] pop = opts_dict['pop'] mpas = opts_dict['mpas'] + print(f'!test mpas:{mpas}') + if pop or popens: ens = 'pop' elif mpas: @@ -164,7 +170,7 @@ def main(argv): sys.exit() print('in_files=', in_files) else: - wildname = '*' + str(opts_dict['input_globs']) + '*' + wildname = '*' + str(opts_dict['input_globs']) + '*.nc' # Open all input files if os.path.exists(opts_dict['indir']): full_glob_str = os.path.join(opts_dict['indir'], wildname) @@ -340,144 +346,117 @@ def main(argv): countgm = np.zeros(len(ifiles), dtype=np.int32) # Calculate the new run global mean - mean3d, mean2d, varlist = pyEnsLib.generate_global_mean_for_summary( - ifiles, var_name3d, var_name2d, is_SE, opts_dict['pepsi_gm'], opts_dict + mean3d, mean2d = pyEnsLib.generate_global_mean_for_summary( + ifiles, var_name3d, var_name2d, is_SE, opts_dict ) means = np.concatenate((mean3d, mean2d), axis=0) # end cam # NOW this the same for MPAS and CAM + # extra info # Add the new run global mean to the dictionary "results" for i in range(means.shape[1]): for j in range(means.shape[0]): pyEnsLib.addresults(results, 'means', means[j][i], ens_var_name[j], 'f' + str(i)) - - # Evaluate the new run global mean if it is in the range of the ensemble summary global mean range + # Evaluate the new run global mean if it is in the range of the ensemble summary global mea?n range for fcount, fid in enumerate(ifiles): countgm[fcount] = pyEnsLib.evaluatestatus( 'means', 'gmRange', variables, 'gm', results, 'f' + str(fcount) ) + # end extra # Calculate the PCA scores of the new run - new_scores, var_list, comp_std_gm = pyEnsLib.standardized( + new_scores, sum_std_mean, comp_std_gm = pyEnsLib.standardized( means, mu_gm, sigma_gm, loadings_gm, ens_var_name, opts_dict, me ) run_index, decision = pyEnsLib.comparePCAscores( ifiles, new_scores, sigma_scores_gm, opts_dict, me ) - # If there is failure, plot out standardized mean and compared standardized mean in box plots - # if opts_dict['printStdMean'] and decision == 'FAILED': - if opts_dict['printStdMean']: + # which vars are most outside the standardize mean (should be zero) + sort_index = np.argsort(sum_std_mean)[::-1] + sorted_sum = sum_std_mean[sort_index] + sorted_name = np.array(ens_var_name)[sort_index] + sorted_comp_std_gm = comp_std_gm[sort_index] - import matplotlib - import seaborn as sns - - matplotlib.use('Agg') # don't display figures - import matplotlib.pyplot as plt + if opts_dict['printStdMean'] or decision == 'FAILED': print(' ') print('***************************************************************************** ') - print( - 'Test run variable standardized means (for reference only - not used to determine pass/fail)' - ) + print('Test run variable standardized mean information') print('***************************************************************************** ') print(' ') - category = { - 'all_outside99': [], - 'two_outside99': [], - 'one_outside99': [], - 'all_oneside_outside1QR': [], - } - b = list(pyEnsLib.chunk(ens_var_name, 10)) - for f, alist in enumerate(b): - for fc, avar in enumerate(alist): - dist_995 = np.percentile(std_gm[avar], 99.5) - dist_75 = np.percentile(std_gm[avar], 75) - dist_25 = np.percentile(std_gm[avar], 25) - dist_05 = np.percentile(std_gm[avar], 0.5) - c = 0 - d = 0 - p = 0 - q = 0 - for i in range(comp_std_gm[f + fc].size): - if comp_std_gm[f + fc][i] > dist_995: - c = c + 1 - elif comp_std_gm[f + fc][i] < dist_05: - d = d + 1 - elif comp_std_gm[f + fc][i] < dist_995 and comp_std_gm[f + fc][i] > dist_75: - p = p + 1 - elif comp_std_gm[f + fc][i] > dist_05 and comp_std_gm[f + fc][i] < dist_25: - q = q + 1 - if c == 3 or d == 3: - category['all_outside99'].append((avar, f + fc)) - elif c == 2 or d == 2: - category['two_outside99'].append((avar, f + fc)) - elif c == 1 or d == 1: - category['one_outside99'].append((avar, f + fc)) - if p == 3 or q == 3: - category['all_oneside_outside1QR'].append((avar, f + fc)) - part_name = opts_dict['indir'].split('/')[-1] - if not part_name: - part_name = opts_dict['indir'].split('/')[-2] - for key in sorted(category): - list_array = [] - list_array2 = [] - list_var = [] - value = category[key] - - if key == 'all_outside99': - print( - '*** ', - len(value), - ' variables have 3 test run global means outside of the 99th percentile.', - ) - elif key == 'two_outside99': - print( - '*** ', - len(value), - ' variables have 2 test run global means outside of the 99th percentile.', - ) - elif key == 'one_outside99': - print( - '*** ', - len(value), - ' variables have 1 test run global mean outside of the 99th percentile.', - ) - elif key == 'all_oneside_outside1QR': - print( - '*** ', - len(value), - ' variables have all test run global means outside of the first quartile (but not outside the 99th percentile).', - ) - - if len(value) > 0: - print(' => generating plot ...') - if len(value) > 20: - print(' NOTE: truncating to only plot the first 20 variables.') - value = value[0:20] - - for each_var in value: - list_array.append(std_gm[each_var[0]]) - list_array2.append(comp_std_gm[each_var[1]]) - name = each_var[0] - if isinstance(name, str) is False: - name = name.decode('utf-8') - - list_var.append(name) - - if len(value) != 0: - # ax = sns.boxplot(data=list_array, whis=[0.5, 99.5], fliersize=0.0) - sns.stripplot(data=list_array2, jitter=True, color='r') - plt.xticks(list(range(len(list_array))), list_var, fontsize=8, rotation=-45) - - if decision == 'FAILED': - plt.savefig(part_name + '_' + key + '_fail.png') - else: - plt.savefig(part_name + '_' + key + '_pass.png') - plt.close() + all_outside99 = [] + two_outside99 = [] + one_outside99 = [] + + # std_gm is a dictionary + tsize = comp_std_gm.shape[1] + b = list(ens_var_name) + for f, avar in enumerate(b): + if np.ma.is_masked(std_gm[avar]): + tempa = std_gm[avar] + else: + tempa = np.array(std_gm[avar]) + dist_995 = np.percentile(tempa, 99.5) + dist_005 = np.percentile(tempa, 0.5) + # print(avar, " = ", dist_005, dist_995) + count = 0 + for i in range(tsize): + if comp_std_gm[f, i] > dist_995 or comp_std_gm[f, i] < dist_005: + count = count + 1 + if count == 1: + one_outside99.append(avar) + elif count == 2: + two_outside99.append(avar) + elif count == tsize: + all_outside99.append(avar) + + if len(all_outside99) > 0: + print( + '*** ', + len(all_outside99), + ' variable(s) have all test run global means outside of the 99th percentile.', + ) + print(all_outside99) + if len(two_outside99) > 0: + print( + '*** ', + len(two_outside99), + ' variable(s) have 2 test run global means outside of the 99th percentile.', + ) + print(two_outside99) + if len(one_outside99) > 0: + print( + '*** ', + len(one_outside99), + ' variable(s) have 1 test run global means outside of the 99th percentile.', + ) + print(one_outside99) + + # count = len(all_outside99) + len(two_outside99) + len(one_outside99) + # count = max(10, count) + count = 20 + count = min(count, means.shape[0]) + + print('') + print('***************************************************************************** ') + print( + 'Top 20 test run variables in decreasing order of (abs) standardized mean sum (note: ensemble is standardized such that mean = 0 and std_dev = 1)' + ) + for i in range(count): + print( + sorted_name[i], + ': ', + 'sum =', + sorted_sum[i], + ' (indiv. test run values = ', + sorted_comp_std_gm[i, :], + ')', + ) + print('***************************************************************************** ') ## # Print file with info about new test runs....to a netcdf file @@ -513,6 +492,9 @@ def main(argv): v_ens_sigma_scores = nc_savefile.createVariable('ens_sigma_scores', 'f8', ('nvars',)) v_ens_std_gm = nc_savefile.createVariable('ens_std_gm', 'f8', ('nvars', 'ens_size')) + # v_ens_loadings = nc_savefile.createVariable('ens_loadings', 'f8', ('nvars', 'nvars')) + v_gm = nc_savefile.createVariable('gm', 'f8', ('nvars', 'test_size')) + # hard-coded size ssize = 'S' + str(str_size) str_out = nc.stringtochar(np.array(ens_var_name, ssize)) @@ -523,36 +505,13 @@ def main(argv): v_ens_sigma_scores[:] = sigma_scores_gm[:] v_ens_std_gm[:, :] = std_gm_array[:, :] + # v_ens_loadings[:,:] = loadings_gm[:,:] + v_gm[:, :] = means[:, :] + nc_savefile.close() # end of CAM and MPAS - # Print variables (optional) - if opts_dict['printVars']: - print(' ') - print('***************************************************************************** ') - print( - 'Variable global mean information (for reference only - not used to determine pass/fail)' - ) - print('***************************************************************************** ') - for fcount, fid in enumerate(ifiles): - print(' ') - print('Run ' + str(fcount + 1) + ':') - print(' ') - print( - '***' + str(countgm[fcount]), - ' of ' - + str(len(ens_var_name)) - + ' variables are outside of ensemble global mean distribution***', - ) - pyEnsLib.printsummary( - results, 'gm', 'means', 'gmRange', fcount, variables, 'global mean' - ) - print(' ') - print( - '----------------------------------------------------------------------------' - ) - if me.get_rank() == 0: print(' ') print('Testing complete.') diff --git a/pyEnsLib.py b/pyEnsLib.py index cfdd4c5..81e806b 100644 --- a/pyEnsLib.py +++ b/pyEnsLib.py @@ -348,8 +348,8 @@ def search_sumfile(opts_dict, ifiles): # Create some variables and call a function to calculate PCA # now gm comes in at 64 bits... - -def pre_PCA(gm_orig, all_var_names, whole_list, me): +# pas in exclude list in case we have to add to id +def pre_PCA(gm_orig, all_var_names, ex_list, me): # initialize b_exit = False @@ -361,56 +361,33 @@ def pre_PCA(gm_orig, all_var_names, whole_list, me): else: gm = gm_orig[:] - mu_gm = np.average(gm, axis=1) sigma_gm = np.std(gm, axis=1, ddof=1) - standardized_global_mean = np.zeros(gm.shape, dtype=np.float64) - scores_gm = np.zeros(gm.shape, dtype=np.float64) + # keep track of orig vars in exclude file + new_ex_list = ex_list.copy() + orig_len = len(ex_list) - # AB: 4/19: whole list contains variables to be removed due to very small global means (calc elsewhere), but this is not currently needed - # and whole_list will be len = 0 - orig_len = len(whole_list) - if orig_len > 0: - if me.get_rank() == 0: - print('\n') - print( - '***************************************************************************************' - ) - print( - ( - 'Warning: these ', - orig_len, - ' variables have ~0 means (< O(e-15)) for each ensemble member, please exclude them via the json file (--jsonfile) :', - ) - ) - print((','.join(['"{0}"'.format(item) for item in whole_list]))) - print( - '***************************************************************************************' - ) - print('\n') - - # check for constants across ensemble + ##### check for constants across ensemble + print('STATUS: checking for constant values across ensemble') for var in range(nvar): for file in range(nfile): - if np.any(sigma_gm[var] == 0.0) and all_var_names[var] not in set(whole_list): - # keep track of zeros standard deviations - whole_list.append(all_var_names[var]) + if np.any(sigma_gm[var] == 0.0) and all_var_names[var] not in set(new_ex_list): + # keep track of zeros standard deviations and append + new_ex_list.append(all_var_names[var]) - # print list - new_len = len(whole_list) + # did we add vars to exclude? + new_len = len(new_ex_list) if new_len > orig_len: - sub_list = whole_list[orig_len:] + sub_list = new_ex_list[orig_len:] if me.get_rank() == 0: print('\n') print( '*************************************************************************************' ) print( - ( - 'Warning: these ', - new_len - orig_len, - ' variables are constant across ensemble members, please exclude them via the json file (--jsonfile): ', - ) + 'Warning: these ', + new_len - orig_len, + ' variables are constant across ensemble members, and will be excluded and added to a copy of the json file (--jsonfile): ', ) print('\n') print((','.join(['"{0}"'.format(item) for item in sub_list]))) @@ -419,83 +396,28 @@ def pre_PCA(gm_orig, all_var_names, whole_list, me): ) print('\n') - # exit if non-zero length whole_list - if new_len > 0: - print('=> Exiting ...') - b_exit = True - - # check for linear dependent vars - if not b_exit: - - for var in range(nvar): - for file in range(nfile): - standardized_global_mean[var, file] = (gm[var, file] - mu_gm[var]) / sigma_gm[var] - - eps = np.finfo(np.float32).eps - norm = np.linalg.norm(standardized_global_mean, ord=2) - sh = max(standardized_global_mean.shape) - mytol = sh * norm * eps - - standardized_rank = np.linalg.matrix_rank(standardized_global_mean, mytol) - print('STATUS: checking for dependent vars using QR...') - print(('STATUS: standardized_global_mean rank = ', standardized_rank)) - - dep_var_list = get_dependent_vars_index(standardized_global_mean, standardized_rank) - num_dep = len(dep_var_list) - orig_len = len(whole_list) - - for i in dep_var_list: - whole_list.append(all_var_names[i]) - - if num_dep > 0: - sub_list = whole_list[orig_len:] - - print('\n') - print( - '********************************************************************************************' - ) - print( - ( - 'Warning: these ', - num_dep, - ' variables are linearly dependent, please exclude them via the json file (--jsonfile): ', - ) - ) - print('\n') - print((','.join(['"{0}"'.format(item) for item in sub_list]))) - print( - '********************************************************************************************' - ) - print('\n') - print('=> EXITING....') - - # need to exit - b_exit = True - - # now check for any variables that have less than 3% (of the ensemble size) unique values - if not b_exit: - print('STATUS: checking for unique values across ensemble') - - cts = np.count_nonzero(np.diff(np.sort(standardized_global_mean)), axis=1) + 1 - # thresh = .02* standardized_global_mean.shape[1] - thresh = 0.03 * standardized_global_mean.shape[1] - result = np.where(cts < thresh) - indices = result[0] - if len(indices) > 0: - nu_list = [] - for i in indices: + #### now check for any variables that have less than 3% (of the ensemble size) unique values + print('STATUS: checking for unique values across ensemble') + cts = np.count_nonzero(np.diff(np.sort(gm)), axis=1) + 1 + thresh = 0.03 * gm.shape[1] + result = np.where(cts < thresh) + indices = result[0] + if len(indices) > 0: + nu_list = [] + for i in indices: + # only add if not in ex_list already + if all_var_names[i] not in set(new_ex_list): nu_list.append(all_var_names[i]) + if len(nu_list) > 0: print('\n') print( '********************************************************************************************' ) print( - ( - 'Warning: these ', - len(indices), - ' variables contain fewer than 3% unique values across the ensemble, please exclude them via the json file (--jsonfile): ', - ) + 'Warning: these ', + len(nu_list), + ' variables contain fewer than 3% unique values across the ensemble, and will be excluded and added to a copy of thee json file (--jsonfile): ', ) print('\n') print((','.join(['"{0}"'.format(item) for item in nu_list]))) @@ -503,24 +425,106 @@ def pre_PCA(gm_orig, all_var_names, whole_list, me): '********************************************************************************************' ) print('\n') - print('=> EXITING....') - # need to exit - b_exit = True + new_ex_list.extend(nu_list) - if not b_exit: - # find principal components - loadings_gm = princomp(standardized_global_mean) - # now do coord transformation on the standardized means to get the scores - scores_gm = np.dot(loadings_gm.T, standardized_global_mean) - sigma_scores_gm = np.std(scores_gm, axis=1, ddof=1) - else: - loadings_gm = np.zeros(gm.shape, dtype=np.float64) - sigma_scores_gm = np.zeros(gm.shape, dtype=np.float64) + ### REMOVE newly excluded stuff before the check for linear dependence + # remove var from nvar, all_var_names, gm, and recalculate: mu_gm, sigma_gm + new_len = len(new_ex_list) + indx = [] + if new_len > orig_len: + print('Updating ...') + sub_list = new_ex_list[orig_len:] + for i in sub_list: + indx.append(all_var_names.index(i)) + # now delete the rows from gm and names from list + gm_del = np.delete(gm, indx, axis=0) + all_var_names_del = np.delete(all_var_names, indx).tolist() + + gm = gm_del + all_var_names = all_var_names_del + nvar = gm.shape[0] + + mu_gm = np.average(gm, axis=1) + sigma_gm = np.std(gm, axis=1, ddof=1) + standardized_global_mean = np.zeros(gm.shape, dtype=np.float64) + + ####### check for linear dependent vars + print('STATUS: checking for linear dependence across ensemble') + for var in range(nvar): + for file in range(nfile): + standardized_global_mean[var, file] = (gm[var, file] - mu_gm[var]) / sigma_gm[var] + + eps = np.finfo(np.float32).eps + norm = np.linalg.norm(standardized_global_mean, ord=2) + sh = max(standardized_global_mean.shape) + mytol = sh * norm * eps + + standardized_rank = np.linalg.matrix_rank(standardized_global_mean, mytol) + print('STATUS: using QR...') + print(('STATUS: standardized_global_mean rank = ', standardized_rank)) - # return mu_gm.astype(np.float32),sigma_gm.astype(np.float32),standardized_global_mean.astype(np.float32),loadings_gm.astype(np.float32),sigma_scores_gm.astype(np.float32),b_exit + dep_var_list = get_dependent_vars_index(standardized_global_mean, standardized_rank) + num_dep = len(dep_var_list) + new_len = len(new_ex_list) - return mu_gm, sigma_gm, standardized_global_mean, loadings_gm, sigma_scores_gm, b_exit + for i in dep_var_list: + new_ex_list.append(all_var_names[i]) + + if num_dep > 0: + sub_list = new_ex_list[new_len:] + + print('\n') + print( + '********************************************************************************************' + ) + print( + 'Warning: these ', + num_dep, + ' variables are linearly dependent, and will be excluded and added to a copy of the json file (--jsonfile): ', + ) + print('\n') + print((','.join(['"{0}"'.format(item) for item in sub_list]))) + print( + '********************************************************************************************' + ) + print('\n') + + # REMOVE FROM gm, standardized gm and names + indx = [] + for i in sub_list: + indx.append(all_var_names.index(i)) + # now delete the rows in index from gm, std gm, and names from list + gm_del = np.delete(gm, indx, axis=0) + sgm_del = np.delete(standardized_global_mean, indx, axis=0) + all_var_names_del = np.delete(all_var_names, indx).tolist() + + gm = gm_del + standardized_global_mean = sgm_del + all_var_names = all_var_names_del + nvar = gm.shape[0] + + mu_gm = np.average(gm, axis=1) + sigma_gm = np.std(gm, axis=1, ddof=1) + + # COMPUTE PCA + scores_gm = np.zeros(gm.shape, dtype=np.float64) + # find principal components + loadings_gm = princomp(standardized_global_mean) + # now do coord transformation on the standardized means to get the scores + scores_gm = np.dot(loadings_gm.T, standardized_global_mean) + sigma_scores_gm = np.std(scores_gm, axis=1, ddof=1) + + return ( + mu_gm, + sigma_gm, + standardized_global_mean, + loadings_gm, + sigma_scores_gm, + new_ex_list, + gm, + b_exit, + ) # @@ -572,13 +576,14 @@ def calc_Z(val, avg, stddev, count, flag): # -# Read a json file for the excluded/included list of variables -# +# Read a json file for the excluded list of variables +# (no longer allowing include files) def read_jsonlist(metajson, method_name): # method_name = ES for ensemble summary (CAM, MPAS) # = ESP for POP ensemble summary + exclude = True if not os.path.exists(metajson): print('\n') print( @@ -590,7 +595,6 @@ def read_jsonlist(metajson, method_name): ) print('\n') varList = [] - exclude = True return varList, exclude else: fd = open(metajson) @@ -602,12 +606,9 @@ def read_jsonlist(metajson, method_name): exclude = [] return varList, exclude if method_name == 'ES': # CAM or MPAS - exclude = False + exclude = True if 'ExcludedVar' in metainfo: - exclude = True varList = metainfo['ExcludedVar'] - elif 'IncludedVar' in metainfo: - varList = metainfo['IncludedVar'] return varList, exclude elif method_name == 'ESP': # POP var2d = metainfo['Var2d'] @@ -906,7 +907,7 @@ def calc_global_mean_for_onefile_MPAS(fname, weight_dict, var_cell, var_edge, va # compute area_wgts, and then loop through all files to call calc_global_means_for_onefile # o_files are not open for CAM # 12/19 - summary file will now be double precision -def generate_global_mean_for_summary(o_files, var_name3d, var_name2d, is_SE, pepsi_gm, opts_dict): +def generate_global_mean_for_summary(o_files, var_name3d, var_name2d, is_SE, opts_dict): tslice = opts_dict['tslice'] popens = opts_dict['popens'] @@ -957,10 +958,7 @@ def generate_global_mean_for_summary(o_files, var_name3d, var_name2d, is_SE, pep fname.close() - var_list = [] - # some valid CAM vars are all small entries(e.g. DTWR_H2O2 and DTWR_H2O4), so we no longer exclude them via var_list - - return gm3d, gm2d, var_list + return gm3d, gm2d # Calculate global means for one OCN input file @@ -1511,30 +1509,7 @@ def standardized(gm, mu_gm, sigma_gm, loadings_gm, all_var_names, opts_dict, me) sum_std_mean[var] = sum_std_mean[var] + np.abs(standardized_mean[var, file]) new_scores = np.dot(loadings_gm.T.astype(np.float64), standardized_mean) - var_list = [] - sorted_sum_std_mean = np.argsort(sum_std_mean)[::-1] - if opts_dict['printStdMean']: - if me.get_rank() == 0: - print(' ') - print('************************************************************************') - print(' Sum of standardized mean of all variables in decreasing order') - print('************************************************************************') - for var in range(nvar): - var_list.append(all_var_names[sorted_sum_std_mean[var]]) - vname = all_var_names[sorted_sum_std_mean[var]] - if me.get_rank() == 0: - - if isinstance(vname, str): - vname_d = vname - else: - vname_d = vname.decode('utf-8') - - print( - '{:>15}'.format(vname_d), - '{0:9.2e}'.format(sum_std_mean[sorted_sum_std_mean[var]]), - ) - print(' ') - return new_scores, var_list, standardized_mean + return new_scores, sum_std_mean, standardized_mean # @@ -1581,7 +1556,7 @@ def printsummary(results, key, name, namerange, thefilecount, variables, label): if temp < 1: print(' ') print( - f' {strname}: {v[name][thefile]:.2e} outside of of [{variables[k][namerange][0]:2e}, {variables[k][namerange][1]}]' + f' {strname}: {v[name][thefile]:2e} outside of [{variables[k][namerange][0]:2e}, {variables[k][namerange][1]:2e}]' ) @@ -1647,6 +1622,7 @@ def comparePCAscores(ifiles, new_scores, sigma_scores_gm, opts_dict, me): totalcount = 0 sum_index = [] if me.get_rank() == 0: + print('') print('*********************************************** ') print('PCA Test Results') print('*********************************************** ') @@ -1670,6 +1646,11 @@ def comparePCAscores(ifiles, new_scores, sigma_scores_gm, opts_dict, me): totalcount = totalcount + 1 sum_index.append(i + 1) + # save comp_array if filepath is provided + if me.get_rank() == 0: + if len(opts_dict['savePCAMat']) > 0: + np.save(opts_dict['savePCAMat'], comp_array) + # false_positive=check_falsepositive(opts_dict,sum_index) # If the length of sum_index is larger than min_PC_fail, the three runs failed. @@ -1690,7 +1671,9 @@ def comparePCAscores(ifiles, new_scores, sigma_scores_gm, opts_dict, me): sum_index, ) print(' ') - print('These runs ' + decision + ' according to our testing criterion.') + print('These runs ****' + decision + '**** according to our testing criterion.') + print(' ') + elif me.get_rank() == 0: print(' ') print( @@ -1745,6 +1728,10 @@ def comparePCAscores(ifiles, new_scores, sigma_scores_gm, opts_dict, me): else: decision = 'PASSED' + # save eet if filepath is provided + if len(opts_dict['saveEET']) > 0: + np.save(opts_dict['saveEET'], np.array([passes, passes + failures])) + else: for j in range(comp_array.shape[1]): index_list = [] @@ -1796,12 +1783,6 @@ def CECT_usage(): print( ' --printVars : print out variables that fall outside of the global mean ensemble distribution (off by default)' ) - print( - ' --printStdMean : print out sum of standardized mean of all variables in decreasing order. If test returns a FAIL, ' - ) - print( - ' then output associated box plots (off by default) - requires Python seaborn package' - ) print( ' --saveResults : save a netcdf file with scores and std global means from the test runs (savefile.nc). ' ) @@ -1857,7 +1838,7 @@ def EnsSum_usage(): print(' --mach : Machine name used in the metadata (default = cheyenne)') print(' --tslice : the index into the time dimension (default = 1)') print(' --jsonfile : Jsonfile to provide that a list of variables that will ') - print(' be excluded or included (default = exclude_empty.json)') + print(' be excluded (default = exclude_empty.json)') print(' --mpi_disable : Disable mpi mode to run in serial (off by default)') # print( # ' --fIndex : Use this to start at ensemble member instead of 000 (so ' @@ -1886,7 +1867,7 @@ def EnsSumMPAS_usage(): print(' --mach : Machine name used in the metadata (default = cheyenne)') print(' --tslice : the index into the time dimension (default = 0)') print(' --jsonfile : Jsonfile to provide that a list of variables that will ') - print(' be excluded or included (default = empty_excluded.json)') + print(' be excluded (default = empty_excluded.json)') print(' --mpi_disable : Disable mpi mode to run in serial (mpi is enabled by default)') print(' ') diff --git a/pyEnsSum.py b/pyEnsSum.py index 471de9d..ad38900 100644 --- a/pyEnsSum.py +++ b/pyEnsSum.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import configparser import getopt +import json import os import re import sys @@ -19,10 +20,8 @@ def main(argv): - # cumul is not being used currently - # Get command line stuff and store in a dictionary - s = 'tag= compset= esize= tslice= res= sumfile= indir= sumfiledir= mach= verbose jsonfile= mpi_enable maxnorm gmonly popens regx= startMon= endMon= fIndex= mpi_disable' + s = 'tag= compset= esize= tslice= res= sumfile= indir= sumfiledir= mach= verbose jsonfile= mpi_enable maxnorm popens regx= startMon= endMon= fIndex= mpi_disable' optkeys = s.split() try: opts, args = getopt.getopt(argv, 'h', optkeys) @@ -43,14 +42,12 @@ def main(argv): opts_dict['sumfile'] = 'ens.summary.nc' opts_dict['indir'] = './' opts_dict['sumfiledir'] = './' - opts_dict['jsonfile'] = 'exclude_empty.json' + opts_dict['jsonfile'] = 'empty_excluded.json' opts_dict['verbose'] = False opts_dict['mpi_enable'] = True opts_dict['mpi_disable'] = False opts_dict['maxnorm'] = False - opts_dict['gmonly'] = True opts_dict['popens'] = False - opts_dict['cumul'] = False opts_dict['regx'] = 'test' opts_dict['startMon'] = 1 opts_dict['endMon'] = 1 @@ -79,7 +76,6 @@ def main(argv): input_dir = opts_dict['indir'] # The var list that will be excluded ex_varlist = [] - inc_varlist = [] # Create a mpi simplecomm object if opts_dict['mpi_enable']: @@ -94,26 +90,17 @@ def main(argv): print(opts_dict) print('STATUS: Ensemble size for summary = ', esize) - exclude = False if me.get_rank() == 0: if opts_dict['jsonfile']: - inc_varlist = [] - # Read in the excluded or included var list + # Read in the excluded var list ex_varlist, exclude = pyEnsLib.read_jsonlist(opts_dict['jsonfile'], 'ES') if len(ex_varlist) > 0: if ex_varlist[0] == 'JSONERROR': me.abort() - if exclude is False: - inc_varlist = ex_varlist - ex_varlist = [] # Broadcast the excluded var list to each processor if opts_dict['mpi_enable']: - exclude = me.partition(exclude, func=Duplicate(), involved=True) - if exclude: - ex_varlist = me.partition(ex_varlist, func=Duplicate(), involved=True) - else: - inc_varlist = me.partition(inc_varlist, func=Duplicate(), involved=True) + ex_varlist = me.partition(ex_varlist, func=Duplicate(), involved=True) in_files = [] if os.path.exists(input_dir): @@ -150,13 +137,6 @@ def main(argv): print('ERROR: Input directory: ', input_dir, ' not found') sys.exit(2) - if opts_dict['cumul']: - if opts_dict['regx']: - in_files_list = get_cumul_filelist(opts_dict, opts_dict['indir'], opts_dict['regx']) - in_files = me.partition(in_files_list, func=EqualLength(), involved=True) - if me.get_rank() == 0 and verbose: - print('VERBOSE: in_files = ', in_files) - # Check full file names in input directory (don't open yet) full_in_files = [] if me.get_rank() == 0 and opts_dict['verbose']: @@ -225,21 +205,18 @@ def main(argv): print('nlat = ', nlat) print('nlon = ', nlon) + # invarient metadata (will write to sum file later) + lev_data = first_file.variables['lev'] + lev_data_copy = lev_data[:] # doesn't go away when close first_file + # Get 2d vars, 3d vars and all vars (For now include all variables) vars_dict_all = first_file.variables # Remove the excluded variables (specified in json file) from variable dictionary - if exclude: - vars_dict = vars_dict_all - for i in ex_varlist: - if i in vars_dict: - del vars_dict[i] - # Given an included var list, remove all the variables that are not on the list - else: - vars_dict = vars_dict_all.copy() - for k, v in vars_dict_all.items(): - if (k not in inc_varlist) and (vars_dict_all[k].typecode() == 'f'): - del vars_dict[k] + vars_dict = vars_dict_all.copy() + for i in ex_varlist: + if i in vars_dict: + del vars_dict[i] # num_vars = len(vars_dict) @@ -312,7 +289,6 @@ def main(argv): # All vars is 3d vars first (sorted), the 2d vars all_var_names = list(d3_var_names) all_var_names += d2_var_names - # n_all_var_names = len(all_var_names) # Rank 0 - Create new summary ensemble file this_sumfile = opts_dict['sumfile'] @@ -328,8 +304,96 @@ def main(argv): this_sumfile = sum_dir + '/' + this_sumfile + # All: + var3_list_loc = me.partition(d3_var_names, func=EqualStride(), involved=True) + var2_list_loc = me.partition(d2_var_names, func=EqualStride(), involved=True) + + # close first_file + first_file.close() + + # Calculate global means # + if me.get_rank() == 0 and verbose: + print('VERBOSE: Calculating global means .....') + + gm3d, gm2d = pyEnsLib.generate_global_mean_for_summary( + full_in_files, var3_list_loc, var2_list_loc, is_SE, opts_dict + ) + + if me.get_rank() == 0 and verbose: + print('VERBOSE: Finished calculating global means .....') + + # gather to rank = 0 + if opts_dict['mpi_enable']: + + # Gather the 3d variable results from all processors to the master processor + slice_index = get_stride_list(len(d3_var_names), me) + + # Gather global means 3d results + gm3d = gather_npArray(gm3d, me, slice_index, (len(d3_var_names), len(full_in_files))) + + # Gather 2d variable results from all processors to the master processor + slice_index = get_stride_list(len(d2_var_names), me) + + # Gather global means 2d results + gm2d = gather_npArray(gm2d, me, slice_index, (len(d2_var_names), len(full_in_files))) + + # rank =0 : complete calculations for summary file if me.get_rank() == 0: + gmall = np.concatenate((gm3d, gm2d), axis=0) + + # PCA prep and calculation + ( + mu_gm, + sigma_gm, + standardized_global_mean, + loadings_gm, + scores_gm, + new_ex_varlist, + new_gmall, + b_exit, + ) = pyEnsLib.pre_PCA(gmall, all_var_names, ex_varlist, me) + # if PCA calc encounters an error, then remove the summary file and exit + if b_exit: + print('STATUS: Summary could not be created.') + sys.exit(2) + + # update json file? update var 2d and 3d var lists? + + # print('ex_varlist len = ', len(ex_varlist)) + # print('new ex_varlist len = ', len(new_ex_varlist)) + # print(new_ex_varlist) + + if len(ex_varlist) < len(new_ex_varlist): + print('STATUS: Creating an updated JSON file (with prefix "NEW.")') + new_name = 'NEW.' + opts_dict['jsonfile'] + print( + 'STATUS: Adding ', len(new_ex_varlist) - len(ex_varlist), ' variables to ', new_name + ) + jdict = {} + jdict['ExcludedVar'] = new_ex_varlist + with open(new_name, 'w') as outfile: + json.dump(jdict, outfile) + + # update num_2d, num_3d => by removing vars from d3_var_names and d2_var_names + for i in new_ex_varlist: + if i in all_var_names: + all_var_names.remove(i) + if i in d3_var_names: + d3_var_names.remove(i) + elif i in d2_var_names: + d2_var_names.remove(i) + + num_2d = len(d2_var_names) + num_3d = len(d3_var_names) + + nvars = loadings_gm.shape[0] + if nvars != (num_2d + num_3d): + print('DIMENSION ERROR!') + print('STATUS: Summary could not be created.') + sys.exit(2) + + # create the summary file (still rank 0) if verbose: print('VERBOSE: Creating ', this_sumfile, ' ...') @@ -421,89 +485,9 @@ def main(argv): v_var3d[:] = eq_d3_var_names[:] v_var2d[:] = eq_d2_var_names[:] - # Time-invarient metadata - if verbose: - print('VERBOSE: Assigning time invariant metadata .....') - lev_data = first_file.variables['lev'] - v_lev[:] = lev_data[:] - # end of rank=0 work - - # All: - # tslice = opts_dict['tslice'] - if not opts_dict['cumul']: - # Partition the var list - var3_list_loc = me.partition(d3_var_names, func=EqualStride(), involved=True) - var2_list_loc = me.partition(d2_var_names, func=EqualStride(), involved=True) - else: - var3_list_loc = d3_var_names - var2_list_loc = d2_var_names - - # close first_file - first_file.close() - - # Calculate global means # - if me.get_rank() == 0 and verbose: - print('VERBOSE: Calculating global means .....') - if not opts_dict['cumul']: - gm3d, gm2d, var_list = pyEnsLib.generate_global_mean_for_summary( - full_in_files, var3_list_loc, var2_list_loc, is_SE, False, opts_dict - ) - if me.get_rank() == 0 and verbose: - print('VERBOSE: Finished calculating global means .....') - - # gather to rank = 0 - if opts_dict['mpi_enable']: - - if not opts_dict['cumul']: - # Gather the 3d variable results from all processors to the master processor - slice_index = get_stride_list(len(d3_var_names), me) - - # Gather global means 3d results - gm3d = gather_npArray(gm3d, me, slice_index, (len(d3_var_names), len(full_in_files))) - - # Gather 2d variable results from all processors to the master processor - slice_index = get_stride_list(len(d2_var_names), me) - - # Gather global means 2d results - gm2d = gather_npArray(gm2d, me, slice_index, (len(d2_var_names), len(full_in_files))) - - # gather variables ro exclude (in pre_pca) - var_list = gather_list(var_list, me) - - else: - # cumul is not being used and temp1 and temp2 are undefined - # gmall = np.concatenate((temp1, temp2), axis=0) - # gmall = pyEnsLib.gather_npArray_pop( - # gmall, me, (me.get_size(), len(d3_var_names) + len(d2_var_names)) - # ) - print('Warning: cumul not supported') - - # rank =0 : complete calculations for summary file - if me.get_rank() == 0: - if not opts_dict['cumul']: - gmall = np.concatenate((gm3d, gm2d), axis=0) - else: - gmall_temp = np.transpose(gmall[:, :]) - gmall = gmall_temp - - # PCA prep and calculation - ( - mu_gm, - sigma_gm, - standardized_global_mean, - loadings_gm, - scores_gm, - b_exit, - ) = pyEnsLib.pre_PCA(gmall, all_var_names, var_list, me) - - # if PCA calc encounters an error, then remove the summary file and exit - if b_exit: - nc_sumfile.close() - os.unlink(this_sumfile) - print('STATUS: Summary could not be created.') - sys.exit(2) - - v_gm[:, :] = gmall[:, :] + # populate variables + v_lev[:] = lev_data_copy[:] + v_gm[:, :] = new_gmall[:, :] v_standardized_gm[:, :] = standardized_global_mean[:, :] v_mu_gm[:] = mu_gm[:] v_sigma_gm[:] = sigma_gm[:] @@ -530,8 +514,7 @@ def get_cumul_filelist(opts_dict, indir, regx): res = [f for f in os.listdir(indir) if re.search(regx, f)] in_files = sorted(res) all_files.extend(in_files) - # print "all_files=",all_files - # in_files=res + return all_files diff --git a/pyEnsSumMPAS.py b/pyEnsSumMPAS.py index 2dbe53c..89e272f 100644 --- a/pyEnsSumMPAS.py +++ b/pyEnsSumMPAS.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import configparser import getopt +import json import os import re import sys @@ -75,7 +76,6 @@ def main(argv): input_dir = opts_dict['indir'] # The var list that will be excluded ex_varlist = [] - inc_varlist = [] # Create a mpi simplecomm object if opts_dict['mpi_enable']: @@ -93,23 +93,15 @@ def main(argv): exclude = False if me.get_rank() == 0: if opts_dict['jsonfile']: - inc_varlist = [] - # Read in the excluded or included var list + # Read in the excluded var list ex_varlist, exclude = pyEnsLib.read_jsonlist(opts_dict['jsonfile'], 'ES') if len(ex_varlist) > 0: if ex_varlist[0] == 'JSONERROR': me.abort() - if exclude is False: - inc_varlist = ex_varlist - ex_varlist = [] # Broadcast the excluded var list to each processor if opts_dict['mpi_enable']: - exclude = me.partition(exclude, func=Duplicate(), involved=True) - if exclude: - ex_varlist = me.partition(ex_varlist, func=Duplicate(), involved=True) - else: - inc_varlist = me.partition(inc_varlist, func=Duplicate(), involved=True) + ex_varlist = me.partition(ex_varlist, func=Duplicate(), involved=True) in_files = [] if os.path.exists(input_dir): @@ -214,17 +206,10 @@ def main(argv): vars_dict_all = first_file.variables # Remove the excluded variables (specified in json file) from variable dictionary - if exclude: - vars_dict = vars_dict_all.copy() - for i in ex_varlist: - if i in vars_dict: - del vars_dict[i] - # Given an included var list, remove all the variables that are not on the list - else: - vars_dict = vars_dict_all.copy() - for k, v in vars_dict_all.items(): - if (k not in inc_varlist) and (vars_dict_all[k].typecode() == 'f'): - del vars_dict[k] + vars_dict = vars_dict_all.copy() + for i in ex_varlist: + if i in vars_dict: + del vars_dict[i] # We have cell vars and edge vars and vertex vars (and only want time-dependent vars) str_size = 0 # longest var name @@ -237,28 +222,52 @@ def main(argv): v_dim = 'nVertices' # CHECK FOR edge variable u (horizontal wind velocity vector) - + extra_exclude = 0 # sort to cell, edge, and vertex (and grab max str_size) for k, v in vars_dict.items(): # var = k + # get var type + dd = vars_dict[k][:].dtype vd = v.dimensions # all the variable's dimensions (names) + # only car about time dependent vars if t_dim in vd: + # no integers + if dd == 'int32': + ex_varlist.append(k) + extra_exclude = extra_exclude + 1 + if me.get_rank() == 0: + print( + 'WARNING: Variable ', + k, + ' is an integer and should be excluded. Added to json file.', + ) + continue if c_dim in vd: cell_names.append(k) elif e_dim in vd: - edge_names.append(k) if k == 'u': - print( - 'Note: We suggest that variable u (Horizontal normal velocity at edges) be excluded from the summary file in favor of uReconstructZonal and uReconstructMeridional (cell variables).' - ) + # check for uReconstructZonal and uReconstructMeridional + if 'uReconstructZonal' in vars_dict and 'uReconstructMeridional' in vars_dict: + ex_varlist.append(k) + extra_exclude = extra_exclude + 1 + if me.get_rank() == 0: + print( + 'WARNING: We suggest that variable u (Horizontal normal velocity at edges) be excluded from the summary file in favor of uReconstructZonal and uReconstructMeridional (cell variables) Added to json file.' + ) + continue + edge_names.append(k) elif v_dim in vd: vertex_names.append(k) else: - print( - 'Note: variable ', - k, - ' contains time but not cells, edges, or vertices (and will be excluded).', - ) + # add to exclude list + ex_varlist.append(k) + extra_exclude = extra_exclude + 1 + if me.get_rank() == 0: + print( + 'WARNING: variable ', + k, + ' contains time but not cells, edges, or vertices (and will be excluded and added to a new jsonfile).', + ) continue str_size = max(str_size, len(k)) @@ -268,7 +277,7 @@ def main(argv): total = num_cell + num_edge + num_vertex if me.get_rank() == 0 and verbose: - print('VERBOSE: Number of variables found: ', total) + print('VERBOSE: Number of variables (after exclusions) found: ', total) print( 'VERBOSE: Cell variables: ', num_cell, @@ -286,6 +295,7 @@ def main(argv): if esize < total: if me.get_rank() == 0: + print( '**************************************************************************************************' ) @@ -315,6 +325,7 @@ def main(argv): sum_dir = os.path.dirname(this_sumfile) if len(sum_dir) == 0: sum_dir = '.' + if os.path.exists(sum_dir) is False: if me.get_rank() == 0: print('ERROR: Summary file directory: ', sum_dir, ' not found') @@ -322,8 +333,107 @@ def main(argv): this_sumfile = sum_dir + '/' + this_sumfile + varCell_list_loc = me.partition(cell_names, func=EqualStride(), involved=True) + varEdge_list_loc = me.partition(edge_names, func=EqualStride(), involved=True) + varVertex_list_loc = me.partition(vertex_names, func=EqualStride(), involved=True) + + # close first_file + first_file.close() + + # Calculate global means # + if me.get_rank() == 0 and verbose: + print('VERBOSE: Calculating global means .....') + + gmCell, gmEdge, gmVertex = pyEnsLib.generate_global_mean_for_summary_MPAS( + full_in_files, varCell_list_loc, varEdge_list_loc, varVertex_list_loc, opts_dict + ) + + if me.get_rank() == 0 and verbose: + print('VERBOSE: Finished calculating global means .....') + + # gather to rank = 0 + if opts_dict['mpi_enable']: + + # Gather the cell variable results from all processors to the master processor + slice_index = get_stride_list(len(cell_names), me) + # Gather global means cell results + + # print("MYRANK = ", me.get_rank(), slice_index) + gmCell = gather_npArray(gmCell, me, slice_index, (len(cell_names), len(full_in_files))) + # print(gmCell) + + # Gather the edge variable results from all processors to the master processor + slice_index = get_stride_list(len(edge_names), me) + # Gather global means edge results + gmEdge = gather_npArray(gmEdge, me, slice_index, (len(edge_names), len(full_in_files))) + + # Gather the vertex variable results from all processors to the master processor + slice_index = get_stride_list(len(vertex_names), me) + # Gather global means vertex results + gmVertex = gather_npArray( + gmVertex, me, slice_index, (len(vertex_names), len(full_in_files)) + ) + + # rank =0 : complete calculations for summary file if me.get_rank() == 0: + gmall = np.concatenate((gmCell, gmEdge, gmVertex), axis=0) + + # PCA prep and calculation + ( + mu_gm, + sigma_gm, + standardized_global_mean, + loadings_gm, + scores_gm, + new_ex_varlist, + new_gmall, + b_exit, + ) = pyEnsLib.pre_PCA(gmall, all_var_names, ex_varlist, me) + # if PCA calc encounters an error, then remove the summary file and exit + if b_exit: + print('STATUS: Summary could not be created.') + sys.exit(2) + + # update json file? + if len(ex_varlist) < len(new_ex_varlist) or extra_exclude > 0: + print('STATUS: Creating an updated JSON file (with prefix "NEW.")') + new_name = 'NEW.' + opts_dict['jsonfile'] + print( + 'STATUS: Adding ', + len(new_ex_varlist) - len(ex_varlist) + extra_exclude, + ' variables to ', + new_name, + ) + jdict = {} + jdict['ExcludedVar'] = new_ex_varlist + + with open(new_name, 'w') as outfile: + json.dump(jdict, outfile) + + # update ncell, nedge, and nvertex => by removing vars from corresponding names + for i in new_ex_varlist: + if i in all_var_names: + all_var_names.remove(i) + if i in cell_names: + cell_names.remove(i) + elif i in edge_names: + edge_names.remove(i) + elif i in vertex_names: + vertex_names.remove(i) + + num_cell = len(cell_names) + num_edge = len(edge_names) + num_vertex = len(vertex_names) + total = num_cell + num_edge + num_vertex + + nvars = loadings_gm.shape[0] + if nvars != (total): + print('DIMENSION ERROR!') + print('STATUS: Summary could not be created.') + sys.exit(2) + + # create the summary file (still rank 0) if verbose: print('VERBOSE: Creating ', this_sumfile, ' ...') @@ -365,8 +475,6 @@ def main(argv): # Create variables if verbose: print('VERBOSE: Creating variables .....') - # TO DO - any time invarient data that we want? - # v_lev = nc_sumfile.createVariable('lev', 'f8', ('nlev',)) v_vars = nc_sumfile.createVariable('vars', 'S1', ('nvars', 'str_size')) v_varCell = nc_sumfile.createVariable('varCell', 'S1', ('nvarsCell', 'str_size')) @@ -433,91 +541,8 @@ def main(argv): v_varEdge[:] = eq_edge_names[:] v_varVertex[:] = eq_vertex_names[:] - # Time-invarient metadata - # TO DO - if verbose: - print('VERBOSE: Assigning time invariant metadata .....') - # lev_data = first_file.variables['lev'] - # v_lev[:] = lev_data[:] - - # end of rank=0 work - - # All: - - varCell_list_loc = me.partition(cell_names, func=EqualStride(), involved=True) - varEdge_list_loc = me.partition(edge_names, func=EqualStride(), involved=True) - varVertex_list_loc = me.partition(vertex_names, func=EqualStride(), involved=True) - - # close first_file - first_file.close() - - # Calculate global means # - if me.get_rank() == 0 and verbose: - print('VERBOSE: Calculating global means .....') - - gmCell, gmEdge, gmVertex = pyEnsLib.generate_global_mean_for_summary_MPAS( - full_in_files, varCell_list_loc, varEdge_list_loc, varVertex_list_loc, opts_dict - ) - - if me.get_rank() == 0 and verbose: - print('VERBOSE: Finished calculating global means .....') - - # empty - not excluding any vars due to global means at this time - var_list = [] - - # gather to rank = 0 - if opts_dict['mpi_enable']: - - # Gather the cell variable results from all processors to the master processor - slice_index = get_stride_list(len(cell_names), me) - # Gather global means cell results - - # print("MYRANK = ", me.get_rank(), slice_index) - gmCell = gather_npArray(gmCell, me, slice_index, (len(cell_names), len(full_in_files))) - # print(gmCell) - - # Gather the edge variable results from all processors to the master processor - slice_index = get_stride_list(len(edge_names), me) - # Gather global means edge results - gmEdge = gather_npArray(gmEdge, me, slice_index, (len(edge_names), len(full_in_files))) - - # Gather the vertex variable results from all processors to the master processor - slice_index = get_stride_list(len(vertex_names), me) - # Gather global means vertex results - gmVertex = gather_npArray( - gmVertex, me, slice_index, (len(vertex_names), len(full_in_files)) - ) - - ################# - # gather variables to exclude (for pre_pca - currently will be empty) - var_list = gather_list(var_list, me) - - # rank =0 : complete calculations for summary file - if me.get_rank() == 0: - gmall = np.concatenate((gmCell, gmEdge, gmVertex), axis=0) - # save to summary file - v_gm[:, :] = gmall[:, :] - - # print("gmall = ", gmall) - - # PCA prep and calculation - ( - mu_gm, - sigma_gm, - standardized_global_mean, - loadings_gm, - scores_gm, - b_exit, - ) = pyEnsLib.pre_PCA(gmall, all_var_names, var_list, me) - - # if PCA calc encounters an error, then remove the summary file and exit - if b_exit: - nc_sumfile.close() - - os.unlink(this_sumfile) - print('STATUS: Summary could not be created.') - sys.exit(2) - + # populate variables + v_gm[:, :] = new_gmall[:, :] v_standardized_gm[:, :] = standardized_global_mean[:, :] v_mu_gm[:] = mu_gm[:] v_sigma_gm[:] = sigma_gm[:] diff --git a/pyproject.toml b/pyproject.toml index af3bbc7..e78601a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,5 +11,5 @@ pyupgrade = 1 [tool.nbqa.addopts] pyupgrade = ["--py36-plus"] -#[build-system] -#requires = ["setuptools>=45", "wheel", "setuptools_scm>=6.2"] +[build-system] +requires = ["setuptools>=45", "wheel", "setuptools_scm>=6.2"] diff --git a/test_mpas_CECT.sh b/test_mpas_CECT.sh new file mode 100644 index 0000000..b029956 --- /dev/null +++ b/test_mpas_CECT.sh @@ -0,0 +1,17 @@ +#!/bin/tcsh +#PBS -A NTDD0004 +#PBS -N mpas-cect +#PBS -q regular +#PBS -l select=1:ncpus=1:mpiprocs=1 +#PBS -l walltime=0:30:00 +#PBS -j oe +#PBS -M abaker@ucar.edu + +module load conda +conda activate npl + + +setenv TMPDIR /glade/scratch/$USER/temp +mkdir -p $TMPDIR + +python pyCECT.py --sumfile /glade/work/abaker/mpas_data/100_ens_summary/mpas_sum_ts24.nc --indir /glade/scratch/abaker/longer_mpas_hist --tslice 8 --mpas diff --git a/test_pyEnsSum.sh b/test_pyEnsSum.sh index 530d673..e996d38 100644 --- a/test_pyEnsSum.sh +++ b/test_pyEnsSum.sh @@ -1,7 +1,7 @@ #!/bin/tcsh #PBS -A NTDD0004 #PBS -N ensSum -#PBS -q regular +#PBS -q premium #PBS -l select=8:ncpus=9:mpiprocs=9 #PBS -l walltime=0:20:00 #PBS -j oe @@ -14,4 +14,8 @@ conda activate npl setenv TMPDIR /glade/scratch/$USER/temp mkdir -p $TMPDIR -mpiexec python pyEnsSum.py --esize 350 --indir /glade/p/cisl/asap/pycect_sample_data/cam_c1.2.2.1/uf_cam_ens_files --sumfile uf.ens.c1.2.2.1_fc5.ne30.nc --tslice 1 --tag cesm1.2.2.1 --compset FC5 --res ne30_ne30 --mach cheyenne --verbose --jsonfile excluded_varlist.json +#mpiexec python pyEnsSum.py --esize 350 --indir /glade/p/cisl/asap/pycect_sample_data/cam_c1.2.2.1/uf_cam_ens_files --sumfile uf.ens.c1.2.2.1_fc5.ne30.nc --tslice 1 --tag cesm1.2.2.1 --compset FC5 --res ne30_ne30 --mach cheyenne --verbose --jsonfile short.json + +#mpiexec python pyEnsSum.py --esize 350 --indir /glade/p/cisl/asap/pycect_sample_data/cam_c1.2.2.1/uf_cam_ens_files --sumfile uf.ens.c1.2.2.1_fc5.ne30.nc --tslice 1 --tag cesm1.2.2.1 --compset FC5 --res ne30_ne30 --mach cheyenne --verbose --jsonfile excluded_varlist.json + +mpiexec python pyEnsSum.py --esize 350 --indir /glade/p/cisl/asap/pycect_sample_data/cam_c1.2.2.1/uf_cam_ens_files --sumfile uf.ens.c1.2.2.1_fc5.ne30.nc --tslice 1 --tag cesm1.2.2.1 --compset FC5 --res ne30_ne30 --mach cheyenne --verbose --jsonfile one.json diff --git a/test_pyEnsSumMPAS.sh b/test_pyEnsSumMPAS.sh index 878306c..65aa6d8 100644 --- a/test_pyEnsSumMPAS.sh +++ b/test_pyEnsSumMPAS.sh @@ -1,7 +1,7 @@ #!/bin/tcsh #PBS -A NTDD0004 -#PBS -N ensSum -#PBS -q regular +#PBS -N MensSum +#PBS -q premium #PBS -l select=4:ncpus=9:mpiprocs=9 #PBS -l walltime=0:30:00 #PBS -j oe @@ -14,4 +14,4 @@ conda activate npl setenv TMPDIR /glade/scratch/$USER/temp mkdir -p $TMPDIR -mpiexec python pyEnsSumMPAS.py --esize 100 --indir /glade/scratch/abaker/longer_mpas_hist --sumfile mpas_sum_t24_new.nc --tslice 8 --tag v7.1 --model mpas --mach cheyenne --verbose --jsonfile mpas_ex.json +mpiexec python pyEnsSumMPAS.py --esize 100 --indir /glade/p/cisl/asap/abaker/mpas/large_ens --sumfile mpas_sumt4.nc --tslice 4 --tag v7.1 --model mpas --mach cheyenne --verbose --jsonfile test.json